isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,1221 @@
1
+ #include "frt_analysis.h"
2
+ #include <string.h>
3
+ #include <locale.h>
4
+ #include <libstemmer.h>
5
+ #include "test.h"
6
+
7
/* Shorthand for tt_token() that supplies the `tc` variable of the calling
 * test function and the line number of the call site. */
#define test_token(tok, text, from, to) \
    tt_token((tok), (text), (from), (to), tc, __LINE__)
9
+
10
+ static void tt_token(FrtToken *tk, const char *str, int start, int end, TestCase *tc, int line_num)
11
+ {
12
+ FrtToken frt_tk_exp;
13
+ static char buf[3000];
14
+ if (tk == NULL) {
15
+ sprintf(buf, "Token1[NULL] != Token2[%d:%d:%s]\n",
16
+ start, end, str);
17
+ tst_assert(line_num, tc, false, buf);
18
+ return;
19
+ }
20
+ if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, 1), tk)) {
21
+ sprintf(buf, "Token1[%d:%d:%s] != Token2[%d:%d:%s]\n",
22
+ (int)tk->start, (int)tk->end, tk->text, start, end, str);
23
+ tst_assert(line_num, tc, false, buf);
24
+ }
25
+ tst_int_equal(line_num, tc, strlen(tk->text), tk->len);
26
+ }
27
+
28
+ static void tt_token_pi(FrtToken *tk, const char *str, int start, int end, int pi, TestCase *tc, int line_num)
29
+ {
30
+ FrtToken frt_tk_exp;
31
+ static char buf[3000];
32
+ if (tk == NULL) {
33
+ sprintf(buf, "Token1[NULL] != Token2[%d:%d:%s]\n",
34
+ start, end, str);
35
+ tst_assert(line_num, tc, false, buf);
36
+ return;
37
+ }
38
+ if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, pi), tk)) {
39
+ sprintf(buf, "Token1[%d:%d:%s-%d] != Token2[%d:%d:%s-%d]\n",
40
+ (int)tk->start, (int)tk->end, tk->text, tk->pos_inc,
41
+ start, end, str, pi);
42
+ tst_assert(line_num, tc, false, buf);
43
+ }
44
+ tst_int_equal(line_num, tc, strlen(tk->text), tk->len);
45
+ }
46
+
47
/* Shorthand for tt_token_pi() that supplies the `tc` variable of the calling
 * test function and the line number of the call site. */
#define test_token_pi(tok, text, from, to, inc) \
    tt_token_pi((tok), (text), (from), (to), (inc), tc, __LINE__)
49
+
50
+ static void test_tk(TestCase *tc, void *data)
51
+ {
52
+ FrtToken *tk1 = frt_tk_new();
53
+ FrtToken *tk2 = frt_tk_new();
54
+ (void)data;
55
+
56
+ frt_tk_set_no_len(tk1, (char *)"DBalmain", 1, 8, 5);
57
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 5);
58
+ Assert(frt_tk_eq(tk1, tk2), "tokens are equal");
59
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 1);
60
+ Assert(!frt_tk_eq(tk1, tk2), "tokens are not equal");
61
+
62
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 5);
63
+ Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
64
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 0, 8, 5);
65
+ Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
66
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 7, 5);
67
+ Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
68
+
69
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 2, 7, 1);
70
+ Aiequal(-1, frt_tk_cmp(tk1, tk2));
71
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 0, 9, 1);
72
+ Aiequal(1, frt_tk_cmp(tk1, tk2));
73
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 9, 1);
74
+ Aiequal(-1, frt_tk_cmp(tk1, tk2));
75
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 7, 1);
76
+ Aiequal(1, frt_tk_cmp(tk1, tk2));
77
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 8, 1);
78
+ Aiequal(-1, frt_tk_cmp(tk1, tk2));
79
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 1);
80
+ Aiequal(1, frt_tk_cmp(tk1, tk2));
81
+
82
+ Asequal("DBalmain", tk1->text);
83
+ sprintf(tk1->text, "Hello");
84
+ Asequal("Hello", tk1->text);
85
+ Aiequal(1, tk1->start);
86
+ Aiequal(8, tk1->end);
87
+ Aiequal(5, tk1->pos_inc);
88
+ frt_tk_destroy(tk1);
89
+ frt_tk_destroy(tk2);
90
+ }
91
+
92
+ /****************************************************************************
93
+ *
94
+ * Non
95
+ *
96
+ ****************************************************************************/
97
+
98
+ static void test_non_tokenizer(TestCase *tc, void *data)
99
+ {
100
+ FrtToken *tk = frt_tk_new();
101
+ FrtTokenStream *ts = frt_non_tokenizer_new();
102
+ char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
103
+ (void)data;
104
+
105
+ ts->reset(ts, text);
106
+ test_token(frt_ts_next(ts), text, 0, strlen(text));
107
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
108
+ frt_tk_destroy(tk);
109
+ FRT_REF(ts); /* test ref_cnt */
110
+ Aiequal(2, ts->ref_cnt);
111
+ frt_ts_deref(ts);
112
+ Aiequal(1, ts->ref_cnt);
113
+ frt_ts_deref(ts);
114
+ }
115
+
116
+ static void test_non_analyzer(TestCase *tc, void *data)
117
+ {
118
+ FrtToken *tk = frt_tk_new();
119
+ FrtAnalyzer *a = frt_non_analyzer_new();
120
+ char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
121
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
122
+ (void)data;
123
+
124
+ test_token(frt_ts_next(ts), text, 0, strlen(text));
125
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
126
+ frt_tk_destroy(tk);
127
+ frt_ts_deref(ts);
128
+ frt_a_deref(a);
129
+ }
130
+
131
+ /****************************************************************************
132
+ *
133
+ * Whitespace
134
+ *
135
+ ****************************************************************************/
136
+
137
+ static void test_whitespace_tokenizer(TestCase *tc, void *data)
138
+ {
139
+ FrtToken *tk = frt_tk_new();
140
+ FrtTokenStream *ts = frt_whitespace_tokenizer_new();
141
+ char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
142
+ (void)data;
143
+
144
+ ts->reset(ts, text);
145
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
146
+ test_token(frt_ts_next(ts), "is", 19, 21);
147
+ test_token(frt_ts_next(ts), "My", 22, 24);
148
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
149
+ test_token(frt_ts_next(ts), "52", 32, 34);
150
+ test_token(frt_ts_next(ts), "#$", 37, 39);
151
+ test_token(frt_ts_next(ts), "address.", 40, 48);
152
+ test_token(frt_ts_next(ts), "23#!$", 49, 54);
153
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
154
+ frt_tk_destroy(tk);
155
+ FRT_REF(ts); /* test ref_cnt */
156
+ Aiequal(2, ts->ref_cnt);
157
+ frt_ts_deref(ts);
158
+ Aiequal(1, ts->ref_cnt);
159
+ frt_ts_deref(ts);
160
+ }
161
+
162
+ static void test_mb_whitespace_tokenizer(TestCase *tc, void *data)
163
+ {
164
+ FrtToken *t, *tk = frt_tk_new();
165
+ FrtTokenStream *ts = frt_mb_whitespace_tokenizer_new(false);
166
+ char text[100] =
167
+ "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
168
+ (void)data;
169
+
170
+ ts->reset(ts, text);
171
+ test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18);
172
+ test_token(frt_ts_next(ts), "is", 19, 21);
173
+ test_token(frt_ts_next(ts), "My", 22, 24);
174
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
175
+ test_token(frt_ts_next(ts), "52", 32, 34);
176
+ test_token(frt_ts_next(ts), "#$", 37, 39);
177
+ test_token(frt_ts_next(ts), "address.", 40, 48);
178
+ test_token(frt_ts_next(ts), "23#!$", 49, 54);
179
+ test_token(t = frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86);
180
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
181
+ ts = frt_mb_lowercase_filter_new(ts);
182
+ ts->reset(ts, text);
183
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
184
+ test_token(frt_ts_next(ts), "is", 19, 21);
185
+ test_token(frt_ts_next(ts), "my", 22, 24);
186
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
187
+ test_token(frt_ts_next(ts), "52", 32, 34);
188
+ test_token(frt_ts_next(ts), "#$", 37, 39);
189
+ test_token(frt_ts_next(ts), "address.", 40, 48);
190
+ test_token(frt_ts_next(ts), "23#!$", 49, 54);
191
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
192
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
193
+ frt_ts_deref(ts);
194
+ ts = frt_mb_whitespace_tokenizer_new(true);
195
+ ts->reset(ts, text);
196
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
197
+ test_token(frt_ts_next(ts), "is", 19, 21);
198
+ test_token(frt_ts_next(ts), "my", 22, 24);
199
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
200
+ test_token(frt_ts_next(ts), "52", 32, 34);
201
+ test_token(frt_ts_next(ts), "#$", 37, 39);
202
+ test_token(frt_ts_next(ts), "address.", 40, 48);
203
+ test_token(frt_ts_next(ts), "23#!$", 49, 54);
204
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
205
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
206
+ FRT_REF(ts); /* test ref_cnt */
207
+ Aiequal(2, ts->ref_cnt);
208
+ frt_ts_deref(ts);
209
+ Aiequal(1, ts->ref_cnt);
210
+ frt_ts_deref(ts);
211
+ frt_tk_destroy(tk);
212
+ }
213
+
214
+ static void test_whitespace_analyzer(TestCase *tc, void *data)
215
+ {
216
+ FrtToken *tk = frt_tk_new();
217
+ FrtAnalyzer *a = frt_whitespace_analyzer_new(false);
218
+ char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
219
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
220
+ (void)data;
221
+
222
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
223
+ test_token(frt_ts_next(ts), "is", 19, 21);
224
+ test_token(frt_ts_next(ts), "My", 22, 24);
225
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
226
+ test_token(frt_ts_next(ts), "52", 32, 34);
227
+ test_token(frt_ts_next(ts), "#$", 37, 39);
228
+ test_token(frt_ts_next(ts), "address.", 40, 48);
229
+ test_token(frt_ts_next(ts), "23#!$", 49, 54);
230
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
231
+ frt_tk_destroy(tk);
232
+ frt_ts_deref(ts);
233
+ frt_a_deref(a);
234
+ }
235
+
236
+ static void test_mb_whitespace_analyzer(TestCase *tc, void *data)
237
+ {
238
+ FrtToken *tk = frt_tk_new();
239
+ FrtAnalyzer *a = frt_mb_whitespace_analyzer_new(false);
240
+ char text[100] =
241
+ "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
242
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
243
+ (void)data;
244
+
245
+ test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18);
246
+ test_token(frt_ts_next(ts), "is", 19, 21);
247
+ test_token(frt_ts_next(ts), "My", 22, 24);
248
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
249
+ test_token(frt_ts_next(ts), "52", 32, 34);
250
+ test_token(frt_ts_next(ts), "#$", 37, 39);
251
+ test_token(frt_ts_next(ts), "address.", 40, 48);
252
+ test_token(frt_ts_next(ts), "23#!$", 49, 54);
253
+ test_token(frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86);
254
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
255
+ frt_ts_deref(ts);
256
+ frt_a_deref(a);
257
+ a = frt_mb_whitespace_analyzer_new(true);
258
+ ts = frt_a_get_ts(a, rb_intern("random"), text);
259
+ ts->reset(ts, text);
260
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
261
+ test_token(frt_ts_next(ts), "is", 19, 21);
262
+ test_token(frt_ts_next(ts), "my", 22, 24);
263
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
264
+ test_token(frt_ts_next(ts), "52", 32, 34);
265
+ test_token(frt_ts_next(ts), "#$", 37, 39);
266
+ test_token(frt_ts_next(ts), "address.", 40, 48);
267
+ test_token(frt_ts_next(ts), "23#!$", 49, 54);
268
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
269
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
270
+ frt_tk_destroy(tk);
271
+ frt_ts_deref(ts);
272
+ frt_a_deref(a);
273
+ }
274
+
275
+ /****************************************************************************
276
+ *
277
+ * Letter
278
+ *
279
+ ****************************************************************************/
280
+
281
+ static void test_letter_tokenizer(TestCase *tc, void *data)
282
+ {
283
+ FrtToken *tk = frt_tk_new();
284
+ FrtTokenStream *ts = frt_letter_tokenizer_new();
285
+ char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
286
+ (void)data;
287
+
288
+ ts->reset(ts, text);
289
+ test_token(frt_ts_next(ts), "DBalmain", 0, 8);
290
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
291
+ test_token(frt_ts_next(ts), "com", 15, 18);
292
+ test_token(frt_ts_next(ts), "is", 19, 21);
293
+ test_token(frt_ts_next(ts), "My", 22, 24);
294
+ test_token(frt_ts_next(ts), "e", 25, 26);
295
+ test_token(frt_ts_next(ts), "mail", 27, 31);
296
+ test_token(frt_ts_next(ts), "address", 40, 47);
297
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
298
+ frt_tk_destroy(tk);
299
+ FRT_REF(ts); /* test ref_cnt */
300
+ Aiequal(2, ts->ref_cnt);
301
+ frt_ts_deref(ts);
302
+ Aiequal(1, ts->ref_cnt);
303
+ frt_ts_deref(ts);
304
+ }
305
+
306
+ static void test_mb_letter_tokenizer(TestCase *tc, void *data)
307
+ {
308
+ FrtToken *tk = frt_tk_new();
309
+ FrtTokenStream *ts = frt_mb_letter_tokenizer_new(false);
310
+ char text[100] =
311
+ "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
312
+ (void)data;
313
+
314
+ ts->reset(ts, text);
315
+ test_token(frt_ts_next(ts), "DBalmän", 0, 8);
316
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
317
+ test_token(frt_ts_next(ts), "com", 15, 18);
318
+ test_token(frt_ts_next(ts), "is", 19, 21);
319
+ test_token(frt_ts_next(ts), "My", 22, 24);
320
+ test_token(frt_ts_next(ts), "e", 25, 26);
321
+ test_token(frt_ts_next(ts), "mail", 27, 31);
322
+ test_token(frt_ts_next(ts), "address", 40, 47);
323
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62);
324
+ test_token(frt_ts_next(ts), "ÊËÌ", 64, 70);
325
+ test_token(frt_ts_next(ts), "ÚØÃ", 72, 78);
326
+ test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86);
327
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
328
+ ts = frt_mb_lowercase_filter_new(ts);
329
+ ts->reset(ts, text);
330
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8);
331
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
332
+ test_token(frt_ts_next(ts), "com", 15, 18);
333
+ test_token(frt_ts_next(ts), "is", 19, 21);
334
+ test_token(frt_ts_next(ts), "my", 22, 24);
335
+ test_token(frt_ts_next(ts), "e", 25, 26);
336
+ test_token(frt_ts_next(ts), "mail", 27, 31);
337
+ test_token(frt_ts_next(ts), "address", 40, 47);
338
+ test_token(frt_ts_next(ts), "áägç", 55, 62);
339
+ test_token(frt_ts_next(ts), "êëì", 64, 70);
340
+ test_token(frt_ts_next(ts), "úøã", 72, 78);
341
+ test_token(frt_ts_next(ts), "öîí", 80, 86);
342
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
343
+ frt_ts_deref(ts);
344
+ ts = frt_mb_letter_tokenizer_new(true);
345
+ ts->reset(ts, text);
346
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8);
347
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
348
+ test_token(frt_ts_next(ts), "com", 15, 18);
349
+ test_token(frt_ts_next(ts), "is", 19, 21);
350
+ test_token(frt_ts_next(ts), "my", 22, 24);
351
+ test_token(frt_ts_next(ts), "e", 25, 26);
352
+ test_token(frt_ts_next(ts), "mail", 27, 31);
353
+ test_token(frt_ts_next(ts), "address", 40, 47);
354
+ test_token(frt_ts_next(ts), "áägç", 55, 62);
355
+ test_token(frt_ts_next(ts), "êëì", 64, 70);
356
+ test_token(frt_ts_next(ts), "úøã", 72, 78);
357
+ test_token(frt_ts_next(ts), "öîí", 80, 86);
358
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
359
+ FRT_REF(ts); /* test ref_cnt */
360
+ Aiequal(2, ts->ref_cnt);
361
+ frt_ts_deref(ts);
362
+ Aiequal(1, ts->ref_cnt);
363
+ frt_ts_deref(ts);
364
+ frt_tk_destroy(tk);
365
+ }
366
+
367
+ static void test_letter_analyzer(TestCase *tc, void *data)
368
+ {
369
+ FrtToken *tk = frt_tk_new();
370
+ FrtAnalyzer *a = frt_letter_analyzer_new(true);
371
+ char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
372
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
373
+ (void)data;
374
+
375
+ test_token(frt_ts_next(ts), "dbalmain", 0, 8);
376
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
377
+ test_token(frt_ts_next(ts), "com", 15, 18);
378
+ test_token(frt_ts_next(ts), "is", 19, 21);
379
+ test_token(frt_ts_next(ts), "my", 22, 24);
380
+ test_token(frt_ts_next(ts), "e", 25, 26);
381
+ test_token(frt_ts_next(ts), "mail", 27, 31);
382
+ test_token(frt_ts_next(ts), "address", 40, 47);
383
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
384
+ frt_tk_destroy(tk);
385
+ frt_ts_deref(ts);
386
+ frt_a_deref(a);
387
+ }
388
+
389
+ static void test_mb_letter_analyzer(TestCase *tc, void *data)
390
+ {
391
+ FrtToken *tk = frt_tk_new();
392
+ FrtAnalyzer *a = frt_mb_letter_analyzer_new(false);
393
+ char text[100] =
394
+ "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ "
395
+ "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
396
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
397
+ (void)data;
398
+
399
+ test_token(frt_ts_next(ts), "DBalmän", 0, 8);
400
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
401
+ test_token(frt_ts_next(ts), "com", 15, 18);
402
+ test_token(frt_ts_next(ts), "is", 19, 21);
403
+ test_token(frt_ts_next(ts), "My", 22, 24);
404
+ test_token(frt_ts_next(ts), "e", 25, 26);
405
+ test_token(frt_ts_next(ts), "mail", 27, 31);
406
+ test_token(frt_ts_next(ts), "address", 40, 47);
407
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62);
408
+ test_token(frt_ts_next(ts), "ÊËÌ", 64, 70);
409
+ test_token(frt_ts_next(ts), "ÚØÃ", 72, 78);
410
+ test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86);
411
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
412
+ frt_ts_deref(ts);
413
+ frt_a_deref(a);
414
+ a = frt_mb_letter_analyzer_new(true);
415
+ ts = frt_a_get_ts(a, rb_intern("random"), text);
416
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8);
417
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
418
+ test_token(frt_ts_next(ts), "com", 15, 18);
419
+ test_token(frt_ts_next(ts), "is", 19, 21);
420
+ test_token(frt_ts_next(ts), "my", 22, 24);
421
+ test_token(frt_ts_next(ts), "e", 25, 26);
422
+ test_token(frt_ts_next(ts), "mail", 27, 31);
423
+ test_token(frt_ts_next(ts), "address", 40, 47);
424
+ test_token(frt_ts_next(ts), "áägç", 55, 62);
425
+ test_token(frt_ts_next(ts), "êëì", 64, 70);
426
+ test_token(frt_ts_next(ts), "úøã", 72, 78);
427
+ test_token(frt_ts_next(ts), "öîí", 80, 86);
428
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
429
+ frt_a_deref(a);
430
+ frt_ts_deref(ts);
431
+ frt_tk_destroy(tk);
432
+ }
433
+
434
+
435
+ /****************************************************************************
436
+ *
437
+ * Standard
438
+ *
439
+ ****************************************************************************/
440
+
441
+ static void do_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
442
+ {
443
+ FrtToken *tk = frt_tk_new();
444
+ char text[200] =
445
+ "DBalmain@gmail.com is My e-mail -52 #$ Address. 23#!$ "
446
+ "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 "
447
+ "underscored_word, won't we're";
448
+
449
+ ts->reset(ts, text);
450
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
451
+ test_token(frt_ts_next(ts), "is", 19, 21);
452
+ test_token(frt_ts_next(ts), "My", 22, 24);
453
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
454
+ test_token(frt_ts_next(ts), "-52", 32, 35);
455
+ test_token(frt_ts_next(ts), "Address", 40, 47);
456
+ test_token(frt_ts_next(ts), "23", 49, 51);
457
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
458
+ test_token(frt_ts_next(ts), "TNT", 86, 91);
459
+ test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110);
460
+ test_token(frt_ts_next(ts), "underscored_word", 111, 127);
461
+ test_token(frt_ts_next(ts), "won't", 129, 134);
462
+ test_token(frt_ts_next(ts), "we're", 135, 140);
463
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
464
+ frt_tk_destroy(tk);
465
+ FRT_REF(ts); /* test ref_cnt */
466
+ Aiequal(2, ts->ref_cnt);
467
+ frt_ts_deref(ts);
468
+ Aiequal(1, ts->ref_cnt);
469
+ ts->reset(ts, (char *)"http://xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
470
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
471
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
472
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
473
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
474
+ test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
475
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
476
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
477
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
478
+ "xxxxxxxxxxxxxxxxxxx", 0, 280);
479
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
480
+ }
481
+
482
+ static void test_standard_tokenizer(TestCase *tc, void *data)
483
+ {
484
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
485
+ (void)data;
486
+ do_standard_tokenizer(tc, ts);
487
+ frt_ts_deref(ts);
488
+ }
489
+
490
+ static void test_legacy_standard_tokenizer(TestCase *tc, void *data)
491
+ {
492
+ FrtTokenStream *ts = frt_legacy_standard_tokenizer_new();
493
+ (void)data;
494
+ do_standard_tokenizer(tc, ts);
495
+ frt_ts_deref(ts);
496
+ }
497
+
498
+ static void do_mb_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
499
+ {
500
+ FrtToken *tk = frt_tk_new();
501
+ char text[512] =
502
+ "DBalmain@gmail.com is My e-mail -52 #$ Address. 23#!$ "
503
+ "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 "
504
+ "underscored_word, won't we're 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ "
505
+ "\200 badchar it's groups' Barnes&Noble file:///home/user/ "
506
+ "svn://www.davebalmain.com/ www,.google.com www.google.com "
507
+ "dave@balmain@gmail.com \"quoted string\" continue *star";
508
+
509
+ ts->reset(ts, text);
510
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
511
+ test_token(frt_ts_next(ts), "is", 19, 21);
512
+ test_token(frt_ts_next(ts), "My", 22, 24);
513
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
514
+ test_token(frt_ts_next(ts), "-52", 32, 35);
515
+ test_token(frt_ts_next(ts), "Address", 40, 47);
516
+ test_token(frt_ts_next(ts), "23", 49, 51);
517
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
518
+ test_token(frt_ts_next(ts), "TNT", 86, 91);
519
+ test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110);
520
+ test_token(frt_ts_next(ts), "underscored_word", 111, 127);
521
+ test_token(frt_ts_next(ts), "won't", 129, 134);
522
+ test_token(frt_ts_next(ts), "we're", 135, 140);
523
+ test_token(frt_ts_next(ts), "23", 141, 143);
524
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 147, 154);
525
+ test_token(frt_ts_next(ts), "ÊËÌ", 156, 162);
526
+ test_token(frt_ts_next(ts), "ÚØÃ", 164, 170);
527
+ test_token(frt_ts_next(ts), "ÖÎÍ", 172, 178);
528
+ test_token(frt_ts_next(ts), "badchar", 181, 188);
529
+ test_token(frt_ts_next(ts), "it", 189, 193);
530
+ test_token(frt_ts_next(ts), "groups", 194, 201);
531
+ test_token(frt_ts_next(ts), "Barnes&Noble", 202, 214);
532
+ test_token(frt_ts_next(ts), "home/user", 215, 233);
533
+ test_token(frt_ts_next(ts), "svn://www.davebalmain.com", 234, 260);
534
+ test_token(frt_ts_next(ts), "www", 261, 264);
535
+ test_token(frt_ts_next(ts), "google.com", 266, 276);
536
+ test_token(frt_ts_next(ts), "www.google.com", 277, 291);
537
+ test_token(frt_ts_next(ts), "dave@balmain", 292, 304);
538
+ test_token(frt_ts_next(ts), "gmail.com", 305, 314);
539
+ test_token(frt_ts_next(ts), "quoted", 316, 322);
540
+ test_token(frt_ts_next(ts), "string", 323, 329);
541
+ test_token(frt_ts_next(ts), "continue", 331, 339);
542
+ test_token(frt_ts_next(ts), "star", 341, 345);
543
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
544
+ frt_tk_destroy(tk);
545
+ FRT_REF(ts); /* test ref_cnt */
546
+ Aiequal(2, ts->ref_cnt);
547
+ frt_ts_deref(ts);
548
+ Aiequal(1, ts->ref_cnt);
549
+ ts->reset(ts, (char *)"http://xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
550
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
551
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
552
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
553
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
554
+ test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
555
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
556
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
557
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
558
+ "xxxxxxxxxxxxxxxxxxx", 0, 280);
559
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
560
+ ts->reset(ts, (char *)"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
561
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
562
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
563
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
564
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
565
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
566
+ test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
567
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
568
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
569
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
570
+ "xxxxxxxxxxxxxxxxxxx", 0, 348);
571
+ }
572
+
573
+ static void test_mb_standard_tokenizer(TestCase *tc, void *data)
574
+ {
575
+ FrtTokenStream *ts = frt_mb_standard_tokenizer_new();
576
+ (void)data;
577
+ do_mb_standard_tokenizer(tc, ts);
578
+ frt_ts_deref(ts);
579
+ }
580
+
581
+ static void test_mb_legacy_standard_tokenizer(TestCase *tc, void *data)
582
+ {
583
+ FrtTokenStream *ts = frt_mb_legacy_standard_tokenizer_new();
584
+ (void)data;
585
+ do_mb_standard_tokenizer(tc, ts);
586
+ frt_ts_deref(ts);
587
+ }
588
+
589
+ static void test_standard_analyzer(TestCase *tc, void *data)
590
+ {
591
+ FrtToken *tk = frt_tk_new();
592
+ FrtAnalyzer *a = frt_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
593
+ char text[200] =
594
+ "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
595
+ "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
596
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
597
+ (void)data;
598
+
599
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
600
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
601
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
602
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
603
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
604
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
605
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
606
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
607
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
608
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
609
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
610
+ frt_tk_destroy(tk);
611
+ frt_ts_deref(ts);
612
+ frt_a_deref(a);
613
+ }
614
+
615
+ static void test_mb_standard_analyzer(TestCase *tc, void *data)
616
+ {
617
+ FrtToken *tk = frt_tk_new();
618
+ FrtAnalyzer *a =
619
+ frt_mb_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, false);
620
+ const char *words[] = { "is", "the", "-23", "tnt", NULL };
621
+ char text[200] =
622
+ "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
623
+ "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ "
624
+ "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
625
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text), *ts2;
626
+ (void)data;
627
+
628
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
629
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 2);
630
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
631
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
632
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
633
+ test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3);
634
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
635
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
636
+ test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1);
637
+ test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1);
638
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
639
+ test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1);
640
+ test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1);
641
+ test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1);
642
+ test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1);
643
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
644
+ frt_ts_deref(ts);
645
+ frt_a_deref(a);
646
+ a = frt_mb_standard_analyzer_new(true);
647
+ ts = frt_a_get_ts(a, rb_intern("random"), text);
648
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
649
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
650
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
651
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
652
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
653
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
654
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
655
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
656
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
657
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
658
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
659
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
660
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
661
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
662
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
663
+ frt_ts_deref(ts);
664
+ frt_a_deref(a);
665
+ a = frt_mb_standard_analyzer_new_with_words(words, true);
666
+ ts = frt_a_get_ts(a, rb_intern("random"), text);
667
+ ts2 = frt_a_get_ts(a, rb_intern("random"), text);
668
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
669
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
670
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
671
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
672
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
673
+ test_token_pi(frt_ts_next(ts), "and", 32, 35, 1);
674
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 2);
675
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2);
676
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2);
677
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
678
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
679
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
680
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
681
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
682
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
683
+ frt_ts_deref(ts);
684
+ test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1);
685
+ test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2);
686
+ test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1);
687
+ test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0);
688
+ test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1);
689
+ test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1);
690
+ test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2);
691
+ test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2);
692
+ test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2);
693
+ test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1);
694
+ test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1);
695
+ test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1);
696
+ test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1);
697
+ test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1);
698
+ Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
699
+ ts2->ref_cnt = 3;
700
+ ts = frt_ts_clone(ts2);
701
+ Aiequal(3, ts2->ref_cnt);
702
+ Aiequal(1, ts->ref_cnt);
703
+ frt_ts_deref(ts2);
704
+ Aiequal(2, ts2->ref_cnt);
705
+ frt_ts_deref(ts2);
706
+ Aiequal(1, ts2->ref_cnt);
707
+ frt_ts_deref(ts2);
708
+ frt_ts_deref(ts);
709
+ frt_a_deref(a);
710
+ frt_tk_destroy(tk);
711
+ }
712
+
713
+ static void test_legacy_standard_analyzer(TestCase *tc, void *data)
714
+ {
715
+ FrtToken *tk = frt_tk_new();
716
+ FrtAnalyzer *a =
717
+ frt_legacy_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
718
+ char text[200] =
719
+ "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
720
+ "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
721
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
722
+ (void)data;
723
+
724
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
725
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
726
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
727
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
728
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
729
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
730
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
731
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
732
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
733
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
734
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
735
+ frt_tk_destroy(tk);
736
+ frt_ts_deref(ts);
737
+ frt_a_deref(a);
738
+ }
739
+
740
+ static void test_mb_legacy_standard_analyzer(TestCase *tc, void *data)
741
+ {
742
+ FrtToken *tk = frt_tk_new();
743
+ FrtAnalyzer *a =
744
+ frt_mb_legacy_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, false);
745
+ const char *words[] = { "is", "the", "-23", "tnt", NULL };
746
+ char text[200] =
747
+ "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
748
+ "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ "
749
+ "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
750
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text), *ts2;
751
+ (void)data;
752
+
753
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
754
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 2);
755
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
756
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
757
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
758
+ test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3);
759
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
760
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
761
+ test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1);
762
+ test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1);
763
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
764
+ test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1);
765
+ test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1);
766
+ test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1);
767
+ test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1);
768
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
769
+ frt_ts_deref(ts);
770
+ frt_a_deref(a);
771
+ a = frt_mb_legacy_standard_analyzer_new(true);
772
+ ts = frt_a_get_ts(a, rb_intern("random"), text);
773
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
774
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
775
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
776
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
777
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
778
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
779
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
780
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
781
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
782
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
783
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
784
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
785
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
786
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
787
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
788
+ frt_ts_deref(ts);
789
+ frt_a_deref(a);
790
+ a = frt_mb_legacy_standard_analyzer_new_with_words(words, true);
791
+ ts = frt_a_get_ts(a, rb_intern("random"), text);
792
+ ts2 = frt_a_get_ts(a, rb_intern("random"), text);
793
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
794
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
795
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
796
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
797
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
798
+ test_token_pi(frt_ts_next(ts), "and", 32, 35, 1);
799
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 2);
800
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2);
801
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2);
802
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
803
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
804
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
805
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
806
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
807
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
808
+ frt_ts_deref(ts);
809
+ test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1);
810
+ test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2);
811
+ test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1);
812
+ test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0);
813
+ test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1);
814
+ test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1);
815
+ test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2);
816
+ test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2);
817
+ test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2);
818
+ test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1);
819
+ test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1);
820
+ test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1);
821
+ test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1);
822
+ test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1);
823
+ Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
824
+ ts2->ref_cnt = 3;
825
+ ts = frt_ts_clone(ts2);
826
+ Aiequal(3, ts2->ref_cnt);
827
+ Aiequal(1, ts->ref_cnt);
828
+ frt_ts_deref(ts2);
829
+ Aiequal(2, ts2->ref_cnt);
830
+ frt_ts_deref(ts2);
831
+ Aiequal(1, ts2->ref_cnt);
832
+ frt_ts_deref(ts2);
833
+ frt_ts_deref(ts);
834
+ frt_a_deref(a);
835
+ frt_tk_destroy(tk);
836
+ }
837
+
838
+ static void test_long_word(TestCase *tc, void *data)
839
+ {
840
+ FrtToken *tk = frt_tk_new();
841
+ FrtAnalyzer *a = frt_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
842
+ char text[400] =
843
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
844
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
845
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
846
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
847
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" " two";
848
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
849
+ (void)data;
850
+
851
+ test_token_pi(frt_ts_next(ts), text, 0, 290, 1); /* text gets truncated anyway */
852
+ test_token_pi(frt_ts_next(ts), "two", 291, 294, 1);
853
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
854
+ frt_ts_deref(ts);
855
+ frt_a_deref(a);
856
+ a = frt_mb_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
857
+ ts = frt_a_get_ts(a, rb_intern("random"), text);
858
+ test_token_pi(frt_ts_next(ts), text, 0, 290, 1); /* text gets truncated anyway */
859
+ test_token_pi(frt_ts_next(ts), "two", 291, 294, 1);
860
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
861
+ frt_ts_deref(ts);
862
+ frt_a_deref(a);
863
+ frt_tk_destroy(tk);
864
+ }
865
+
866
+ /****************************************************************************
867
+ *
868
+ * Filters
869
+ *
870
+ ****************************************************************************/
871
+
872
+ static void test_lowercase_filter(TestCase *tc, void *data)
873
+ {
874
+ FrtToken *tk = frt_tk_new();
875
+ FrtTokenStream *ts = frt_lowercase_filter_new(frt_standard_tokenizer_new());
876
+ char text[200] =
877
+ "DBalmain@gmail.com is My e-mail 52 #$ Address. -23!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
878
+ (void)data;
879
+
880
+ ts->reset(ts, text);
881
+ test_token(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18);
882
+ test_token(frt_ts_next(ts), "is", 19, 21);
883
+ test_token(frt_ts_next(ts), "my", 22, 24);
884
+ test_token(frt_ts_next(ts), "e-mail", 25, 31);
885
+ test_token(frt_ts_next(ts), "52", 32, 34);
886
+ test_token(frt_ts_next(ts), "address", 40, 47);
887
+ test_token(frt_ts_next(ts), "-23", 49, 52);
888
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
889
+ test_token(frt_ts_next(ts), "tnt", 86, 91);
890
+ test_token(frt_ts_next(ts), "123-1235-asd-1234", 93, 110);
891
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
892
+ frt_tk_destroy(tk);
893
+ FRT_REF(ts);
894
+ Aiequal(2, ts->ref_cnt);
895
+ frt_ts_deref(ts);
896
+ Aiequal(1, ts->ref_cnt);
897
+ frt_ts_deref(ts);
898
+ }
899
+
900
+ static void test_hyphen_filter(TestCase *tc, void *data)
901
+ {
902
+ FrtToken *tk = frt_tk_new();
903
+ FrtTokenStream *ts = frt_hyphen_filter_new(frt_lowercase_filter_new(frt_standard_tokenizer_new()));
904
+ char text[200] =
905
+ "DBalmain@gmail.com is My e-mail 52 #$ Address. -23!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 long-hyph-en-at-ed-word";
906
+ (void)data;
907
+
908
+ ts->reset(ts, text);
909
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
910
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
911
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 1);
912
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
913
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
914
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
915
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
916
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 1);
917
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
918
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
919
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
920
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
921
+ test_token_pi(frt_ts_next(ts), "longhyphenatedword", 111, 134, 1);
922
+ test_token_pi(frt_ts_next(ts), "long", 111, 115, 0);
923
+ test_token_pi(frt_ts_next(ts), "hyph", 116, 120, 1);
924
+ test_token_pi(frt_ts_next(ts), "en", 121, 123, 1);
925
+ test_token_pi(frt_ts_next(ts), "at", 124, 126, 1);
926
+ test_token_pi(frt_ts_next(ts), "ed", 127, 129, 1);
927
+ test_token_pi(frt_ts_next(ts), "word", 130, 134, 1);
928
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
929
+ frt_tk_destroy(tk);
930
+ FRT_REF(ts);
931
+ Aiequal(2, ts->ref_cnt);
932
+ frt_ts_deref(ts);
933
+ Aiequal(1, ts->ref_cnt);
934
+ frt_ts_deref(ts);
935
+ }
936
+
937
+ const char *words[] = { "one", "four", "five", "seven", NULL };
938
+ static void test_stop_filter(TestCase *tc, void *data)
939
+ {
940
+ FrtToken *tk = frt_tk_new();
941
+ FrtTokenStream *ts =
942
+ frt_stop_filter_new_with_words(frt_letter_tokenizer_new(), words);
943
+ char text[200] =
944
+ "one, two, three, four, five, six, seven, eight, nine, ten.";
945
+ (void)data;
946
+
947
+ ts->reset(ts, text);
948
+ test_token_pi(frt_ts_next(ts), "two", 5, 8, 2);
949
+ test_token_pi(frt_ts_next(ts), "three", 10, 15, 1);
950
+ test_token_pi(frt_ts_next(ts), "six", 29, 32, 3);
951
+ test_token_pi(frt_ts_next(ts), "eight", 41, 46, 2);
952
+ test_token_pi(frt_ts_next(ts), "nine", 48, 52, 1);
953
+ test_token_pi(frt_ts_next(ts), "ten", 54, 57, 1);
954
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
955
+ frt_tk_destroy(tk);
956
+ FRT_REF(ts);
957
+ Aiequal(2, ts->ref_cnt);
958
+ frt_ts_deref(ts);
959
+ Aiequal(1, ts->ref_cnt);
960
+ frt_ts_deref(ts);
961
+ }
962
+
963
+ static void test_mapping_filter(TestCase *tc, void *data)
964
+ {
965
+ FrtToken *tk = frt_tk_new();
966
+ FrtTokenStream *ts = frt_mapping_filter_new(frt_letter_tokenizer_new());
967
+ char text[200] =
968
+ "one, two, three, four, five, six, seven, eight, nine, ten.";
969
+ char long_word[301] =
970
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
971
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
972
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
973
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
974
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
975
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
976
+ (void)data;
977
+
978
+ frt_mapping_filter_add(ts, "ne", "hello");
979
+ frt_mapping_filter_add(ts, "four", long_word);
980
+
981
+ ts->reset(ts, text);
982
+ test_token(frt_ts_next(ts), "ohello", 0, 3);
983
+ test_token(frt_ts_next(ts), "two", 5, 8);
984
+ test_token(frt_ts_next(ts), "three", 10, 15);
985
+ test_token(frt_ts_next(ts), long_word, 17, 21);
986
+ test_token(frt_ts_next(ts), "five", 23, 27);
987
+ test_token(frt_ts_next(ts), "six", 29, 32);
988
+ test_token(frt_ts_next(ts), "seven", 34, 39);
989
+ test_token(frt_ts_next(ts), "eight", 41, 46);
990
+ test_token(frt_ts_next(ts), "nihello", 48, 52);
991
+ test_token(frt_ts_next(ts), "ten", 54, 57);
992
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
993
+
994
+ frt_mapping_filter_add(ts, "thr", "start");
995
+ frt_mapping_filter_add(ts, "en", "goodbye");
996
+ ts->reset(ts, text);
997
+ test_token(frt_ts_next(ts), "ohello", 0, 3);
998
+ test_token(frt_ts_next(ts), "two", 5, 8);
999
+ test_token(frt_ts_next(ts), "startee", 10, 15);
1000
+ test_token(frt_ts_next(ts), long_word, 17, 21);
1001
+ test_token(frt_ts_next(ts), "five", 23, 27);
1002
+ test_token(frt_ts_next(ts), "six", 29, 32);
1003
+ test_token(frt_ts_next(ts), "sevgoodbye", 34, 39);
1004
+ test_token(frt_ts_next(ts), "eight", 41, 46);
1005
+ test_token(frt_ts_next(ts), "nihello", 48, 52);
1006
+ test_token(frt_ts_next(ts), "tgoodbye", 54, 57);
1007
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1008
+ frt_tk_destroy(tk);
1009
+ FRT_REF(ts);
1010
+ Aiequal(2, ts->ref_cnt);
1011
+ frt_ts_deref(ts);
1012
+ Aiequal(1, ts->ref_cnt);
1013
+ frt_ts_deref(ts);
1014
+ }
1015
+
1016
+ static void test_stemmer(TestCase *tc, void *data)
1017
+ {
1018
+ int stemmer_cnt = 0;
1019
+ const char **stemmers = sb_stemmer_list();
1020
+ const char **st = stemmers;
1021
+ struct sb_stemmer *stemmer;
1022
+ (void)data;
1023
+
1024
+ while (*st) {
1025
+ stemmer_cnt++;
1026
+ st++;
1027
+ }
1028
+
1029
+ stemmer = sb_stemmer_new("english", NULL);
1030
+
1031
+ Apnotnull(stemmer);
1032
+ sb_stemmer_delete(stemmer);
1033
+ Assert(stemmer_cnt >= 13, "There should be at least 10 stemmers");
1034
+ }
1035
+
1036
+ static void test_stem_filter(TestCase *tc, void *data)
1037
+ {
1038
+ FrtToken *tk = frt_tk_new();
1039
+ FrtTokenStream *ts = frt_stem_filter_new(frt_mb_letter_tokenizer_new(true),
1040
+ "english", NULL);
1041
+ FrtTokenStream *ts2;
1042
+ char text[200] = "debate debates debated debating debater";
1043
+ char text2[200] = "dêbate dêbates dêbated dêbating dêbater";
1044
+ (void)data;
1045
+
1046
+ ts->reset(ts, text);
1047
+ ts2 = frt_ts_clone(ts);
1048
+ test_token(frt_ts_next(ts), "debat", 0, 6);
1049
+ test_token(frt_ts_next(ts), "debat", 7, 14);
1050
+ test_token(frt_ts_next(ts), "debat", 15, 22);
1051
+ test_token(frt_ts_next(ts), "debat", 23, 31);
1052
+ test_token(frt_ts_next(ts), "debat", 32, 39);
1053
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1054
+ ts->reset(ts, text2);
1055
+ test_token(frt_ts_next(ts), "dêbate", 0, 7);
1056
+ test_token(frt_ts_next(ts), "dêbate", 8, 16);
1057
+ test_token(frt_ts_next(ts), "dêbate", 17, 25);
1058
+ test_token(frt_ts_next(ts), "dêbate", 26, 35);
1059
+ test_token(frt_ts_next(ts), "dêbater", 36, 44);
1060
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1061
+ FRT_REF(ts);
1062
+ Aiequal(2, ts->ref_cnt);
1063
+ frt_ts_deref(ts);
1064
+ Aiequal(1, ts->ref_cnt);
1065
+ frt_ts_deref(ts);
1066
+ test_token(frt_ts_next(ts2), "debat", 0, 6);
1067
+ test_token(frt_ts_next(ts2), "debat", 7, 14);
1068
+ test_token(frt_ts_next(ts2), "debat", 15, 22);
1069
+ test_token(frt_ts_next(ts2), "debat", 23, 31);
1070
+ test_token(frt_ts_next(ts2), "debat", 32, 39);
1071
+ Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
1072
+ frt_tk_destroy(tk);
1073
+ frt_ts_deref(ts2);
1074
+ }
1075
+
1076
+ static void test_per_field_analyzer(TestCase *tc, void *data)
1077
+ {
1078
+ FrtTokenStream *ts;
1079
+ FrtToken *tk = frt_tk_new();
1080
+ char text[100] = "DBalmain@gmail.com is My E-mail 52 #$ address. 23#!$";
1081
+ FrtAnalyzer *pfa = frt_per_field_analyzer_new(frt_standard_analyzer_new(true));
1082
+ (void)data;
1083
+
1084
+ frt_pfa_add_field(pfa, rb_intern("white"), frt_whitespace_analyzer_new(false));
1085
+ frt_pfa_add_field(pfa, rb_intern("white_l"), frt_whitespace_analyzer_new(true));
1086
+ frt_pfa_add_field(pfa, rb_intern("letter"), frt_letter_analyzer_new(false));
1087
+ frt_pfa_add_field(pfa, rb_intern("letter"), frt_letter_analyzer_new(true));
1088
+ frt_pfa_add_field(pfa, rb_intern("letter_u"), frt_letter_analyzer_new(false));
1089
+ ts = frt_a_get_ts(pfa, rb_intern("white"), text);
1090
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
1091
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
1092
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 1);
1093
+ test_token_pi(frt_ts_next(ts), "E-mail", 25, 31, 1);
1094
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1095
+ test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1);
1096
+ test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1);
1097
+ test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1);
1098
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1099
+ frt_ts_deref(ts);
1100
+ ts = frt_a_get_ts(pfa, rb_intern("white_l"), text);
1101
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
1102
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
1103
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 1);
1104
+ test_token_pi(frt_ts_next(ts), "e-mail", 25, 31, 1);
1105
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1106
+ test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1);
1107
+ test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1);
1108
+ test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1);
1109
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1110
+ frt_ts_deref(ts);
1111
+ ts = frt_a_get_ts(pfa, rb_intern("letter_u"), text);
1112
+ test_token(frt_ts_next(ts), "DBalmain", 0, 8);
1113
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
1114
+ test_token(frt_ts_next(ts), "com", 15, 18);
1115
+ test_token(frt_ts_next(ts), "is", 19, 21);
1116
+ test_token(frt_ts_next(ts), "My", 22, 24);
1117
+ test_token(frt_ts_next(ts), "E", 25, 26);
1118
+ test_token(frt_ts_next(ts), "mail", 27, 31);
1119
+ test_token(frt_ts_next(ts), "address", 40, 47);
1120
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1121
+ frt_ts_deref(ts);
1122
+ ts = frt_a_get_ts(pfa, rb_intern("letter"), text);
1123
+ test_token(frt_ts_next(ts), "dbalmain", 0, 8);
1124
+ test_token(frt_ts_next(ts), "gmail", 9, 14);
1125
+ test_token(frt_ts_next(ts), "com", 15, 18);
1126
+ test_token(frt_ts_next(ts), "is", 19, 21);
1127
+ test_token(frt_ts_next(ts), "my", 22, 24);
1128
+ test_token(frt_ts_next(ts), "e", 25, 26);
1129
+ test_token(frt_ts_next(ts), "mail", 27, 31);
1130
+ test_token(frt_ts_next(ts), "address", 40, 47);
1131
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1132
+ frt_ts_deref(ts);
1133
+ ts = frt_a_get_ts(pfa, rb_intern("XXX"), text); /* should use default analyzer */
1134
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
1135
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
1136
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
1137
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
1138
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1139
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 1);
1140
+ test_token_pi(frt_ts_next(ts), "23", 49, 51, 1);
1141
+ Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1142
+ frt_tk_destroy(tk);
1143
+ frt_ts_deref(ts);
1144
+ frt_a_deref(pfa);
1145
+ }
1146
+
1147
+ TestSuite *ts_analysis(TestSuite *suite)
1148
+ {
1149
+ bool u = false;
1150
+ char *original_locale = setlocale(LC_ALL, NULL);
1151
+ char *locale = setlocale(LC_ALL, "");
1152
+ if (locale && (strstr(locale, "utf") || strstr(locale, "UTF"))) u = true;
1153
+
1154
+ suite = ADD_SUITE(suite);
1155
+
1156
+ tst_run_test(suite, test_tk, NULL);
1157
+
1158
+ /* Non */
1159
+ tst_run_test(suite, test_non_tokenizer, NULL);
1160
+ tst_run_test(suite, test_non_analyzer, NULL);
1161
+
1162
+ /* Whitespace */
1163
+ tst_run_test(suite, test_whitespace_tokenizer, NULL);
1164
+ if (u) {
1165
+ tst_run_test(suite, test_mb_whitespace_tokenizer, NULL);
1166
+ }
1167
+
1168
+ tst_run_test(suite, test_whitespace_analyzer, NULL);
1169
+ if (u) {
1170
+ tst_run_test(suite, test_mb_whitespace_analyzer, NULL);
1171
+ }
1172
+
1173
+ /* Letter */
1174
+ tst_run_test(suite, test_letter_tokenizer, NULL);
1175
+ if (u) {
1176
+ tst_run_test(suite, test_mb_letter_tokenizer, NULL);
1177
+ }
1178
+
1179
+ tst_run_test(suite, test_letter_analyzer, NULL);
1180
+ if (u) {
1181
+ tst_run_test(suite, test_mb_letter_analyzer, NULL);
1182
+ }
1183
+
1184
+ /* Standard */
1185
+ tst_run_test(suite, test_standard_tokenizer, NULL);
1186
+ if (u) {
1187
+ tst_run_test(suite, test_mb_standard_tokenizer, NULL);
1188
+ }
1189
+ tst_run_test(suite, test_standard_analyzer, NULL);
1190
+ if (u) {
1191
+ tst_run_test(suite, test_mb_standard_analyzer, NULL);
1192
+ }
1193
+
1194
+ /* LegacyStandard */
1195
+ tst_run_test(suite, test_legacy_standard_tokenizer, NULL);
1196
+ if (u) {
1197
+ tst_run_test(suite, test_mb_legacy_standard_tokenizer, NULL);
1198
+ }
1199
+ tst_run_test(suite, test_legacy_standard_analyzer, NULL);
1200
+ if (u) {
1201
+ tst_run_test(suite, test_mb_legacy_standard_analyzer, NULL);
1202
+ }
1203
+
1204
+ tst_run_test(suite, test_long_word, NULL);
1205
+
1206
+ /* PerField */
1207
+ tst_run_test(suite, test_per_field_analyzer, NULL);
1208
+
1209
+ /* Filters */
1210
+ tst_run_test(suite, test_lowercase_filter, NULL);
1211
+ tst_run_test(suite, test_hyphen_filter, NULL);
1212
+ tst_run_test(suite, test_stop_filter, NULL);
1213
+ tst_run_test(suite, test_mapping_filter, NULL);
1214
+ tst_run_test(suite, test_stemmer, NULL);
1215
+ if (u) {
1216
+ tst_run_test(suite, test_stem_filter, NULL);
1217
+ }
1218
+
1219
+ setlocale(LC_ALL, original_locale);
1220
+ return suite;
1221
+ }