isomorfeus-ferret 0.12.7 → 0.13.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (164)
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +54 -1
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
  11. data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  47. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  48. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
  49. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  50. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  51. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  52. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  53. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  54. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  55. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  56. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  57. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  58. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  59. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  60. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  61. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  62. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  63. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  64. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  66. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  67. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  68. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  69. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  70. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  72. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  73. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  74. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  76. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  78. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  80. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  81. data/ext/isomorfeus_ferret_ext/test.c +1 -2
  82. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  83. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  84. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  85. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  86. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  87. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  88. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  89. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  90. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  91. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  92. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  93. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  94. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  95. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  96. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  97. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  98. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  99. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  100. data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
  101. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  102. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  103. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  104. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  105. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  106. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  107. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  109. data/lib/isomorfeus/ferret/version.rb +1 -1
  110. metadata +27 -57
  111. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  112. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  113. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  114. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  115. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  116. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  117. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  118. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  119. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  120. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  160. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  162. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  163. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  164. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,13 +1,14 @@
1
1
  #include "frt_analysis.h"
2
2
  #include <string.h>
3
- #include <locale.h>
4
3
  #include <libstemmer.h>
5
4
  #include "test.h"
6
5
 
7
- #define test_token(mtk, mstr, mstart, mend) \
8
- tt_token(mtk, mstr, mstart, mend, tc, __LINE__)
6
+ extern rb_encoding *utf8_encoding;
9
7
 
10
- static void tt_token(FrtToken *tk, const char *str, int start, int end, TestCase *tc, int line_num)
8
+ #define test_token(mtk, mstr, mstart, mend, menc) \
9
+ tt_token(mtk, mstr, mstart, mend, menc, tc, __LINE__)
10
+
11
+ static void tt_token(FrtToken *tk, const char *str, int start, int end, rb_encoding * enc, TestCase *tc, int line_num)
11
12
  {
12
13
  FrtToken frt_tk_exp;
13
14
  static char buf[3000];
@@ -17,7 +18,7 @@ static void tt_token(FrtToken *tk, const char *str, int start, int end, TestCase
17
18
  tst_assert(line_num, tc, false, buf);
18
19
  return;
19
20
  }
20
- if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, 1), tk)) {
21
+ if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, 1, enc), tk)) {
21
22
  sprintf(buf, "Token1[%d:%d:%s] != Token2[%d:%d:%s]\n",
22
23
  (int)tk->start, (int)tk->end, tk->text, start, end, str);
23
24
  tst_assert(line_num, tc, false, buf);
@@ -25,7 +26,7 @@ static void tt_token(FrtToken *tk, const char *str, int start, int end, TestCase
25
26
  tst_int_equal(line_num, tc, strlen(tk->text), tk->len);
26
27
  }
27
28
 
28
- static void tt_token_pi(FrtToken *tk, const char *str, int start, int end, int pi, TestCase *tc, int line_num)
29
+ static void tt_token_pi(FrtToken *tk, const char *str, int start, int end, int pi, rb_encoding *enc, TestCase *tc, int line_num)
29
30
  {
30
31
  FrtToken frt_tk_exp;
31
32
  static char buf[3000];
@@ -35,48 +36,52 @@ static void tt_token_pi(FrtToken *tk, const char *str, int start, int end, int p
35
36
  tst_assert(line_num, tc, false, buf);
36
37
  return;
37
38
  }
38
- if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, pi), tk)) {
39
- sprintf(buf, "Token1[%d:%d:%s-%d] != Token2[%d:%d:%s-%d]\n",
39
+ if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, pi, enc), tk)) {
40
+ fprintf(stderr, "Token1[%d:%d:%s-%d] != \nToken2[%d:%d:%s-%d]\n",
41
+ (int)tk->start, (int)tk->end, tk->text, tk->pos_inc,
42
+ start, end, frt_tk_exp.text, pi);
43
+ sprintf(buf, "Token1[%d:%d:%s-%d] != \nToken2[%d:%d:%s-%d]\n",
40
44
  (int)tk->start, (int)tk->end, tk->text, tk->pos_inc,
41
- start, end, str, pi);
45
+ start, end, frt_tk_exp.text, pi);
42
46
  tst_assert(line_num, tc, false, buf);
43
47
  }
44
48
  tst_int_equal(line_num, tc, strlen(tk->text), tk->len);
45
49
  }
46
50
 
47
- #define test_token_pi(mtk, mstr, mstart, mend, mpi) \
48
- tt_token_pi(mtk, mstr, mstart, mend, mpi, tc, __LINE__)
51
+ #define test_token_pi(mtk, mstr, mstart, mend, mpi, menc) \
52
+ tt_token_pi(mtk, mstr, mstart, mend, mpi, menc, tc, __LINE__)
49
53
 
50
54
  static void test_tk(TestCase *tc, void *data)
51
55
  {
52
56
  FrtToken *tk1 = frt_tk_new();
53
57
  FrtToken *tk2 = frt_tk_new();
58
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
54
59
  (void)data;
55
60
 
56
- frt_tk_set_no_len(tk1, (char *)"DBalmain", 1, 8, 5);
57
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 5);
61
+ frt_tk_set_no_len(tk1, (char *)"DBalmain", 1, 8, 5, enc);
62
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 5, enc);
58
63
  Assert(frt_tk_eq(tk1, tk2), "tokens are equal");
59
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 1);
64
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 1, enc);
60
65
  Assert(!frt_tk_eq(tk1, tk2), "tokens are not equal");
61
66
 
62
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 5);
67
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 5, enc);
63
68
  Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
64
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 0, 8, 5);
69
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 0, 8, 5, enc);
65
70
  Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
66
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 7, 5);
71
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 7, 5, enc);
67
72
  Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
68
73
 
69
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 2, 7, 1);
74
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 2, 7, 1, enc);
70
75
  Aiequal(-1, frt_tk_cmp(tk1, tk2));
71
- frt_tk_set_no_len(tk2, (char *)"EBalmain", 0, 9, 1);
76
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 0, 9, 1, enc);
72
77
  Aiequal(1, frt_tk_cmp(tk1, tk2));
73
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 9, 1);
78
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 9, 1, enc);
74
79
  Aiequal(-1, frt_tk_cmp(tk1, tk2));
75
- frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 7, 1);
80
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 7, 1, enc);
76
81
  Aiequal(1, frt_tk_cmp(tk1, tk2));
77
- frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 8, 1);
82
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 8, 1, enc);
78
83
  Aiequal(-1, frt_tk_cmp(tk1, tk2));
79
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 1);
84
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 1, enc);
80
85
  Aiequal(1, frt_tk_cmp(tk1, tk2));
81
86
 
82
87
  Asequal("DBalmain", tk1->text);
@@ -101,9 +106,9 @@ static void test_non_tokenizer(TestCase *tc, void *data)
101
106
  FrtTokenStream *ts = frt_non_tokenizer_new();
102
107
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
103
108
  (void)data;
104
-
105
- ts->reset(ts, text);
106
- test_token(frt_ts_next(ts), text, 0, strlen(text));
109
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
110
+ ts->reset(ts, text, enc);
111
+ test_token(frt_ts_next(ts), text, 0, strlen(text), enc);
107
112
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
108
113
  frt_tk_destroy(tk);
109
114
  FRT_REF(ts); /* test ref_cnt */
@@ -118,10 +123,11 @@ static void test_non_analyzer(TestCase *tc, void *data)
118
123
  FrtToken *tk = frt_tk_new();
119
124
  FrtAnalyzer *a = frt_non_analyzer_new();
120
125
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
121
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
126
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
127
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
122
128
  (void)data;
123
129
 
124
- test_token(frt_ts_next(ts), text, 0, strlen(text));
130
+ test_token(frt_ts_next(ts), text, 0, strlen(text), enc);
125
131
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
126
132
  frt_tk_destroy(tk);
127
133
  frt_ts_deref(ts);
@@ -140,16 +146,16 @@ static void test_whitespace_tokenizer(TestCase *tc, void *data)
140
146
  FrtTokenStream *ts = frt_whitespace_tokenizer_new();
141
147
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
142
148
  (void)data;
143
-
144
- ts->reset(ts, text);
145
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
146
- test_token(frt_ts_next(ts), "is", 19, 21);
147
- test_token(frt_ts_next(ts), "My", 22, 24);
148
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
149
- test_token(frt_ts_next(ts), "52", 32, 34);
150
- test_token(frt_ts_next(ts), "#$", 37, 39);
151
- test_token(frt_ts_next(ts), "address.", 40, 48);
152
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
149
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
150
+ ts->reset(ts, text, enc);
151
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
152
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
153
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
154
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
155
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
156
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
157
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
158
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
153
159
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
154
160
  frt_tk_destroy(tk);
155
161
  FRT_REF(ts); /* test ref_cnt */
@@ -161,47 +167,47 @@ static void test_whitespace_tokenizer(TestCase *tc, void *data)
161
167
 
162
168
  static void test_mb_whitespace_tokenizer(TestCase *tc, void *data)
163
169
  {
164
- FrtToken *t, *tk = frt_tk_new();
165
- FrtTokenStream *ts = frt_mb_whitespace_tokenizer_new(false);
166
- char text[100] =
167
- "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
170
+ FrtToken *tk = frt_tk_new();
171
+ FrtTokenStream *ts = frt_whitespace_tokenizer_new();
172
+ char text[100] = "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
173
+ rb_encoding *enc = utf8_encoding;
168
174
  (void)data;
169
-
170
- ts->reset(ts, text);
171
- test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18);
172
- test_token(frt_ts_next(ts), "is", 19, 21);
173
- test_token(frt_ts_next(ts), "My", 22, 24);
174
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
175
- test_token(frt_ts_next(ts), "52", 32, 34);
176
- test_token(frt_ts_next(ts), "#$", 37, 39);
177
- test_token(frt_ts_next(ts), "address.", 40, 48);
178
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
179
- test_token(t = frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86);
175
+ ts->reset(ts, text, enc);
176
+ test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18, enc);
177
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
178
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
179
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
180
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
181
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
182
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
183
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
184
+ test_token(frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86, enc);
180
185
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
181
- ts = frt_mb_lowercase_filter_new(ts);
182
- ts->reset(ts, text);
183
- test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
184
- test_token(frt_ts_next(ts), "is", 19, 21);
185
- test_token(frt_ts_next(ts), "my", 22, 24);
186
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
187
- test_token(frt_ts_next(ts), "52", 32, 34);
188
- test_token(frt_ts_next(ts), "#$", 37, 39);
189
- test_token(frt_ts_next(ts), "address.", 40, 48);
190
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
191
- test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
186
+ ts = frt_lowercase_filter_new(ts);
187
+ ts->reset(ts, text, enc);
188
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18, enc);
189
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
190
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
191
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
192
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
193
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
194
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
195
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
196
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86, enc);
192
197
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
193
198
  frt_ts_deref(ts);
194
- ts = frt_mb_whitespace_tokenizer_new(true);
195
- ts->reset(ts, text);
196
- test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
197
- test_token(frt_ts_next(ts), "is", 19, 21);
198
- test_token(frt_ts_next(ts), "my", 22, 24);
199
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
200
- test_token(frt_ts_next(ts), "52", 32, 34);
201
- test_token(frt_ts_next(ts), "#$", 37, 39);
202
- test_token(frt_ts_next(ts), "address.", 40, 48);
203
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
204
- test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
199
+ ts = frt_whitespace_tokenizer_new();
200
+ ts = frt_lowercase_filter_new(ts);
201
+ ts->reset(ts, text, enc);
202
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18, enc);
203
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
204
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
205
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
206
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
207
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
208
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
209
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
210
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86, enc);
205
211
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
206
212
  FRT_REF(ts); /* test ref_cnt */
207
213
  Aiequal(2, ts->ref_cnt);
@@ -216,17 +222,18 @@ static void test_whitespace_analyzer(TestCase *tc, void *data)
216
222
  FrtToken *tk = frt_tk_new();
217
223
  FrtAnalyzer *a = frt_whitespace_analyzer_new(false);
218
224
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
219
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
225
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
226
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
220
227
  (void)data;
221
228
 
222
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
223
- test_token(frt_ts_next(ts), "is", 19, 21);
224
- test_token(frt_ts_next(ts), "My", 22, 24);
225
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
226
- test_token(frt_ts_next(ts), "52", 32, 34);
227
- test_token(frt_ts_next(ts), "#$", 37, 39);
228
- test_token(frt_ts_next(ts), "address.", 40, 48);
229
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
229
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
230
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
231
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
232
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
233
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
234
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
235
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
236
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
230
237
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
231
238
  frt_tk_destroy(tk);
232
239
  frt_ts_deref(ts);
@@ -236,36 +243,36 @@ static void test_whitespace_analyzer(TestCase *tc, void *data)
236
243
  static void test_mb_whitespace_analyzer(TestCase *tc, void *data)
237
244
  {
238
245
  FrtToken *tk = frt_tk_new();
239
- FrtAnalyzer *a = frt_mb_whitespace_analyzer_new(false);
240
- char text[100] =
241
- "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
242
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
246
+ FrtAnalyzer *a = frt_whitespace_analyzer_new(false);
247
+ char text[100] = "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
248
+ rb_encoding *enc = utf8_encoding;
249
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
243
250
  (void)data;
244
251
 
245
- test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18);
246
- test_token(frt_ts_next(ts), "is", 19, 21);
247
- test_token(frt_ts_next(ts), "My", 22, 24);
248
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
249
- test_token(frt_ts_next(ts), "52", 32, 34);
250
- test_token(frt_ts_next(ts), "#$", 37, 39);
251
- test_token(frt_ts_next(ts), "address.", 40, 48);
252
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
253
- test_token(frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86);
252
+ test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18, enc);
253
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
254
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
255
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
256
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
257
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
258
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
259
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
260
+ test_token(frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86, enc);
254
261
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
255
262
  frt_ts_deref(ts);
256
263
  frt_a_deref(a);
257
- a = frt_mb_whitespace_analyzer_new(true);
258
- ts = frt_a_get_ts(a, rb_intern("random"), text);
259
- ts->reset(ts, text);
260
- test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
261
- test_token(frt_ts_next(ts), "is", 19, 21);
262
- test_token(frt_ts_next(ts), "my", 22, 24);
263
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
264
- test_token(frt_ts_next(ts), "52", 32, 34);
265
- test_token(frt_ts_next(ts), "#$", 37, 39);
266
- test_token(frt_ts_next(ts), "address.", 40, 48);
267
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
268
- test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
264
+ a = frt_whitespace_analyzer_new(true);
265
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
266
+ ts->reset(ts, text, enc);
267
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18, enc);
268
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
269
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
270
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
271
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
272
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
273
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
274
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
275
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86, enc);
269
276
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
270
277
  frt_tk_destroy(tk);
271
278
  frt_ts_deref(ts);
@@ -284,16 +291,16 @@ static void test_letter_tokenizer(TestCase *tc, void *data)
284
291
  FrtTokenStream *ts = frt_letter_tokenizer_new();
285
292
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
286
293
  (void)data;
287
-
288
- ts->reset(ts, text);
289
- test_token(frt_ts_next(ts), "DBalmain", 0, 8);
290
- test_token(frt_ts_next(ts), "gmail", 9, 14);
291
- test_token(frt_ts_next(ts), "com", 15, 18);
292
- test_token(frt_ts_next(ts), "is", 19, 21);
293
- test_token(frt_ts_next(ts), "My", 22, 24);
294
- test_token(frt_ts_next(ts), "e", 25, 26);
295
- test_token(frt_ts_next(ts), "mail", 27, 31);
296
- test_token(frt_ts_next(ts), "address", 40, 47);
294
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
295
+ ts->reset(ts, text, enc);
296
+ test_token(frt_ts_next(ts), "DBalmain", 0, 8, enc);
297
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
298
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
299
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
300
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
301
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
302
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
303
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
297
304
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
298
305
  frt_tk_destroy(tk);
299
306
  FRT_REF(ts); /* test ref_cnt */
@@ -306,55 +313,55 @@ static void test_letter_tokenizer(TestCase *tc, void *data)
306
313
  static void test_mb_letter_tokenizer(TestCase *tc, void *data)
307
314
  {
308
315
  FrtToken *tk = frt_tk_new();
309
- FrtTokenStream *ts = frt_mb_letter_tokenizer_new(false);
310
- char text[100] =
311
- "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
316
+ FrtTokenStream *ts = frt_letter_tokenizer_new();
317
+ char text[100] = "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
312
318
  (void)data;
313
-
314
- ts->reset(ts, text);
315
- test_token(frt_ts_next(ts), "DBalmän", 0, 8);
316
- test_token(frt_ts_next(ts), "gmail", 9, 14);
317
- test_token(frt_ts_next(ts), "com", 15, 18);
318
- test_token(frt_ts_next(ts), "is", 19, 21);
319
- test_token(frt_ts_next(ts), "My", 22, 24);
320
- test_token(frt_ts_next(ts), "e", 25, 26);
321
- test_token(frt_ts_next(ts), "mail", 27, 31);
322
- test_token(frt_ts_next(ts), "address", 40, 47);
323
- test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62);
324
- test_token(frt_ts_next(ts), "ÊËÌ", 64, 70);
325
- test_token(frt_ts_next(ts), "ÚØÃ", 72, 78);
326
- test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86);
319
+ rb_encoding *enc = utf8_encoding;
320
+ ts->reset(ts, text, enc);
321
+ test_token(frt_ts_next(ts), "DBalmän", 0, 8, enc);
322
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
323
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
324
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
325
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
326
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
327
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
328
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
329
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62, enc);
330
+ test_token(frt_ts_next(ts), "ÊËÌ", 64, 70, enc);
331
+ test_token(frt_ts_next(ts), "ÚØÃ", 72, 78, enc);
332
+ test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86, enc);
327
333
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
328
- ts = frt_mb_lowercase_filter_new(ts);
329
- ts->reset(ts, text);
330
- test_token(frt_ts_next(ts), "dbalmän", 0, 8);
331
- test_token(frt_ts_next(ts), "gmail", 9, 14);
332
- test_token(frt_ts_next(ts), "com", 15, 18);
333
- test_token(frt_ts_next(ts), "is", 19, 21);
334
- test_token(frt_ts_next(ts), "my", 22, 24);
335
- test_token(frt_ts_next(ts), "e", 25, 26);
336
- test_token(frt_ts_next(ts), "mail", 27, 31);
337
- test_token(frt_ts_next(ts), "address", 40, 47);
338
- test_token(frt_ts_next(ts), "áägç", 55, 62);
339
- test_token(frt_ts_next(ts), "êëì", 64, 70);
340
- test_token(frt_ts_next(ts), "úøã", 72, 78);
341
- test_token(frt_ts_next(ts), "öîí", 80, 86);
334
+ ts = frt_lowercase_filter_new(ts);
335
+ ts->reset(ts, text, enc);
336
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8, enc);
337
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
338
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
339
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
340
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
341
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
342
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
343
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
344
+ test_token(frt_ts_next(ts), "áägç", 55, 62, enc);
345
+ test_token(frt_ts_next(ts), "êëì", 64, 70, enc);
346
+ test_token(frt_ts_next(ts), "úøã", 72, 78, enc);
347
+ test_token(frt_ts_next(ts), "öîí", 80, 86, enc);
342
348
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
343
349
  frt_ts_deref(ts);
344
- ts = frt_mb_letter_tokenizer_new(true);
345
- ts->reset(ts, text);
346
- test_token(frt_ts_next(ts), "dbalmän", 0, 8);
347
- test_token(frt_ts_next(ts), "gmail", 9, 14);
348
- test_token(frt_ts_next(ts), "com", 15, 18);
349
- test_token(frt_ts_next(ts), "is", 19, 21);
350
- test_token(frt_ts_next(ts), "my", 22, 24);
351
- test_token(frt_ts_next(ts), "e", 25, 26);
352
- test_token(frt_ts_next(ts), "mail", 27, 31);
353
- test_token(frt_ts_next(ts), "address", 40, 47);
354
- test_token(frt_ts_next(ts), "áägç", 55, 62);
355
- test_token(frt_ts_next(ts), "êëì", 64, 70);
356
- test_token(frt_ts_next(ts), "úøã", 72, 78);
357
- test_token(frt_ts_next(ts), "öîí", 80, 86);
350
+ ts = frt_letter_tokenizer_new();
351
+ ts = frt_lowercase_filter_new(ts);
352
+ ts->reset(ts, text, enc);
353
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8, enc);
354
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
355
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
356
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
357
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
358
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
359
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
360
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
361
+ test_token(frt_ts_next(ts), "áägç", 55, 62, enc);
362
+ test_token(frt_ts_next(ts), "êëì", 64, 70, enc);
363
+ test_token(frt_ts_next(ts), "úøã", 72, 78, enc);
364
+ test_token(frt_ts_next(ts), "öîí", 80, 86, enc);
358
365
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
359
366
  FRT_REF(ts); /* test ref_cnt */
360
367
  Aiequal(2, ts->ref_cnt);
@@ -369,17 +376,18 @@ static void test_letter_analyzer(TestCase *tc, void *data)
369
376
  FrtToken *tk = frt_tk_new();
370
377
  FrtAnalyzer *a = frt_letter_analyzer_new(true);
371
378
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
372
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
379
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
380
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
373
381
  (void)data;
374
382
 
375
- test_token(frt_ts_next(ts), "dbalmain", 0, 8);
376
- test_token(frt_ts_next(ts), "gmail", 9, 14);
377
- test_token(frt_ts_next(ts), "com", 15, 18);
378
- test_token(frt_ts_next(ts), "is", 19, 21);
379
- test_token(frt_ts_next(ts), "my", 22, 24);
380
- test_token(frt_ts_next(ts), "e", 25, 26);
381
- test_token(frt_ts_next(ts), "mail", 27, 31);
382
- test_token(frt_ts_next(ts), "address", 40, 47);
383
+ test_token(frt_ts_next(ts), "dbalmain", 0, 8, enc);
384
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
385
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
386
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
387
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
388
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
389
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
390
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
383
391
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
384
392
  frt_tk_destroy(tk);
385
393
  frt_ts_deref(ts);
@@ -389,42 +397,43 @@ static void test_letter_analyzer(TestCase *tc, void *data)
389
397
  static void test_mb_letter_analyzer(TestCase *tc, void *data)
390
398
  {
391
399
  FrtToken *tk = frt_tk_new();
392
- FrtAnalyzer *a = frt_mb_letter_analyzer_new(false);
400
+ FrtAnalyzer *a = frt_letter_analyzer_new(false);
393
401
  char text[100] =
394
402
  "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ "
395
403
  "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
396
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
404
+ rb_encoding *enc = utf8_encoding;
405
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
397
406
  (void)data;
398
407
 
399
- test_token(frt_ts_next(ts), "DBalmän", 0, 8);
400
- test_token(frt_ts_next(ts), "gmail", 9, 14);
401
- test_token(frt_ts_next(ts), "com", 15, 18);
402
- test_token(frt_ts_next(ts), "is", 19, 21);
403
- test_token(frt_ts_next(ts), "My", 22, 24);
404
- test_token(frt_ts_next(ts), "e", 25, 26);
405
- test_token(frt_ts_next(ts), "mail", 27, 31);
406
- test_token(frt_ts_next(ts), "address", 40, 47);
407
- test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62);
408
- test_token(frt_ts_next(ts), "ÊËÌ", 64, 70);
409
- test_token(frt_ts_next(ts), "ÚØÃ", 72, 78);
410
- test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86);
408
+ test_token(frt_ts_next(ts), "DBalmän", 0, 8, enc);
409
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
410
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
411
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
412
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
413
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
414
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
415
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
416
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62, enc);
417
+ test_token(frt_ts_next(ts), "ÊËÌ", 64, 70, enc);
418
+ test_token(frt_ts_next(ts), "ÚØÃ", 72, 78, enc);
419
+ test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86, enc);
411
420
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
412
421
  frt_ts_deref(ts);
413
422
  frt_a_deref(a);
414
- a = frt_mb_letter_analyzer_new(true);
415
- ts = frt_a_get_ts(a, rb_intern("random"), text);
416
- test_token(frt_ts_next(ts), "dbalmän", 0, 8);
417
- test_token(frt_ts_next(ts), "gmail", 9, 14);
418
- test_token(frt_ts_next(ts), "com", 15, 18);
419
- test_token(frt_ts_next(ts), "is", 19, 21);
420
- test_token(frt_ts_next(ts), "my", 22, 24);
421
- test_token(frt_ts_next(ts), "e", 25, 26);
422
- test_token(frt_ts_next(ts), "mail", 27, 31);
423
- test_token(frt_ts_next(ts), "address", 40, 47);
424
- test_token(frt_ts_next(ts), "áägç", 55, 62);
425
- test_token(frt_ts_next(ts), "êëì", 64, 70);
426
- test_token(frt_ts_next(ts), "úøã", 72, 78);
427
- test_token(frt_ts_next(ts), "öîí", 80, 86);
423
+ a = frt_letter_analyzer_new(true);
424
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
425
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8, enc);
426
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
427
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
428
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
429
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
430
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
431
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
432
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
433
+ test_token(frt_ts_next(ts), "áägç", 55, 62, enc);
434
+ test_token(frt_ts_next(ts), "êëì", 64, 70, enc);
435
+ test_token(frt_ts_next(ts), "úøã", 72, 78, enc);
436
+ test_token(frt_ts_next(ts), "öîí", 80, 86, enc);
428
437
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
429
438
  frt_a_deref(a);
430
439
  frt_ts_deref(ts);
@@ -445,21 +454,21 @@ static void do_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
445
454
  "DBalmain@gmail.com is My e-mail -52 #$ Address. 23#!$ "
446
455
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 "
447
456
  "underscored_word, won't we're";
448
-
449
- ts->reset(ts, text);
450
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
451
- test_token(frt_ts_next(ts), "is", 19, 21);
452
- test_token(frt_ts_next(ts), "My", 22, 24);
453
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
454
- test_token(frt_ts_next(ts), "-52", 32, 35);
455
- test_token(frt_ts_next(ts), "Address", 40, 47);
456
- test_token(frt_ts_next(ts), "23", 49, 51);
457
- test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
458
- test_token(frt_ts_next(ts), "TNT", 86, 91);
459
- test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110);
460
- test_token(frt_ts_next(ts), "underscored_word", 111, 127);
461
- test_token(frt_ts_next(ts), "won't", 129, 134);
462
- test_token(frt_ts_next(ts), "we're", 135, 140);
457
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
458
+ ts->reset(ts, text, enc);
459
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
460
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
461
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
462
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
463
+ test_token(frt_ts_next(ts), "-52", 32, 35, enc);
464
+ test_token(frt_ts_next(ts), "Address", 40, 47, enc);
465
+ test_token(frt_ts_next(ts), "23", 49, 51, enc);
466
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85, enc);
467
+ test_token(frt_ts_next(ts), "TNT", 86, 91, enc);
468
+ test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, enc);
469
+ test_token(frt_ts_next(ts), "underscored_word", 111, 127, enc);
470
+ test_token(frt_ts_next(ts), "won't", 129, 134, enc);
471
+ test_token(frt_ts_next(ts), "we're", 135, 140, enc);
463
472
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
464
473
  frt_tk_destroy(tk);
465
474
  FRT_REF(ts); /* test ref_cnt */
@@ -470,12 +479,12 @@ static void do_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
470
479
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
471
480
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
472
481
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
473
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
482
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", enc);
474
483
  test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
475
484
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
476
485
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
477
486
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
478
- "xxxxxxxxxxxxxxxxxxx", 0, 280);
487
+ "xxxxxxxxxxxxxxxxxxx", 0, 280, enc);
479
488
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
480
489
  }
481
490
 
@@ -489,7 +498,7 @@ static void test_standard_tokenizer(TestCase *tc, void *data)
489
498
 
490
499
  static void test_legacy_standard_tokenizer(TestCase *tc, void *data)
491
500
  {
492
- FrtTokenStream *ts = frt_legacy_standard_tokenizer_new();
501
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
493
502
  (void)data;
494
503
  do_standard_tokenizer(tc, ts);
495
504
  frt_ts_deref(ts);
@@ -502,44 +511,44 @@ static void do_mb_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
502
511
  "DBalmain@gmail.com is My e-mail -52 #$ Address. 23#!$ "
503
512
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 "
504
513
  "underscored_word, won't we're 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ "
505
- "\200 badchar it's groups' Barnes&Noble file:///home/user/ "
514
+ " badchar it's groups' Barnes&Noble file:///home/user/ "
506
515
  "svn://www.davebalmain.com/ www,.google.com www.google.com "
507
516
  "dave@balmain@gmail.com \"quoted string\" continue *star";
508
-
509
- ts->reset(ts, text);
510
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
511
- test_token(frt_ts_next(ts), "is", 19, 21);
512
- test_token(frt_ts_next(ts), "My", 22, 24);
513
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
514
- test_token(frt_ts_next(ts), "-52", 32, 35);
515
- test_token(frt_ts_next(ts), "Address", 40, 47);
516
- test_token(frt_ts_next(ts), "23", 49, 51);
517
- test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
518
- test_token(frt_ts_next(ts), "TNT", 86, 91);
519
- test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110);
520
- test_token(frt_ts_next(ts), "underscored_word", 111, 127);
521
- test_token(frt_ts_next(ts), "won't", 129, 134);
522
- test_token(frt_ts_next(ts), "we're", 135, 140);
523
- test_token(frt_ts_next(ts), "23", 141, 143);
524
- test_token(frt_ts_next(ts), "ÁÄGÇ", 147, 154);
525
- test_token(frt_ts_next(ts), "ÊËÌ", 156, 162);
526
- test_token(frt_ts_next(ts), "ÚØÃ", 164, 170);
527
- test_token(frt_ts_next(ts), "ÖÎÍ", 172, 178);
528
- test_token(frt_ts_next(ts), "badchar", 181, 188);
529
- test_token(frt_ts_next(ts), "it", 189, 193);
530
- test_token(frt_ts_next(ts), "groups", 194, 201);
531
- test_token(frt_ts_next(ts), "Barnes&Noble", 202, 214);
532
- test_token(frt_ts_next(ts), "home/user", 215, 233);
533
- test_token(frt_ts_next(ts), "svn://www.davebalmain.com", 234, 260);
534
- test_token(frt_ts_next(ts), "www", 261, 264);
535
- test_token(frt_ts_next(ts), "google.com", 266, 276);
536
- test_token(frt_ts_next(ts), "www.google.com", 277, 291);
537
- test_token(frt_ts_next(ts), "dave@balmain", 292, 304);
538
- test_token(frt_ts_next(ts), "gmail.com", 305, 314);
539
- test_token(frt_ts_next(ts), "quoted", 316, 322);
540
- test_token(frt_ts_next(ts), "string", 323, 329);
541
- test_token(frt_ts_next(ts), "continue", 331, 339);
542
- test_token(frt_ts_next(ts), "star", 341, 345);
517
+ rb_encoding *enc = utf8_encoding;
518
+ ts->reset(ts, text, enc);
519
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
520
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
521
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
522
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
523
+ test_token(frt_ts_next(ts), "-52", 32, 35, enc);
524
+ test_token(frt_ts_next(ts), "Address", 40, 47, enc);
525
+ test_token(frt_ts_next(ts), "23", 49, 51, enc);
526
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85, enc);
527
+ test_token(frt_ts_next(ts), "TNT", 86, 91, enc);
528
+ test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, enc);
529
+ test_token(frt_ts_next(ts), "underscored_word", 111, 127, enc);
530
+ test_token(frt_ts_next(ts), "won't", 129, 134, enc);
531
+ test_token(frt_ts_next(ts), "we're", 135, 140, enc);
532
+ test_token(frt_ts_next(ts), "23", 141, 143, enc);
533
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 147, 154, enc);
534
+ test_token(frt_ts_next(ts), "ÊËÌ", 156, 162, enc);
535
+ test_token(frt_ts_next(ts), "ÚØÃ", 164, 170, enc);
536
+ test_token(frt_ts_next(ts), "ÖÎÍ", 172, 178, enc);
537
+ test_token(frt_ts_next(ts), "badchar", 181, 188, enc);
538
+ test_token(frt_ts_next(ts), "it", 189, 193, enc);
539
+ test_token(frt_ts_next(ts), "groups", 194, 201, enc);
540
+ test_token(frt_ts_next(ts), "Barnes&Noble", 202, 214, enc);
541
+ test_token(frt_ts_next(ts), "home/user", 215, 233, enc);
542
+ test_token(frt_ts_next(ts), "svn://www.davebalmain.com", 234, 260, enc);
543
+ test_token(frt_ts_next(ts), "www", 261, 264, enc);
544
+ test_token(frt_ts_next(ts), "google.com", 266, 276, enc);
545
+ test_token(frt_ts_next(ts), "www.google.com", 277, 291, enc);
546
+ test_token(frt_ts_next(ts), "dave@balmain", 292, 304, enc);
547
+ test_token(frt_ts_next(ts), "gmail.com", 305, 314, enc);
548
+ test_token(frt_ts_next(ts), "quoted", 316, 322, enc);
549
+ test_token(frt_ts_next(ts), "string", 323, 329, enc);
550
+ test_token(frt_ts_next(ts), "continue", 331, 339, enc);
551
+ test_token(frt_ts_next(ts), "star", 341, 345, enc);
543
552
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
544
553
  frt_tk_destroy(tk);
545
554
  FRT_REF(ts); /* test ref_cnt */
@@ -550,29 +559,29 @@ static void do_mb_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
550
559
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
551
560
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
552
561
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
553
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
562
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", enc);
554
563
  test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
555
564
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
556
565
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
557
566
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
558
- "xxxxxxxxxxxxxxxxxxx", 0, 280);
567
+ "xxxxxxxxxxxxxxxxxxx", 0, 280, enc);
559
568
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
560
569
  ts->reset(ts, (char *)"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
561
570
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
562
571
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
563
572
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
564
573
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
565
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
574
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", enc);
566
575
  test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
567
576
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
568
577
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
569
578
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
570
- "xxxxxxxxxxxxxxxxxxx", 0, 348);
579
+ "xxxxxxxxxxxxxxxxxxx", 0, 348, enc);
571
580
  }
572
581
 
573
582
  static void test_mb_standard_tokenizer(TestCase *tc, void *data)
574
583
  {
575
- FrtTokenStream *ts = frt_mb_standard_tokenizer_new();
584
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
576
585
  (void)data;
577
586
  do_mb_standard_tokenizer(tc, ts);
578
587
  frt_ts_deref(ts);
@@ -580,7 +589,7 @@ static void test_mb_standard_tokenizer(TestCase *tc, void *data)
580
589
 
581
590
  static void test_mb_legacy_standard_tokenizer(TestCase *tc, void *data)
582
591
  {
583
- FrtTokenStream *ts = frt_mb_legacy_standard_tokenizer_new();
592
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
584
593
  (void)data;
585
594
  do_mb_standard_tokenizer(tc, ts);
586
595
  frt_ts_deref(ts);
@@ -589,23 +598,24 @@ static void test_mb_legacy_standard_tokenizer(TestCase *tc, void *data)
589
598
  static void test_standard_analyzer(TestCase *tc, void *data)
590
599
  {
591
600
  FrtToken *tk = frt_tk_new();
592
- FrtAnalyzer *a = frt_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
601
+ FrtAnalyzer *a = frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
593
602
  char text[200] =
594
603
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
595
604
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
596
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
605
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
606
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
597
607
  (void)data;
598
608
 
599
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
600
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
601
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
602
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
603
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
604
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
605
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
606
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
607
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
608
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
609
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
610
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
611
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
612
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
613
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
614
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
615
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
616
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
617
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
618
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
609
619
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
610
620
  frt_tk_destroy(tk);
611
621
  frt_ts_deref(ts);
@@ -616,85 +626,86 @@ static void test_mb_standard_analyzer(TestCase *tc, void *data)
616
626
  {
617
627
  FrtToken *tk = frt_tk_new();
618
628
  FrtAnalyzer *a =
619
- frt_mb_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, false);
629
+ frt_standard_analyzer_new_with_words(false, FRT_ENGLISH_STOP_WORDS);
620
630
  const char *words[] = { "is", "the", "-23", "tnt", NULL };
621
631
  char text[200] =
622
632
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
623
633
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ "
624
634
  "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
625
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text), *ts2;
635
+ rb_encoding *enc = utf8_encoding;
636
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc), *ts2;
626
637
  (void)data;
627
638
 
628
- test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
629
- test_token_pi(frt_ts_next(ts), "My", 22, 24, 2);
630
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
631
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
632
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
633
- test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3);
634
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
635
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
636
- test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1);
637
- test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1);
638
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
639
- test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1);
640
- test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1);
641
- test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1);
642
- test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1);
639
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1, enc);
640
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 2, enc);
641
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
642
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
643
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
644
+ test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3, enc);
645
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
646
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
647
+ test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1, enc);
648
+ test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1, enc);
649
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
650
+ test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1, enc);
651
+ test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1, enc);
652
+ test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1, enc);
653
+ test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1, enc);
643
654
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
644
655
  frt_ts_deref(ts);
645
656
  frt_a_deref(a);
646
- a = frt_mb_standard_analyzer_new(true);
647
- ts = frt_a_get_ts(a, rb_intern("random"), text);
648
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
649
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
650
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
651
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
652
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
653
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
654
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
655
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
656
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
657
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
658
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
659
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
660
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
661
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
657
+ a = frt_standard_analyzer_new(true);
658
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
659
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
660
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3, enc);
661
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
662
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
663
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
664
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
665
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
666
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
667
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
668
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
669
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
670
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
671
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
672
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
662
673
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
663
674
  frt_ts_deref(ts);
664
675
  frt_a_deref(a);
665
- a = frt_mb_standard_analyzer_new_with_words(words, true);
666
- ts = frt_a_get_ts(a, rb_intern("random"), text);
667
- ts2 = frt_a_get_ts(a, rb_intern("random"), text);
668
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
669
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
670
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
671
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
672
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
673
- test_token_pi(frt_ts_next(ts), "and", 32, 35, 1);
674
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 2);
675
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2);
676
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2);
677
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
678
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
679
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
680
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
681
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
676
+ a = frt_standard_analyzer_new_with_words(true, words);
677
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
678
+ ts2 = frt_a_get_ts(a, rb_intern("random"), text, enc);
679
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
680
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
681
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
682
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
683
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
684
+ test_token_pi(frt_ts_next(ts), "and", 32, 35, 1, enc);
685
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 2, enc);
686
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2, enc);
687
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2, enc);
688
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
689
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
690
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
691
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
692
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
682
693
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
683
694
  frt_ts_deref(ts);
684
- test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1);
685
- test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2);
686
- test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1);
687
- test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0);
688
- test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1);
689
- test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1);
690
- test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2);
691
- test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2);
692
- test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2);
693
- test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1);
694
- test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1);
695
- test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1);
696
- test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1);
697
- test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1);
695
+ test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1, enc);
696
+ test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2, enc);
697
+ test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1, enc);
698
+ test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0, enc);
699
+ test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1, enc);
700
+ test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1, enc);
701
+ test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2, enc);
702
+ test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2, enc);
703
+ test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2, enc);
704
+ test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1, enc);
705
+ test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1, enc);
706
+ test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1, enc);
707
+ test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1, enc);
708
+ test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1, enc);
698
709
  Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
699
710
  ts2->ref_cnt = 3;
700
711
  ts = frt_ts_clone(ts2);
@@ -714,23 +725,24 @@ static void test_legacy_standard_analyzer(TestCase *tc, void *data)
714
725
  {
715
726
  FrtToken *tk = frt_tk_new();
716
727
  FrtAnalyzer *a =
717
- frt_legacy_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
728
+ frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
718
729
  char text[200] =
719
730
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
720
731
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
721
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
732
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
733
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
722
734
  (void)data;
723
735
 
724
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
725
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
726
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
727
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
728
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
729
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
730
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
731
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
732
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
733
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
736
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
737
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
738
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
739
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
740
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
741
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
742
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
743
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
744
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
745
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
734
746
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
735
747
  frt_tk_destroy(tk);
736
748
  frt_ts_deref(ts);
@@ -741,85 +753,86 @@ static void test_mb_legacy_standard_analyzer(TestCase *tc, void *data)
741
753
  {
742
754
  FrtToken *tk = frt_tk_new();
743
755
  FrtAnalyzer *a =
744
- frt_mb_legacy_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, false);
756
+ frt_standard_analyzer_new_with_words(false, FRT_ENGLISH_STOP_WORDS);
745
757
  const char *words[] = { "is", "the", "-23", "tnt", NULL };
746
758
  char text[200] =
747
759
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
748
760
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ "
749
761
  "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
750
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text), *ts2;
762
+ rb_encoding *enc = utf8_encoding;
763
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc), *ts2;
751
764
  (void)data;
752
765
 
753
- test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
754
- test_token_pi(frt_ts_next(ts), "My", 22, 24, 2);
755
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
756
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
757
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
758
- test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3);
759
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
760
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
761
- test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1);
762
- test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1);
763
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
764
- test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1);
765
- test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1);
766
- test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1);
767
- test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1);
766
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1, enc);
767
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 2, enc);
768
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
769
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
770
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
771
+ test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3, enc);
772
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
773
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
774
+ test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1, enc);
775
+ test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1, enc);
776
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
777
+ test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1, enc);
778
+ test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1, enc);
779
+ test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1, enc);
780
+ test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1, enc);
768
781
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
769
782
  frt_ts_deref(ts);
770
783
  frt_a_deref(a);
771
- a = frt_mb_legacy_standard_analyzer_new(true);
772
- ts = frt_a_get_ts(a, rb_intern("random"), text);
773
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
774
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
775
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
776
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
777
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
778
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
779
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
780
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
781
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
782
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
783
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
784
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
785
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
786
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
784
+ a = frt_standard_analyzer_new(true);
785
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
786
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
787
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3, enc);
788
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
789
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
790
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
791
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
792
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
793
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
794
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
795
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
796
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
797
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
798
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
799
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
787
800
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
788
801
  frt_ts_deref(ts);
789
802
  frt_a_deref(a);
790
- a = frt_mb_legacy_standard_analyzer_new_with_words(words, true);
791
- ts = frt_a_get_ts(a, rb_intern("random"), text);
792
- ts2 = frt_a_get_ts(a, rb_intern("random"), text);
793
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
794
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
795
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
796
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
797
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
798
- test_token_pi(frt_ts_next(ts), "and", 32, 35, 1);
799
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 2);
800
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2);
801
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2);
802
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
803
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
804
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
805
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
806
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
803
+ a = frt_standard_analyzer_new_with_words(true, words);
804
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
805
+ ts2 = frt_a_get_ts(a, rb_intern("random"), text, enc);
806
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
807
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
808
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
809
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
810
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
811
+ test_token_pi(frt_ts_next(ts), "and", 32, 35, 1, enc);
812
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 2, enc);
813
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2, enc);
814
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2, enc);
815
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
816
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
817
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
818
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
819
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
807
820
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
808
821
  frt_ts_deref(ts);
809
- test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1);
810
- test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2);
811
- test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1);
812
- test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0);
813
- test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1);
814
- test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1);
815
- test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2);
816
- test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2);
817
- test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2);
818
- test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1);
819
- test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1);
820
- test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1);
821
- test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1);
822
- test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1);
822
+ test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1, enc);
823
+ test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2, enc);
824
+ test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1, enc);
825
+ test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0, enc);
826
+ test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1, enc);
827
+ test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1, enc);
828
+ test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2, enc);
829
+ test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2, enc);
830
+ test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2, enc);
831
+ test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1, enc);
832
+ test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1, enc);
833
+ test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1, enc);
834
+ test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1, enc);
835
+ test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1, enc);
823
836
  Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
824
837
  ts2->ref_cnt = 3;
825
838
  ts = frt_ts_clone(ts2);
@@ -838,25 +851,32 @@ static void test_mb_legacy_standard_analyzer(TestCase *tc, void *data)
838
851
  static void test_long_word(TestCase *tc, void *data)
839
852
  {
840
853
  FrtToken *tk = frt_tk_new();
841
- FrtAnalyzer *a = frt_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
854
+ FrtAnalyzer *a = frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
842
855
  char text[400] =
843
856
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
844
857
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
845
858
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
846
859
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
847
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" " two";
848
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
860
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx two";
861
+ char text_a[400] =
862
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
863
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
864
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
865
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
866
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
867
+ rb_encoding *enc = utf8_encoding;
868
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
849
869
  (void)data;
850
870
 
851
- test_token_pi(frt_ts_next(ts), text, 0, 290, 1); /* text gets truncated anyway */
852
- test_token_pi(frt_ts_next(ts), "two", 291, 294, 1);
871
+ test_token_pi(frt_ts_next(ts), text_a, 0, 290, 1, enc);
872
+ test_token_pi(frt_ts_next(ts), "two", 291, 294, 1, enc);
853
873
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
854
874
  frt_ts_deref(ts);
855
875
  frt_a_deref(a);
856
- a = frt_mb_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
857
- ts = frt_a_get_ts(a, rb_intern("random"), text);
858
- test_token_pi(frt_ts_next(ts), text, 0, 290, 1); /* text gets truncated anyway */
859
- test_token_pi(frt_ts_next(ts), "two", 291, 294, 1);
876
+ a = frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
877
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
878
+ test_token_pi(frt_ts_next(ts), text_a, 0, 290, 1, enc);
879
+ test_token_pi(frt_ts_next(ts), "two", 291, 294, 1, enc);
860
880
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
861
881
  frt_ts_deref(ts);
862
882
  frt_a_deref(a);
@@ -872,22 +892,23 @@ static void test_long_word(TestCase *tc, void *data)
872
892
  static void test_lowercase_filter(TestCase *tc, void *data)
873
893
  {
874
894
  FrtToken *tk = frt_tk_new();
875
- FrtTokenStream *ts = frt_lowercase_filter_new(frt_standard_tokenizer_new());
895
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
896
+ ts = frt_lowercase_filter_new(ts);
876
897
  char text[200] =
877
898
  "DBalmain@gmail.com is My e-mail 52 #$ Address. -23!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
878
899
  (void)data;
879
-
880
- ts->reset(ts, text);
881
- test_token(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18);
882
- test_token(frt_ts_next(ts), "is", 19, 21);
883
- test_token(frt_ts_next(ts), "my", 22, 24);
884
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
885
- test_token(frt_ts_next(ts), "52", 32, 34);
886
- test_token(frt_ts_next(ts), "address", 40, 47);
887
- test_token(frt_ts_next(ts), "-23", 49, 52);
888
- test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
889
- test_token(frt_ts_next(ts), "tnt", 86, 91);
890
- test_token(frt_ts_next(ts), "123-1235-asd-1234", 93, 110);
900
+ rb_encoding *enc = utf8_encoding;
901
+ ts->reset(ts, text, enc);
902
+ test_token(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, enc);
903
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
904
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
905
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
906
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
907
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
908
+ test_token(frt_ts_next(ts), "-23", 49, 52, enc);
909
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85, enc);
910
+ test_token(frt_ts_next(ts), "tnt", 86, 91, enc);
911
+ test_token(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, enc);
891
912
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
892
913
  frt_tk_destroy(tk);
893
914
  FRT_REF(ts);
@@ -900,31 +921,33 @@ static void test_lowercase_filter(TestCase *tc, void *data)
900
921
  static void test_hyphen_filter(TestCase *tc, void *data)
901
922
  {
902
923
  FrtToken *tk = frt_tk_new();
903
- FrtTokenStream *ts = frt_hyphen_filter_new(frt_lowercase_filter_new(frt_standard_tokenizer_new()));
924
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
925
+ ts = frt_lowercase_filter_new(ts);
926
+ ts = frt_hyphen_filter_new(ts);
904
927
  char text[200] =
905
928
  "DBalmain@gmail.com is My e-mail 52 #$ Address. -23!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 long-hyph-en-at-ed-word";
906
929
  (void)data;
907
-
908
- ts->reset(ts, text);
909
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
910
- test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
911
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 1);
912
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
913
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
914
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
915
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
916
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 1);
917
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
918
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
919
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
920
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
921
- test_token_pi(frt_ts_next(ts), "longhyphenatedword", 111, 134, 1);
922
- test_token_pi(frt_ts_next(ts), "long", 111, 115, 0);
923
- test_token_pi(frt_ts_next(ts), "hyph", 116, 120, 1);
924
- test_token_pi(frt_ts_next(ts), "en", 121, 123, 1);
925
- test_token_pi(frt_ts_next(ts), "at", 124, 126, 1);
926
- test_token_pi(frt_ts_next(ts), "ed", 127, 129, 1);
927
- test_token_pi(frt_ts_next(ts), "word", 130, 134, 1);
930
+ rb_encoding *enc = utf8_encoding;
931
+ ts->reset(ts, text, enc);
932
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
933
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1, enc);
934
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 1, enc);
935
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
936
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
937
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
938
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
939
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 1, enc);
940
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
941
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
942
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
943
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
944
+ test_token_pi(frt_ts_next(ts), "longhyphenatedword", 111, 134, 1, enc);
945
+ test_token_pi(frt_ts_next(ts), "long", 111, 115, 0, enc);
946
+ test_token_pi(frt_ts_next(ts), "hyph", 116, 120, 1, enc);
947
+ test_token_pi(frt_ts_next(ts), "en", 121, 123, 1, enc);
948
+ test_token_pi(frt_ts_next(ts), "at", 124, 126, 1, enc);
949
+ test_token_pi(frt_ts_next(ts), "ed", 127, 129, 1, enc);
950
+ test_token_pi(frt_ts_next(ts), "word", 130, 134, 1, enc);
928
951
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
929
952
  frt_tk_destroy(tk);
930
953
  FRT_REF(ts);
@@ -943,14 +966,15 @@ static void test_stop_filter(TestCase *tc, void *data)
943
966
  char text[200] =
944
967
  "one, two, three, four, five, six, seven, eight, nine, ten.";
945
968
  (void)data;
946
-
947
- ts->reset(ts, text);
948
- test_token_pi(frt_ts_next(ts), "two", 5, 8, 2);
949
- test_token_pi(frt_ts_next(ts), "three", 10, 15, 1);
950
- test_token_pi(frt_ts_next(ts), "six", 29, 32, 3);
951
- test_token_pi(frt_ts_next(ts), "eight", 41, 46, 2);
952
- test_token_pi(frt_ts_next(ts), "nine", 48, 52, 1);
953
- test_token_pi(frt_ts_next(ts), "ten", 54, 57, 1);
969
+ rb_encoding *enc = utf8_encoding;
970
+
971
+ ts->reset(ts, text, enc);
972
+ test_token_pi(frt_ts_next(ts), "two", 5, 8, 2, enc);
973
+ test_token_pi(frt_ts_next(ts), "three", 10, 15, 1, enc);
974
+ test_token_pi(frt_ts_next(ts), "six", 29, 32, 3, enc);
975
+ test_token_pi(frt_ts_next(ts), "eight", 41, 46, 2, enc);
976
+ test_token_pi(frt_ts_next(ts), "nine", 48, 52, 1, enc);
977
+ test_token_pi(frt_ts_next(ts), "ten", 54, 57, 1, enc);
954
978
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
955
979
  frt_tk_destroy(tk);
956
980
  FRT_REF(ts);
@@ -974,36 +998,37 @@ static void test_mapping_filter(TestCase *tc, void *data)
974
998
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
975
999
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
976
1000
  (void)data;
1001
+ rb_encoding *enc = utf8_encoding;
977
1002
 
978
1003
  frt_mapping_filter_add(ts, "ne", "hello");
979
1004
  frt_mapping_filter_add(ts, "four", long_word);
980
1005
 
981
- ts->reset(ts, text);
982
- test_token(frt_ts_next(ts), "ohello", 0, 3);
983
- test_token(frt_ts_next(ts), "two", 5, 8);
984
- test_token(frt_ts_next(ts), "three", 10, 15);
985
- test_token(frt_ts_next(ts), long_word, 17, 21);
986
- test_token(frt_ts_next(ts), "five", 23, 27);
987
- test_token(frt_ts_next(ts), "six", 29, 32);
988
- test_token(frt_ts_next(ts), "seven", 34, 39);
989
- test_token(frt_ts_next(ts), "eight", 41, 46);
990
- test_token(frt_ts_next(ts), "nihello", 48, 52);
991
- test_token(frt_ts_next(ts), "ten", 54, 57);
1006
+ ts->reset(ts, text, enc);
1007
+ test_token(frt_ts_next(ts), "ohello", 0, 3, enc);
1008
+ test_token(frt_ts_next(ts), "two", 5, 8, enc);
1009
+ test_token(frt_ts_next(ts), "three", 10, 15, enc);
1010
+ test_token(frt_ts_next(ts), long_word, 17, 21, enc);
1011
+ test_token(frt_ts_next(ts), "five", 23, 27, enc);
1012
+ test_token(frt_ts_next(ts), "six", 29, 32, enc);
1013
+ test_token(frt_ts_next(ts), "seven", 34, 39, enc);
1014
+ test_token(frt_ts_next(ts), "eight", 41, 46, enc);
1015
+ test_token(frt_ts_next(ts), "nihello", 48, 52, enc);
1016
+ test_token(frt_ts_next(ts), "ten", 54, 57, enc);
992
1017
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
993
1018
 
994
1019
  frt_mapping_filter_add(ts, "thr", "start");
995
1020
  frt_mapping_filter_add(ts, "en", "goodbye");
996
- ts->reset(ts, text);
997
- test_token(frt_ts_next(ts), "ohello", 0, 3);
998
- test_token(frt_ts_next(ts), "two", 5, 8);
999
- test_token(frt_ts_next(ts), "startee", 10, 15);
1000
- test_token(frt_ts_next(ts), long_word, 17, 21);
1001
- test_token(frt_ts_next(ts), "five", 23, 27);
1002
- test_token(frt_ts_next(ts), "six", 29, 32);
1003
- test_token(frt_ts_next(ts), "sevgoodbye", 34, 39);
1004
- test_token(frt_ts_next(ts), "eight", 41, 46);
1005
- test_token(frt_ts_next(ts), "nihello", 48, 52);
1006
- test_token(frt_ts_next(ts), "tgoodbye", 54, 57);
1021
+ ts->reset(ts, text, enc);
1022
+ test_token(frt_ts_next(ts), "ohello", 0, 3, enc);
1023
+ test_token(frt_ts_next(ts), "two", 5, 8, enc);
1024
+ test_token(frt_ts_next(ts), "startee", 10, 15, enc);
1025
+ test_token(frt_ts_next(ts), long_word, 17, 21, enc);
1026
+ test_token(frt_ts_next(ts), "five", 23, 27, enc);
1027
+ test_token(frt_ts_next(ts), "six", 29, 32, enc);
1028
+ test_token(frt_ts_next(ts), "sevgoodbye", 34, 39, enc);
1029
+ test_token(frt_ts_next(ts), "eight", 41, 46, enc);
1030
+ test_token(frt_ts_next(ts), "nihello", 48, 52, enc);
1031
+ test_token(frt_ts_next(ts), "tgoodbye", 54, 57, enc);
1007
1032
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1008
1033
  frt_tk_destroy(tk);
1009
1034
  FRT_REF(ts);
@@ -1036,38 +1061,40 @@ static void test_stemmer(TestCase *tc, void *data)
1036
1061
  static void test_stem_filter(TestCase *tc, void *data)
1037
1062
  {
1038
1063
  FrtToken *tk = frt_tk_new();
1039
- FrtTokenStream *ts = frt_stem_filter_new(frt_mb_letter_tokenizer_new(true),
1040
- "english", NULL);
1064
+ FrtTokenStream *ts = frt_letter_tokenizer_new();
1065
+ ts = frt_lowercase_filter_new(ts);
1066
+ ts = frt_stem_filter_new(ts, "english");
1041
1067
  FrtTokenStream *ts2;
1042
1068
  char text[200] = "debate debates debated debating debater";
1043
1069
  char text2[200] = "dêbate dêbates dêbated dêbating dêbater";
1044
1070
  (void)data;
1045
1071
 
1046
- ts->reset(ts, text);
1072
+ rb_encoding *enc = utf8_encoding;
1073
+ ts->reset(ts, text, enc);
1047
1074
  ts2 = frt_ts_clone(ts);
1048
- test_token(frt_ts_next(ts), "debat", 0, 6);
1049
- test_token(frt_ts_next(ts), "debat", 7, 14);
1050
- test_token(frt_ts_next(ts), "debat", 15, 22);
1051
- test_token(frt_ts_next(ts), "debat", 23, 31);
1052
- test_token(frt_ts_next(ts), "debat", 32, 39);
1075
+ test_token(frt_ts_next(ts), "debat", 0, 6, enc);
1076
+ test_token(frt_ts_next(ts), "debat", 7, 14, enc);
1077
+ test_token(frt_ts_next(ts), "debat", 15, 22, enc);
1078
+ test_token(frt_ts_next(ts), "debat", 23, 31, enc);
1079
+ test_token(frt_ts_next(ts), "debat", 32, 39, enc);
1053
1080
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1054
- ts->reset(ts, text2);
1055
- test_token(frt_ts_next(ts), "dêbate", 0, 7);
1056
- test_token(frt_ts_next(ts), "dêbate", 8, 16);
1057
- test_token(frt_ts_next(ts), "dêbate", 17, 25);
1058
- test_token(frt_ts_next(ts), "dêbate", 26, 35);
1059
- test_token(frt_ts_next(ts), "dêbater", 36, 44);
1081
+ ts->reset(ts, text2, enc);
1082
+ test_token(frt_ts_next(ts), "dêbate", 0, 7, enc);
1083
+ test_token(frt_ts_next(ts), "dêbate", 8, 16, enc);
1084
+ test_token(frt_ts_next(ts), "dêbate", 17, 25, enc);
1085
+ test_token(frt_ts_next(ts), "dêbate", 26, 35, enc);
1086
+ test_token(frt_ts_next(ts), "dêbater", 36, 44, enc);
1060
1087
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1061
1088
  FRT_REF(ts);
1062
1089
  Aiequal(2, ts->ref_cnt);
1063
1090
  frt_ts_deref(ts);
1064
1091
  Aiequal(1, ts->ref_cnt);
1065
1092
  frt_ts_deref(ts);
1066
- test_token(frt_ts_next(ts2), "debat", 0, 6);
1067
- test_token(frt_ts_next(ts2), "debat", 7, 14);
1068
- test_token(frt_ts_next(ts2), "debat", 15, 22);
1069
- test_token(frt_ts_next(ts2), "debat", 23, 31);
1070
- test_token(frt_ts_next(ts2), "debat", 32, 39);
1093
+ test_token(frt_ts_next(ts2), "debat", 0, 6, enc);
1094
+ test_token(frt_ts_next(ts2), "debat", 7, 14, enc);
1095
+ test_token(frt_ts_next(ts2), "debat", 15, 22, enc);
1096
+ test_token(frt_ts_next(ts2), "debat", 23, 31, enc);
1097
+ test_token(frt_ts_next(ts2), "debat", 32, 39, enc);
1071
1098
  Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
1072
1099
  frt_tk_destroy(tk);
1073
1100
  frt_ts_deref(ts2);
@@ -1080,64 +1107,65 @@ static void test_per_field_analyzer(TestCase *tc, void *data)
1080
1107
  char text[100] = "DBalmain@gmail.com is My E-mail 52 #$ address. 23#!$";
1081
1108
  FrtAnalyzer *pfa = frt_per_field_analyzer_new(frt_standard_analyzer_new(true));
1082
1109
  (void)data;
1110
+ rb_encoding *enc = utf8_encoding;
1083
1111
 
1084
1112
  frt_pfa_add_field(pfa, rb_intern("white"), frt_whitespace_analyzer_new(false));
1085
1113
  frt_pfa_add_field(pfa, rb_intern("white_l"), frt_whitespace_analyzer_new(true));
1086
1114
  frt_pfa_add_field(pfa, rb_intern("letter"), frt_letter_analyzer_new(false));
1087
1115
  frt_pfa_add_field(pfa, rb_intern("letter"), frt_letter_analyzer_new(true));
1088
1116
  frt_pfa_add_field(pfa, rb_intern("letter_u"), frt_letter_analyzer_new(false));
1089
- ts = frt_a_get_ts(pfa, rb_intern("white"), text);
1090
- test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
1091
- test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
1092
- test_token_pi(frt_ts_next(ts), "My", 22, 24, 1);
1093
- test_token_pi(frt_ts_next(ts), "E-mail", 25, 31, 1);
1094
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1095
- test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1);
1096
- test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1);
1097
- test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1);
1117
+ ts = frt_a_get_ts(pfa, rb_intern("white"), text, enc);
1118
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1, enc);
1119
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1, enc);
1120
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 1, enc);
1121
+ test_token_pi(frt_ts_next(ts), "E-mail", 25, 31, 1, enc);
1122
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
1123
+ test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1, enc);
1124
+ test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1, enc);
1125
+ test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1, enc);
1098
1126
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1099
1127
  frt_ts_deref(ts);
1100
- ts = frt_a_get_ts(pfa, rb_intern("white_l"), text);
1101
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
1102
- test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
1103
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 1);
1104
- test_token_pi(frt_ts_next(ts), "e-mail", 25, 31, 1);
1105
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1106
- test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1);
1107
- test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1);
1108
- test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1);
1128
+ ts = frt_a_get_ts(pfa, rb_intern("white_l"), text, enc);
1129
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
1130
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1, enc);
1131
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 1, enc);
1132
+ test_token_pi(frt_ts_next(ts), "e-mail", 25, 31, 1, enc);
1133
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
1134
+ test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1, enc);
1135
+ test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1, enc);
1136
+ test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1, enc);
1109
1137
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1110
1138
  frt_ts_deref(ts);
1111
- ts = frt_a_get_ts(pfa, rb_intern("letter_u"), text);
1112
- test_token(frt_ts_next(ts), "DBalmain", 0, 8);
1113
- test_token(frt_ts_next(ts), "gmail", 9, 14);
1114
- test_token(frt_ts_next(ts), "com", 15, 18);
1115
- test_token(frt_ts_next(ts), "is", 19, 21);
1116
- test_token(frt_ts_next(ts), "My", 22, 24);
1117
- test_token(frt_ts_next(ts), "E", 25, 26);
1118
- test_token(frt_ts_next(ts), "mail", 27, 31);
1119
- test_token(frt_ts_next(ts), "address", 40, 47);
1139
+ ts = frt_a_get_ts(pfa, rb_intern("letter_u"), text, enc);
1140
+ test_token(frt_ts_next(ts), "DBalmain", 0, 8, enc);
1141
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
1142
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
1143
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
1144
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
1145
+ test_token(frt_ts_next(ts), "E", 25, 26, enc);
1146
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
1147
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
1120
1148
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1121
1149
  frt_ts_deref(ts);
1122
- ts = frt_a_get_ts(pfa, rb_intern("letter"), text);
1123
- test_token(frt_ts_next(ts), "dbalmain", 0, 8);
1124
- test_token(frt_ts_next(ts), "gmail", 9, 14);
1125
- test_token(frt_ts_next(ts), "com", 15, 18);
1126
- test_token(frt_ts_next(ts), "is", 19, 21);
1127
- test_token(frt_ts_next(ts), "my", 22, 24);
1128
- test_token(frt_ts_next(ts), "e", 25, 26);
1129
- test_token(frt_ts_next(ts), "mail", 27, 31);
1130
- test_token(frt_ts_next(ts), "address", 40, 47);
1150
+ ts = frt_a_get_ts(pfa, rb_intern("letter"), text, enc);
1151
+ test_token(frt_ts_next(ts), "dbalmain", 0, 8, enc);
1152
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
1153
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
1154
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
1155
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
1156
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
1157
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
1158
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
1131
1159
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1132
1160
  frt_ts_deref(ts);
1133
- ts = frt_a_get_ts(pfa, rb_intern("XXX"), text); /* should use default analyzer */
1134
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
1135
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
1136
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
1137
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
1138
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1139
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 1);
1140
- test_token_pi(frt_ts_next(ts), "23", 49, 51, 1);
1161
+ ts = frt_a_get_ts(pfa, rb_intern("XXX"), text, enc); /* should use default analyzer */
1162
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
1163
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3, enc);
1164
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
1165
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
1166
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
1167
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 1, enc);
1168
+ test_token_pi(frt_ts_next(ts), "23", 49, 51, 1, enc);
1141
1169
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1142
1170
  frt_tk_destroy(tk);
1143
1171
  frt_ts_deref(ts);
@@ -1146,11 +1174,6 @@ static void test_per_field_analyzer(TestCase *tc, void *data)
1146
1174
 
1147
1175
  TestSuite *ts_analysis(TestSuite *suite)
1148
1176
  {
1149
- bool u = false;
1150
- char *original_locale = setlocale(LC_ALL, NULL);
1151
- char *locale = setlocale(LC_ALL, "");
1152
- if (locale && (strstr(locale, "utf") || strstr(locale, "UTF"))) u = true;
1153
-
1154
1177
  suite = ADD_SUITE(suite);
1155
1178
 
1156
1179
  tst_run_test(suite, test_tk, NULL);
@@ -1161,45 +1184,31 @@ TestSuite *ts_analysis(TestSuite *suite)
1161
1184
 
1162
1185
  /* Whitespace */
1163
1186
  tst_run_test(suite, test_whitespace_tokenizer, NULL);
1164
- if (u) {
1165
- tst_run_test(suite, test_mb_whitespace_tokenizer, NULL);
1166
- }
1187
+ tst_run_test(suite, test_mb_whitespace_tokenizer, NULL);
1167
1188
 
1168
1189
  tst_run_test(suite, test_whitespace_analyzer, NULL);
1169
- if (u) {
1170
- tst_run_test(suite, test_mb_whitespace_analyzer, NULL);
1171
- }
1190
+ tst_run_test(suite, test_mb_whitespace_analyzer, NULL);
1172
1191
 
1173
1192
  /* Letter */
1174
1193
  tst_run_test(suite, test_letter_tokenizer, NULL);
1175
- if (u) {
1176
- tst_run_test(suite, test_mb_letter_tokenizer, NULL);
1177
- }
1194
+ tst_run_test(suite, test_mb_letter_tokenizer, NULL);
1178
1195
 
1179
1196
  tst_run_test(suite, test_letter_analyzer, NULL);
1180
- if (u) {
1181
- tst_run_test(suite, test_mb_letter_analyzer, NULL);
1182
- }
1197
+ tst_run_test(suite, test_mb_letter_analyzer, NULL);
1183
1198
 
1184
1199
  /* Standard */
1185
1200
  tst_run_test(suite, test_standard_tokenizer, NULL);
1186
- if (u) {
1187
- tst_run_test(suite, test_mb_standard_tokenizer, NULL);
1188
- }
1201
+ tst_run_test(suite, test_mb_standard_tokenizer, NULL);
1202
+
1189
1203
  tst_run_test(suite, test_standard_analyzer, NULL);
1190
- if (u) {
1191
- tst_run_test(suite, test_mb_standard_analyzer, NULL);
1192
- }
1204
+ tst_run_test(suite, test_mb_standard_analyzer, NULL);
1193
1205
 
1194
1206
  /* LegacyStandard */
1195
1207
  tst_run_test(suite, test_legacy_standard_tokenizer, NULL);
1196
- if (u) {
1197
- tst_run_test(suite, test_mb_legacy_standard_tokenizer, NULL);
1198
- }
1208
+ tst_run_test(suite, test_mb_legacy_standard_tokenizer, NULL);
1209
+
1199
1210
  tst_run_test(suite, test_legacy_standard_analyzer, NULL);
1200
- if (u) {
1201
- tst_run_test(suite, test_mb_legacy_standard_analyzer, NULL);
1202
- }
1211
+ tst_run_test(suite, test_mb_legacy_standard_analyzer, NULL);
1203
1212
 
1204
1213
  tst_run_test(suite, test_long_word, NULL);
1205
1214
 
@@ -1211,11 +1220,9 @@ TestSuite *ts_analysis(TestSuite *suite)
1211
1220
  tst_run_test(suite, test_hyphen_filter, NULL);
1212
1221
  tst_run_test(suite, test_stop_filter, NULL);
1213
1222
  tst_run_test(suite, test_mapping_filter, NULL);
1223
+ tst_run_test(suite, test_stem_filter, NULL);
1224
+
1214
1225
  tst_run_test(suite, test_stemmer, NULL);
1215
- if (u) {
1216
- tst_run_test(suite, test_stem_filter, NULL);
1217
- }
1218
1226
 
1219
- setlocale(LC_ALL, original_locale);
1220
1227
  return suite;
1221
1228
  }