isomorfeus-ferret 0.12.7 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (164) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +54 -1
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
  11. data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  47. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  48. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
  49. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  50. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  51. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  52. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  53. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  54. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  55. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  56. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  57. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  58. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  59. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  60. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  61. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  62. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  63. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  64. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  66. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  67. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  68. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  69. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  70. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  72. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  73. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  74. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  76. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  78. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  80. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  81. data/ext/isomorfeus_ferret_ext/test.c +1 -2
  82. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  83. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  84. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  85. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  86. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  87. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  88. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  89. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  90. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  91. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  92. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  93. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  94. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  95. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  96. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  97. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  98. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  99. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  100. data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
  101. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  102. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  103. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  104. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  105. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  106. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  107. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  109. data/lib/isomorfeus/ferret/version.rb +1 -1
  110. metadata +27 -57
  111. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  112. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  113. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  114. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  115. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  116. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  117. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  118. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  119. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  120. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  160. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  162. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  163. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  164. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,13 +1,14 @@
1
1
  #include "frt_analysis.h"
2
2
  #include <string.h>
3
- #include <locale.h>
4
3
  #include <libstemmer.h>
5
4
  #include "test.h"
6
5
 
7
- #define test_token(mtk, mstr, mstart, mend) \
8
- tt_token(mtk, mstr, mstart, mend, tc, __LINE__)
6
+ extern rb_encoding *utf8_encoding;
9
7
 
10
- static void tt_token(FrtToken *tk, const char *str, int start, int end, TestCase *tc, int line_num)
8
+ #define test_token(mtk, mstr, mstart, mend, menc) \
9
+ tt_token(mtk, mstr, mstart, mend, menc, tc, __LINE__)
10
+
11
+ static void tt_token(FrtToken *tk, const char *str, int start, int end, rb_encoding * enc, TestCase *tc, int line_num)
11
12
  {
12
13
  FrtToken frt_tk_exp;
13
14
  static char buf[3000];
@@ -17,7 +18,7 @@ static void tt_token(FrtToken *tk, const char *str, int start, int end, TestCase
17
18
  tst_assert(line_num, tc, false, buf);
18
19
  return;
19
20
  }
20
- if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, 1), tk)) {
21
+ if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, 1, enc), tk)) {
21
22
  sprintf(buf, "Token1[%d:%d:%s] != Token2[%d:%d:%s]\n",
22
23
  (int)tk->start, (int)tk->end, tk->text, start, end, str);
23
24
  tst_assert(line_num, tc, false, buf);
@@ -25,7 +26,7 @@ static void tt_token(FrtToken *tk, const char *str, int start, int end, TestCase
25
26
  tst_int_equal(line_num, tc, strlen(tk->text), tk->len);
26
27
  }
27
28
 
28
- static void tt_token_pi(FrtToken *tk, const char *str, int start, int end, int pi, TestCase *tc, int line_num)
29
+ static void tt_token_pi(FrtToken *tk, const char *str, int start, int end, int pi, rb_encoding *enc, TestCase *tc, int line_num)
29
30
  {
30
31
  FrtToken frt_tk_exp;
31
32
  static char buf[3000];
@@ -35,48 +36,52 @@ static void tt_token_pi(FrtToken *tk, const char *str, int start, int end, int p
35
36
  tst_assert(line_num, tc, false, buf);
36
37
  return;
37
38
  }
38
- if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, pi), tk)) {
39
- sprintf(buf, "Token1[%d:%d:%s-%d] != Token2[%d:%d:%s-%d]\n",
39
+ if (!frt_tk_eq(frt_tk_set(&frt_tk_exp, (char *)str, (int)strlen(str), start, end, pi, enc), tk)) {
40
+ fprintf(stderr, "Token1[%d:%d:%s-%d] != \nToken2[%d:%d:%s-%d]\n",
41
+ (int)tk->start, (int)tk->end, tk->text, tk->pos_inc,
42
+ start, end, frt_tk_exp.text, pi);
43
+ sprintf(buf, "Token1[%d:%d:%s-%d] != \nToken2[%d:%d:%s-%d]\n",
40
44
  (int)tk->start, (int)tk->end, tk->text, tk->pos_inc,
41
- start, end, str, pi);
45
+ start, end, frt_tk_exp.text, pi);
42
46
  tst_assert(line_num, tc, false, buf);
43
47
  }
44
48
  tst_int_equal(line_num, tc, strlen(tk->text), tk->len);
45
49
  }
46
50
 
47
- #define test_token_pi(mtk, mstr, mstart, mend, mpi) \
48
- tt_token_pi(mtk, mstr, mstart, mend, mpi, tc, __LINE__)
51
+ #define test_token_pi(mtk, mstr, mstart, mend, mpi, menc) \
52
+ tt_token_pi(mtk, mstr, mstart, mend, mpi, menc, tc, __LINE__)
49
53
 
50
54
  static void test_tk(TestCase *tc, void *data)
51
55
  {
52
56
  FrtToken *tk1 = frt_tk_new();
53
57
  FrtToken *tk2 = frt_tk_new();
58
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
54
59
  (void)data;
55
60
 
56
- frt_tk_set_no_len(tk1, (char *)"DBalmain", 1, 8, 5);
57
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 5);
61
+ frt_tk_set_no_len(tk1, (char *)"DBalmain", 1, 8, 5, enc);
62
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 5, enc);
58
63
  Assert(frt_tk_eq(tk1, tk2), "tokens are equal");
59
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 1);
64
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 8, 1, enc);
60
65
  Assert(!frt_tk_eq(tk1, tk2), "tokens are not equal");
61
66
 
62
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 5);
67
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 5, enc);
63
68
  Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
64
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 0, 8, 5);
69
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 0, 8, 5, enc);
65
70
  Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
66
- frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 7, 5);
71
+ frt_tk_set_no_len(tk2, (char *)"DBalmain", 1, 7, 5, enc);
67
72
  Assert(!frt_tk_eq(tk1, tk2), "tokens aren't equal");
68
73
 
69
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 2, 7, 1);
74
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 2, 7, 1, enc);
70
75
  Aiequal(-1, frt_tk_cmp(tk1, tk2));
71
- frt_tk_set_no_len(tk2, (char *)"EBalmain", 0, 9, 1);
76
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 0, 9, 1, enc);
72
77
  Aiequal(1, frt_tk_cmp(tk1, tk2));
73
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 9, 1);
78
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 9, 1, enc);
74
79
  Aiequal(-1, frt_tk_cmp(tk1, tk2));
75
- frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 7, 1);
80
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 7, 1, enc);
76
81
  Aiequal(1, frt_tk_cmp(tk1, tk2));
77
- frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 8, 1);
82
+ frt_tk_set_no_len(tk2, (char *)"EBalmain", 1, 8, 1, enc);
78
83
  Aiequal(-1, frt_tk_cmp(tk1, tk2));
79
- frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 1);
84
+ frt_tk_set_no_len(tk2, (char *)"CBalmain", 1, 8, 1, enc);
80
85
  Aiequal(1, frt_tk_cmp(tk1, tk2));
81
86
 
82
87
  Asequal("DBalmain", tk1->text);
@@ -101,9 +106,9 @@ static void test_non_tokenizer(TestCase *tc, void *data)
101
106
  FrtTokenStream *ts = frt_non_tokenizer_new();
102
107
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
103
108
  (void)data;
104
-
105
- ts->reset(ts, text);
106
- test_token(frt_ts_next(ts), text, 0, strlen(text));
109
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
110
+ ts->reset(ts, text, enc);
111
+ test_token(frt_ts_next(ts), text, 0, strlen(text), enc);
107
112
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
108
113
  frt_tk_destroy(tk);
109
114
  FRT_REF(ts); /* test ref_cnt */
@@ -118,10 +123,11 @@ static void test_non_analyzer(TestCase *tc, void *data)
118
123
  FrtToken *tk = frt_tk_new();
119
124
  FrtAnalyzer *a = frt_non_analyzer_new();
120
125
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
121
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
126
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
127
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
122
128
  (void)data;
123
129
 
124
- test_token(frt_ts_next(ts), text, 0, strlen(text));
130
+ test_token(frt_ts_next(ts), text, 0, strlen(text), enc);
125
131
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
126
132
  frt_tk_destroy(tk);
127
133
  frt_ts_deref(ts);
@@ -140,16 +146,16 @@ static void test_whitespace_tokenizer(TestCase *tc, void *data)
140
146
  FrtTokenStream *ts = frt_whitespace_tokenizer_new();
141
147
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
142
148
  (void)data;
143
-
144
- ts->reset(ts, text);
145
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
146
- test_token(frt_ts_next(ts), "is", 19, 21);
147
- test_token(frt_ts_next(ts), "My", 22, 24);
148
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
149
- test_token(frt_ts_next(ts), "52", 32, 34);
150
- test_token(frt_ts_next(ts), "#$", 37, 39);
151
- test_token(frt_ts_next(ts), "address.", 40, 48);
152
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
149
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
150
+ ts->reset(ts, text, enc);
151
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
152
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
153
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
154
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
155
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
156
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
157
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
158
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
153
159
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
154
160
  frt_tk_destroy(tk);
155
161
  FRT_REF(ts); /* test ref_cnt */
@@ -161,47 +167,47 @@ static void test_whitespace_tokenizer(TestCase *tc, void *data)
161
167
 
162
168
  static void test_mb_whitespace_tokenizer(TestCase *tc, void *data)
163
169
  {
164
- FrtToken *t, *tk = frt_tk_new();
165
- FrtTokenStream *ts = frt_mb_whitespace_tokenizer_new(false);
166
- char text[100] =
167
- "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
170
+ FrtToken *tk = frt_tk_new();
171
+ FrtTokenStream *ts = frt_whitespace_tokenizer_new();
172
+ char text[100] = "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
173
+ rb_encoding *enc = utf8_encoding;
168
174
  (void)data;
169
-
170
- ts->reset(ts, text);
171
- test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18);
172
- test_token(frt_ts_next(ts), "is", 19, 21);
173
- test_token(frt_ts_next(ts), "My", 22, 24);
174
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
175
- test_token(frt_ts_next(ts), "52", 32, 34);
176
- test_token(frt_ts_next(ts), "#$", 37, 39);
177
- test_token(frt_ts_next(ts), "address.", 40, 48);
178
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
179
- test_token(t = frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86);
175
+ ts->reset(ts, text, enc);
176
+ test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18, enc);
177
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
178
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
179
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
180
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
181
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
182
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
183
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
184
+ test_token(frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86, enc);
180
185
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
181
- ts = frt_mb_lowercase_filter_new(ts);
182
- ts->reset(ts, text);
183
- test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
184
- test_token(frt_ts_next(ts), "is", 19, 21);
185
- test_token(frt_ts_next(ts), "my", 22, 24);
186
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
187
- test_token(frt_ts_next(ts), "52", 32, 34);
188
- test_token(frt_ts_next(ts), "#$", 37, 39);
189
- test_token(frt_ts_next(ts), "address.", 40, 48);
190
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
191
- test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
186
+ ts = frt_lowercase_filter_new(ts);
187
+ ts->reset(ts, text, enc);
188
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18, enc);
189
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
190
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
191
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
192
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
193
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
194
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
195
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
196
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86, enc);
192
197
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
193
198
  frt_ts_deref(ts);
194
- ts = frt_mb_whitespace_tokenizer_new(true);
195
- ts->reset(ts, text);
196
- test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
197
- test_token(frt_ts_next(ts), "is", 19, 21);
198
- test_token(frt_ts_next(ts), "my", 22, 24);
199
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
200
- test_token(frt_ts_next(ts), "52", 32, 34);
201
- test_token(frt_ts_next(ts), "#$", 37, 39);
202
- test_token(frt_ts_next(ts), "address.", 40, 48);
203
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
204
- test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
199
+ ts = frt_whitespace_tokenizer_new();
200
+ ts = frt_lowercase_filter_new(ts);
201
+ ts->reset(ts, text, enc);
202
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18, enc);
203
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
204
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
205
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
206
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
207
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
208
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
209
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
210
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86, enc);
205
211
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
206
212
  FRT_REF(ts); /* test ref_cnt */
207
213
  Aiequal(2, ts->ref_cnt);
@@ -216,17 +222,18 @@ static void test_whitespace_analyzer(TestCase *tc, void *data)
216
222
  FrtToken *tk = frt_tk_new();
217
223
  FrtAnalyzer *a = frt_whitespace_analyzer_new(false);
218
224
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
219
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
225
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
226
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
220
227
  (void)data;
221
228
 
222
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
223
- test_token(frt_ts_next(ts), "is", 19, 21);
224
- test_token(frt_ts_next(ts), "My", 22, 24);
225
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
226
- test_token(frt_ts_next(ts), "52", 32, 34);
227
- test_token(frt_ts_next(ts), "#$", 37, 39);
228
- test_token(frt_ts_next(ts), "address.", 40, 48);
229
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
229
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
230
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
231
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
232
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
233
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
234
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
235
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
236
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
230
237
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
231
238
  frt_tk_destroy(tk);
232
239
  frt_ts_deref(ts);
@@ -236,36 +243,36 @@ static void test_whitespace_analyzer(TestCase *tc, void *data)
236
243
  static void test_mb_whitespace_analyzer(TestCase *tc, void *data)
237
244
  {
238
245
  FrtToken *tk = frt_tk_new();
239
- FrtAnalyzer *a = frt_mb_whitespace_analyzer_new(false);
240
- char text[100] =
241
- "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
242
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
246
+ FrtAnalyzer *a = frt_whitespace_analyzer_new(false);
247
+ char text[100] = "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
248
+ rb_encoding *enc = utf8_encoding;
249
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
243
250
  (void)data;
244
251
 
245
- test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18);
246
- test_token(frt_ts_next(ts), "is", 19, 21);
247
- test_token(frt_ts_next(ts), "My", 22, 24);
248
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
249
- test_token(frt_ts_next(ts), "52", 32, 34);
250
- test_token(frt_ts_next(ts), "#$", 37, 39);
251
- test_token(frt_ts_next(ts), "address.", 40, 48);
252
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
253
- test_token(frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86);
252
+ test_token(frt_ts_next(ts), "DBalmän@gmail.com", 0, 18, enc);
253
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
254
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
255
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
256
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
257
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
258
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
259
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
260
+ test_token(frt_ts_next(ts), "ÁÄGÇ®ÊË̯ÚØìÖÎÍ", 55, 86, enc);
254
261
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
255
262
  frt_ts_deref(ts);
256
263
  frt_a_deref(a);
257
- a = frt_mb_whitespace_analyzer_new(true);
258
- ts = frt_a_get_ts(a, rb_intern("random"), text);
259
- ts->reset(ts, text);
260
- test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18);
261
- test_token(frt_ts_next(ts), "is", 19, 21);
262
- test_token(frt_ts_next(ts), "my", 22, 24);
263
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
264
- test_token(frt_ts_next(ts), "52", 32, 34);
265
- test_token(frt_ts_next(ts), "#$", 37, 39);
266
- test_token(frt_ts_next(ts), "address.", 40, 48);
267
- test_token(frt_ts_next(ts), "23#!$", 49, 54);
268
- test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86);
264
+ a = frt_whitespace_analyzer_new(true);
265
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
266
+ ts->reset(ts, text, enc);
267
+ test_token(frt_ts_next(ts), "dbalmän@gmail.com", 0, 18, enc);
268
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
269
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
270
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
271
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
272
+ test_token(frt_ts_next(ts), "#$", 37, 39, enc);
273
+ test_token(frt_ts_next(ts), "address.", 40, 48, enc);
274
+ test_token(frt_ts_next(ts), "23#!$", 49, 54, enc);
275
+ test_token(frt_ts_next(ts), "áägç®êëì¯úøã¬öîí", 55, 86, enc);
269
276
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
270
277
  frt_tk_destroy(tk);
271
278
  frt_ts_deref(ts);
@@ -284,16 +291,16 @@ static void test_letter_tokenizer(TestCase *tc, void *data)
284
291
  FrtTokenStream *ts = frt_letter_tokenizer_new();
285
292
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
286
293
  (void)data;
287
-
288
- ts->reset(ts, text);
289
- test_token(frt_ts_next(ts), "DBalmain", 0, 8);
290
- test_token(frt_ts_next(ts), "gmail", 9, 14);
291
- test_token(frt_ts_next(ts), "com", 15, 18);
292
- test_token(frt_ts_next(ts), "is", 19, 21);
293
- test_token(frt_ts_next(ts), "My", 22, 24);
294
- test_token(frt_ts_next(ts), "e", 25, 26);
295
- test_token(frt_ts_next(ts), "mail", 27, 31);
296
- test_token(frt_ts_next(ts), "address", 40, 47);
294
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
295
+ ts->reset(ts, text, enc);
296
+ test_token(frt_ts_next(ts), "DBalmain", 0, 8, enc);
297
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
298
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
299
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
300
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
301
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
302
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
303
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
297
304
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
298
305
  frt_tk_destroy(tk);
299
306
  FRT_REF(ts); /* test ref_cnt */
@@ -306,55 +313,55 @@ static void test_letter_tokenizer(TestCase *tc, void *data)
306
313
  static void test_mb_letter_tokenizer(TestCase *tc, void *data)
307
314
  {
308
315
  FrtToken *tk = frt_tk_new();
309
- FrtTokenStream *ts = frt_mb_letter_tokenizer_new(false);
310
- char text[100] =
311
- "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
316
+ FrtTokenStream *ts = frt_letter_tokenizer_new();
317
+ char text[100] = "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
312
318
  (void)data;
313
-
314
- ts->reset(ts, text);
315
- test_token(frt_ts_next(ts), "DBalmän", 0, 8);
316
- test_token(frt_ts_next(ts), "gmail", 9, 14);
317
- test_token(frt_ts_next(ts), "com", 15, 18);
318
- test_token(frt_ts_next(ts), "is", 19, 21);
319
- test_token(frt_ts_next(ts), "My", 22, 24);
320
- test_token(frt_ts_next(ts), "e", 25, 26);
321
- test_token(frt_ts_next(ts), "mail", 27, 31);
322
- test_token(frt_ts_next(ts), "address", 40, 47);
323
- test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62);
324
- test_token(frt_ts_next(ts), "ÊËÌ", 64, 70);
325
- test_token(frt_ts_next(ts), "ÚØÃ", 72, 78);
326
- test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86);
319
+ rb_encoding *enc = utf8_encoding;
320
+ ts->reset(ts, text, enc);
321
+ test_token(frt_ts_next(ts), "DBalmän", 0, 8, enc);
322
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
323
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
324
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
325
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
326
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
327
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
328
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
329
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62, enc);
330
+ test_token(frt_ts_next(ts), "ÊËÌ", 64, 70, enc);
331
+ test_token(frt_ts_next(ts), "ÚØÃ", 72, 78, enc);
332
+ test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86, enc);
327
333
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
328
- ts = frt_mb_lowercase_filter_new(ts);
329
- ts->reset(ts, text);
330
- test_token(frt_ts_next(ts), "dbalmän", 0, 8);
331
- test_token(frt_ts_next(ts), "gmail", 9, 14);
332
- test_token(frt_ts_next(ts), "com", 15, 18);
333
- test_token(frt_ts_next(ts), "is", 19, 21);
334
- test_token(frt_ts_next(ts), "my", 22, 24);
335
- test_token(frt_ts_next(ts), "e", 25, 26);
336
- test_token(frt_ts_next(ts), "mail", 27, 31);
337
- test_token(frt_ts_next(ts), "address", 40, 47);
338
- test_token(frt_ts_next(ts), "áägç", 55, 62);
339
- test_token(frt_ts_next(ts), "êëì", 64, 70);
340
- test_token(frt_ts_next(ts), "úøã", 72, 78);
341
- test_token(frt_ts_next(ts), "öîí", 80, 86);
334
+ ts = frt_lowercase_filter_new(ts);
335
+ ts->reset(ts, text, enc);
336
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8, enc);
337
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
338
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
339
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
340
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
341
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
342
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
343
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
344
+ test_token(frt_ts_next(ts), "áägç", 55, 62, enc);
345
+ test_token(frt_ts_next(ts), "êëì", 64, 70, enc);
346
+ test_token(frt_ts_next(ts), "úøã", 72, 78, enc);
347
+ test_token(frt_ts_next(ts), "öîí", 80, 86, enc);
342
348
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
343
349
  frt_ts_deref(ts);
344
- ts = frt_mb_letter_tokenizer_new(true);
345
- ts->reset(ts, text);
346
- test_token(frt_ts_next(ts), "dbalmän", 0, 8);
347
- test_token(frt_ts_next(ts), "gmail", 9, 14);
348
- test_token(frt_ts_next(ts), "com", 15, 18);
349
- test_token(frt_ts_next(ts), "is", 19, 21);
350
- test_token(frt_ts_next(ts), "my", 22, 24);
351
- test_token(frt_ts_next(ts), "e", 25, 26);
352
- test_token(frt_ts_next(ts), "mail", 27, 31);
353
- test_token(frt_ts_next(ts), "address", 40, 47);
354
- test_token(frt_ts_next(ts), "áägç", 55, 62);
355
- test_token(frt_ts_next(ts), "êëì", 64, 70);
356
- test_token(frt_ts_next(ts), "úøã", 72, 78);
357
- test_token(frt_ts_next(ts), "öîí", 80, 86);
350
+ ts = frt_letter_tokenizer_new();
351
+ ts = frt_lowercase_filter_new(ts);
352
+ ts->reset(ts, text, enc);
353
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8, enc);
354
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
355
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
356
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
357
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
358
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
359
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
360
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
361
+ test_token(frt_ts_next(ts), "áägç", 55, 62, enc);
362
+ test_token(frt_ts_next(ts), "êëì", 64, 70, enc);
363
+ test_token(frt_ts_next(ts), "úøã", 72, 78, enc);
364
+ test_token(frt_ts_next(ts), "öîí", 80, 86, enc);
358
365
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
359
366
  FRT_REF(ts); /* test ref_cnt */
360
367
  Aiequal(2, ts->ref_cnt);
@@ -369,17 +376,18 @@ static void test_letter_analyzer(TestCase *tc, void *data)
369
376
  FrtToken *tk = frt_tk_new();
370
377
  FrtAnalyzer *a = frt_letter_analyzer_new(true);
371
378
  char text[100] = "DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$";
372
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
379
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
380
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
373
381
  (void)data;
374
382
 
375
- test_token(frt_ts_next(ts), "dbalmain", 0, 8);
376
- test_token(frt_ts_next(ts), "gmail", 9, 14);
377
- test_token(frt_ts_next(ts), "com", 15, 18);
378
- test_token(frt_ts_next(ts), "is", 19, 21);
379
- test_token(frt_ts_next(ts), "my", 22, 24);
380
- test_token(frt_ts_next(ts), "e", 25, 26);
381
- test_token(frt_ts_next(ts), "mail", 27, 31);
382
- test_token(frt_ts_next(ts), "address", 40, 47);
383
+ test_token(frt_ts_next(ts), "dbalmain", 0, 8, enc);
384
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
385
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
386
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
387
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
388
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
389
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
390
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
383
391
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
384
392
  frt_tk_destroy(tk);
385
393
  frt_ts_deref(ts);
@@ -389,42 +397,43 @@ static void test_letter_analyzer(TestCase *tc, void *data)
389
397
  static void test_mb_letter_analyzer(TestCase *tc, void *data)
390
398
  {
391
399
  FrtToken *tk = frt_tk_new();
392
- FrtAnalyzer *a = frt_mb_letter_analyzer_new(false);
400
+ FrtAnalyzer *a = frt_letter_analyzer_new(false);
393
401
  char text[100] =
394
402
  "DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ "
395
403
  "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
396
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
404
+ rb_encoding *enc = utf8_encoding;
405
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
397
406
  (void)data;
398
407
 
399
- test_token(frt_ts_next(ts), "DBalmän", 0, 8);
400
- test_token(frt_ts_next(ts), "gmail", 9, 14);
401
- test_token(frt_ts_next(ts), "com", 15, 18);
402
- test_token(frt_ts_next(ts), "is", 19, 21);
403
- test_token(frt_ts_next(ts), "My", 22, 24);
404
- test_token(frt_ts_next(ts), "e", 25, 26);
405
- test_token(frt_ts_next(ts), "mail", 27, 31);
406
- test_token(frt_ts_next(ts), "address", 40, 47);
407
- test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62);
408
- test_token(frt_ts_next(ts), "ÊËÌ", 64, 70);
409
- test_token(frt_ts_next(ts), "ÚØÃ", 72, 78);
410
- test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86);
408
+ test_token(frt_ts_next(ts), "DBalmän", 0, 8, enc);
409
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
410
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
411
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
412
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
413
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
414
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
415
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
416
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 55, 62, enc);
417
+ test_token(frt_ts_next(ts), "ÊËÌ", 64, 70, enc);
418
+ test_token(frt_ts_next(ts), "ÚØÃ", 72, 78, enc);
419
+ test_token(frt_ts_next(ts), "ÖÎÍ", 80, 86, enc);
411
420
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
412
421
  frt_ts_deref(ts);
413
422
  frt_a_deref(a);
414
- a = frt_mb_letter_analyzer_new(true);
415
- ts = frt_a_get_ts(a, rb_intern("random"), text);
416
- test_token(frt_ts_next(ts), "dbalmän", 0, 8);
417
- test_token(frt_ts_next(ts), "gmail", 9, 14);
418
- test_token(frt_ts_next(ts), "com", 15, 18);
419
- test_token(frt_ts_next(ts), "is", 19, 21);
420
- test_token(frt_ts_next(ts), "my", 22, 24);
421
- test_token(frt_ts_next(ts), "e", 25, 26);
422
- test_token(frt_ts_next(ts), "mail", 27, 31);
423
- test_token(frt_ts_next(ts), "address", 40, 47);
424
- test_token(frt_ts_next(ts), "áägç", 55, 62);
425
- test_token(frt_ts_next(ts), "êëì", 64, 70);
426
- test_token(frt_ts_next(ts), "úøã", 72, 78);
427
- test_token(frt_ts_next(ts), "öîí", 80, 86);
423
+ a = frt_letter_analyzer_new(true);
424
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
425
+ test_token(frt_ts_next(ts), "dbalmän", 0, 8, enc);
426
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
427
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
428
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
429
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
430
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
431
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
432
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
433
+ test_token(frt_ts_next(ts), "áägç", 55, 62, enc);
434
+ test_token(frt_ts_next(ts), "êëì", 64, 70, enc);
435
+ test_token(frt_ts_next(ts), "úøã", 72, 78, enc);
436
+ test_token(frt_ts_next(ts), "öîí", 80, 86, enc);
428
437
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
429
438
  frt_a_deref(a);
430
439
  frt_ts_deref(ts);
@@ -445,21 +454,21 @@ static void do_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
445
454
  "DBalmain@gmail.com is My e-mail -52 #$ Address. 23#!$ "
446
455
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 "
447
456
  "underscored_word, won't we're";
448
-
449
- ts->reset(ts, text);
450
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
451
- test_token(frt_ts_next(ts), "is", 19, 21);
452
- test_token(frt_ts_next(ts), "My", 22, 24);
453
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
454
- test_token(frt_ts_next(ts), "-52", 32, 35);
455
- test_token(frt_ts_next(ts), "Address", 40, 47);
456
- test_token(frt_ts_next(ts), "23", 49, 51);
457
- test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
458
- test_token(frt_ts_next(ts), "TNT", 86, 91);
459
- test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110);
460
- test_token(frt_ts_next(ts), "underscored_word", 111, 127);
461
- test_token(frt_ts_next(ts), "won't", 129, 134);
462
- test_token(frt_ts_next(ts), "we're", 135, 140);
457
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
458
+ ts->reset(ts, text, enc);
459
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
460
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
461
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
462
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
463
+ test_token(frt_ts_next(ts), "-52", 32, 35, enc);
464
+ test_token(frt_ts_next(ts), "Address", 40, 47, enc);
465
+ test_token(frt_ts_next(ts), "23", 49, 51, enc);
466
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85, enc);
467
+ test_token(frt_ts_next(ts), "TNT", 86, 91, enc);
468
+ test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, enc);
469
+ test_token(frt_ts_next(ts), "underscored_word", 111, 127, enc);
470
+ test_token(frt_ts_next(ts), "won't", 129, 134, enc);
471
+ test_token(frt_ts_next(ts), "we're", 135, 140, enc);
463
472
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
464
473
  frt_tk_destroy(tk);
465
474
  FRT_REF(ts); /* test ref_cnt */
@@ -470,12 +479,12 @@ static void do_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
470
479
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
471
480
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
472
481
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
473
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
482
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", enc);
474
483
  test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
475
484
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
476
485
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
477
486
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
478
- "xxxxxxxxxxxxxxxxxxx", 0, 280);
487
+ "xxxxxxxxxxxxxxxxxxx", 0, 280, enc);
479
488
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
480
489
  }
481
490
 
@@ -489,7 +498,7 @@ static void test_standard_tokenizer(TestCase *tc, void *data)
489
498
 
490
499
  static void test_legacy_standard_tokenizer(TestCase *tc, void *data)
491
500
  {
492
- FrtTokenStream *ts = frt_legacy_standard_tokenizer_new();
501
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
493
502
  (void)data;
494
503
  do_standard_tokenizer(tc, ts);
495
504
  frt_ts_deref(ts);
@@ -502,44 +511,44 @@ static void do_mb_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
502
511
  "DBalmain@gmail.com is My e-mail -52 #$ Address. 23#!$ "
503
512
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 "
504
513
  "underscored_word, won't we're 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ "
505
- "\200 badchar it's groups' Barnes&Noble file:///home/user/ "
514
+ " badchar it's groups' Barnes&Noble file:///home/user/ "
506
515
  "svn://www.davebalmain.com/ www,.google.com www.google.com "
507
516
  "dave@balmain@gmail.com \"quoted string\" continue *star";
508
-
509
- ts->reset(ts, text);
510
- test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18);
511
- test_token(frt_ts_next(ts), "is", 19, 21);
512
- test_token(frt_ts_next(ts), "My", 22, 24);
513
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
514
- test_token(frt_ts_next(ts), "-52", 32, 35);
515
- test_token(frt_ts_next(ts), "Address", 40, 47);
516
- test_token(frt_ts_next(ts), "23", 49, 51);
517
- test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
518
- test_token(frt_ts_next(ts), "TNT", 86, 91);
519
- test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110);
520
- test_token(frt_ts_next(ts), "underscored_word", 111, 127);
521
- test_token(frt_ts_next(ts), "won't", 129, 134);
522
- test_token(frt_ts_next(ts), "we're", 135, 140);
523
- test_token(frt_ts_next(ts), "23", 141, 143);
524
- test_token(frt_ts_next(ts), "ÁÄGÇ", 147, 154);
525
- test_token(frt_ts_next(ts), "ÊËÌ", 156, 162);
526
- test_token(frt_ts_next(ts), "ÚØÃ", 164, 170);
527
- test_token(frt_ts_next(ts), "ÖÎÍ", 172, 178);
528
- test_token(frt_ts_next(ts), "badchar", 181, 188);
529
- test_token(frt_ts_next(ts), "it", 189, 193);
530
- test_token(frt_ts_next(ts), "groups", 194, 201);
531
- test_token(frt_ts_next(ts), "Barnes&Noble", 202, 214);
532
- test_token(frt_ts_next(ts), "home/user", 215, 233);
533
- test_token(frt_ts_next(ts), "svn://www.davebalmain.com", 234, 260);
534
- test_token(frt_ts_next(ts), "www", 261, 264);
535
- test_token(frt_ts_next(ts), "google.com", 266, 276);
536
- test_token(frt_ts_next(ts), "www.google.com", 277, 291);
537
- test_token(frt_ts_next(ts), "dave@balmain", 292, 304);
538
- test_token(frt_ts_next(ts), "gmail.com", 305, 314);
539
- test_token(frt_ts_next(ts), "quoted", 316, 322);
540
- test_token(frt_ts_next(ts), "string", 323, 329);
541
- test_token(frt_ts_next(ts), "continue", 331, 339);
542
- test_token(frt_ts_next(ts), "star", 341, 345);
517
+ rb_encoding *enc = utf8_encoding;
518
+ ts->reset(ts, text, enc);
519
+ test_token(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, enc);
520
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
521
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
522
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
523
+ test_token(frt_ts_next(ts), "-52", 32, 35, enc);
524
+ test_token(frt_ts_next(ts), "Address", 40, 47, enc);
525
+ test_token(frt_ts_next(ts), "23", 49, 51, enc);
526
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85, enc);
527
+ test_token(frt_ts_next(ts), "TNT", 86, 91, enc);
528
+ test_token(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, enc);
529
+ test_token(frt_ts_next(ts), "underscored_word", 111, 127, enc);
530
+ test_token(frt_ts_next(ts), "won't", 129, 134, enc);
531
+ test_token(frt_ts_next(ts), "we're", 135, 140, enc);
532
+ test_token(frt_ts_next(ts), "23", 141, 143, enc);
533
+ test_token(frt_ts_next(ts), "ÁÄGÇ", 147, 154, enc);
534
+ test_token(frt_ts_next(ts), "ÊËÌ", 156, 162, enc);
535
+ test_token(frt_ts_next(ts), "ÚØÃ", 164, 170, enc);
536
+ test_token(frt_ts_next(ts), "ÖÎÍ", 172, 178, enc);
537
+ test_token(frt_ts_next(ts), "badchar", 181, 188, enc);
538
+ test_token(frt_ts_next(ts), "it", 189, 193, enc);
539
+ test_token(frt_ts_next(ts), "groups", 194, 201, enc);
540
+ test_token(frt_ts_next(ts), "Barnes&Noble", 202, 214, enc);
541
+ test_token(frt_ts_next(ts), "home/user", 215, 233, enc);
542
+ test_token(frt_ts_next(ts), "svn://www.davebalmain.com", 234, 260, enc);
543
+ test_token(frt_ts_next(ts), "www", 261, 264, enc);
544
+ test_token(frt_ts_next(ts), "google.com", 266, 276, enc);
545
+ test_token(frt_ts_next(ts), "www.google.com", 277, 291, enc);
546
+ test_token(frt_ts_next(ts), "dave@balmain", 292, 304, enc);
547
+ test_token(frt_ts_next(ts), "gmail.com", 305, 314, enc);
548
+ test_token(frt_ts_next(ts), "quoted", 316, 322, enc);
549
+ test_token(frt_ts_next(ts), "string", 323, 329, enc);
550
+ test_token(frt_ts_next(ts), "continue", 331, 339, enc);
551
+ test_token(frt_ts_next(ts), "star", 341, 345, enc);
543
552
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
544
553
  frt_tk_destroy(tk);
545
554
  FRT_REF(ts); /* test ref_cnt */
@@ -550,29 +559,29 @@ static void do_mb_standard_tokenizer(TestCase *tc, FrtTokenStream *ts)
550
559
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
551
560
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
552
561
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
553
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
562
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", enc);
554
563
  test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
555
564
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
556
565
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
557
566
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
558
- "xxxxxxxxxxxxxxxxxxx", 0, 280);
567
+ "xxxxxxxxxxxxxxxxxxx", 0, 280, enc);
559
568
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
560
569
  ts->reset(ts, (char *)"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
561
570
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
562
571
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
563
572
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
564
573
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
565
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
574
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", enc);
566
575
  test_token(frt_ts_next(ts), "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
567
576
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
568
577
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
569
578
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
570
- "xxxxxxxxxxxxxxxxxxx", 0, 348);
579
+ "xxxxxxxxxxxxxxxxxxx", 0, 348, enc);
571
580
  }
572
581
 
573
582
  static void test_mb_standard_tokenizer(TestCase *tc, void *data)
574
583
  {
575
- FrtTokenStream *ts = frt_mb_standard_tokenizer_new();
584
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
576
585
  (void)data;
577
586
  do_mb_standard_tokenizer(tc, ts);
578
587
  frt_ts_deref(ts);
@@ -580,7 +589,7 @@ static void test_mb_standard_tokenizer(TestCase *tc, void *data)
580
589
 
581
590
  static void test_mb_legacy_standard_tokenizer(TestCase *tc, void *data)
582
591
  {
583
- FrtTokenStream *ts = frt_mb_legacy_standard_tokenizer_new();
592
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
584
593
  (void)data;
585
594
  do_mb_standard_tokenizer(tc, ts);
586
595
  frt_ts_deref(ts);
@@ -589,23 +598,24 @@ static void test_mb_legacy_standard_tokenizer(TestCase *tc, void *data)
589
598
  static void test_standard_analyzer(TestCase *tc, void *data)
590
599
  {
591
600
  FrtToken *tk = frt_tk_new();
592
- FrtAnalyzer *a = frt_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
601
+ FrtAnalyzer *a = frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
593
602
  char text[200] =
594
603
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
595
604
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
596
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
605
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
606
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
597
607
  (void)data;
598
608
 
599
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
600
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
601
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
602
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
603
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
604
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
605
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
606
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
607
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
608
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
609
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
610
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
611
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
612
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
613
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
614
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
615
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
616
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
617
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
618
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
609
619
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
610
620
  frt_tk_destroy(tk);
611
621
  frt_ts_deref(ts);
@@ -616,85 +626,86 @@ static void test_mb_standard_analyzer(TestCase *tc, void *data)
616
626
  {
617
627
  FrtToken *tk = frt_tk_new();
618
628
  FrtAnalyzer *a =
619
- frt_mb_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, false);
629
+ frt_standard_analyzer_new_with_words(false, FRT_ENGLISH_STOP_WORDS);
620
630
  const char *words[] = { "is", "the", "-23", "tnt", NULL };
621
631
  char text[200] =
622
632
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
623
633
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ "
624
634
  "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
625
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text), *ts2;
635
+ rb_encoding *enc = utf8_encoding;
636
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc), *ts2;
626
637
  (void)data;
627
638
 
628
- test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
629
- test_token_pi(frt_ts_next(ts), "My", 22, 24, 2);
630
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
631
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
632
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
633
- test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3);
634
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
635
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
636
- test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1);
637
- test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1);
638
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
639
- test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1);
640
- test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1);
641
- test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1);
642
- test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1);
639
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1, enc);
640
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 2, enc);
641
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
642
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
643
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
644
+ test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3, enc);
645
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
646
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
647
+ test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1, enc);
648
+ test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1, enc);
649
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
650
+ test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1, enc);
651
+ test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1, enc);
652
+ test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1, enc);
653
+ test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1, enc);
643
654
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
644
655
  frt_ts_deref(ts);
645
656
  frt_a_deref(a);
646
- a = frt_mb_standard_analyzer_new(true);
647
- ts = frt_a_get_ts(a, rb_intern("random"), text);
648
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
649
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
650
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
651
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
652
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
653
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
654
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
655
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
656
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
657
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
658
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
659
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
660
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
661
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
657
+ a = frt_standard_analyzer_new(true);
658
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
659
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
660
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3, enc);
661
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
662
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
663
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
664
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
665
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
666
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
667
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
668
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
669
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
670
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
671
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
672
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
662
673
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
663
674
  frt_ts_deref(ts);
664
675
  frt_a_deref(a);
665
- a = frt_mb_standard_analyzer_new_with_words(words, true);
666
- ts = frt_a_get_ts(a, rb_intern("random"), text);
667
- ts2 = frt_a_get_ts(a, rb_intern("random"), text);
668
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
669
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
670
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
671
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
672
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
673
- test_token_pi(frt_ts_next(ts), "and", 32, 35, 1);
674
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 2);
675
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2);
676
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2);
677
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
678
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
679
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
680
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
681
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
676
+ a = frt_standard_analyzer_new_with_words(true, words);
677
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
678
+ ts2 = frt_a_get_ts(a, rb_intern("random"), text, enc);
679
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
680
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
681
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
682
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
683
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
684
+ test_token_pi(frt_ts_next(ts), "and", 32, 35, 1, enc);
685
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 2, enc);
686
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2, enc);
687
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2, enc);
688
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
689
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
690
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
691
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
692
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
682
693
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
683
694
  frt_ts_deref(ts);
684
- test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1);
685
- test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2);
686
- test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1);
687
- test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0);
688
- test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1);
689
- test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1);
690
- test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2);
691
- test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2);
692
- test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2);
693
- test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1);
694
- test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1);
695
- test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1);
696
- test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1);
697
- test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1);
695
+ test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1, enc);
696
+ test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2, enc);
697
+ test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1, enc);
698
+ test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0, enc);
699
+ test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1, enc);
700
+ test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1, enc);
701
+ test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2, enc);
702
+ test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2, enc);
703
+ test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2, enc);
704
+ test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1, enc);
705
+ test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1, enc);
706
+ test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1, enc);
707
+ test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1, enc);
708
+ test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1, enc);
698
709
  Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
699
710
  ts2->ref_cnt = 3;
700
711
  ts = frt_ts_clone(ts2);
@@ -714,23 +725,24 @@ static void test_legacy_standard_analyzer(TestCase *tc, void *data)
714
725
  {
715
726
  FrtToken *tk = frt_tk_new();
716
727
  FrtAnalyzer *a =
717
- frt_legacy_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
728
+ frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
718
729
  char text[200] =
719
730
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
720
731
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
721
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
732
+ rb_encoding *enc = rb_enc_find("ASCII-8BIT");
733
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
722
734
  (void)data;
723
735
 
724
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
725
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
726
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
727
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
728
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
729
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
730
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
731
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
732
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
733
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
736
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
737
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
738
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
739
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
740
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
741
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
742
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
743
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
744
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
745
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
734
746
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
735
747
  frt_tk_destroy(tk);
736
748
  frt_ts_deref(ts);
@@ -741,85 +753,86 @@ static void test_mb_legacy_standard_analyzer(TestCase *tc, void *data)
741
753
  {
742
754
  FrtToken *tk = frt_tk_new();
743
755
  FrtAnalyzer *a =
744
- frt_mb_legacy_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, false);
756
+ frt_standard_analyzer_new_with_words(false, FRT_ENGLISH_STOP_WORDS);
745
757
  const char *words[] = { "is", "the", "-23", "tnt", NULL };
746
758
  char text[200] =
747
759
  "DBalmain@gmail.com is My e-mail and the Address. -23!$ "
748
760
  "http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ "
749
761
  "ÁÄGÇ®ÊË̯ÚØìÖÎÍ";
750
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text), *ts2;
762
+ rb_encoding *enc = utf8_encoding;
763
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc), *ts2;
751
764
  (void)data;
752
765
 
753
- test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
754
- test_token_pi(frt_ts_next(ts), "My", 22, 24, 2);
755
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
756
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
757
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
758
- test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3);
759
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
760
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
761
- test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1);
762
- test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1);
763
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
764
- test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1);
765
- test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1);
766
- test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1);
767
- test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1);
766
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1, enc);
767
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 2, enc);
768
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
769
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
770
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
771
+ test_token_pi(frt_ts_next(ts), "Address", 40, 47, 3, enc);
772
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
773
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
774
+ test_token_pi(frt_ts_next(ts), "TNT", 86, 91, 1, enc);
775
+ test_token_pi(frt_ts_next(ts), "123-1235-ASD-1234", 93, 110, 1, enc);
776
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
777
+ test_token_pi(frt_ts_next(ts), "ÁÄGÇ", 117, 124, 1, enc);
778
+ test_token_pi(frt_ts_next(ts), "ÊËÌ", 126, 132, 1, enc);
779
+ test_token_pi(frt_ts_next(ts), "ÚØÃ", 134, 140, 1, enc);
780
+ test_token_pi(frt_ts_next(ts), "ÖÎÍ", 142, 148, 1, enc);
768
781
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
769
782
  frt_ts_deref(ts);
770
783
  frt_a_deref(a);
771
- a = frt_mb_legacy_standard_analyzer_new(true);
772
- ts = frt_a_get_ts(a, rb_intern("random"), text);
773
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
774
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
775
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
776
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
777
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 3);
778
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
779
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
780
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
781
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
782
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
783
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
784
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
785
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
786
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
784
+ a = frt_standard_analyzer_new(true);
785
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
786
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
787
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3, enc);
788
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
789
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
790
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 3, enc);
791
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
792
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
793
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
794
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
795
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
796
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
797
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
798
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
799
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
787
800
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
788
801
  frt_ts_deref(ts);
789
802
  frt_a_deref(a);
790
- a = frt_mb_legacy_standard_analyzer_new_with_words(words, true);
791
- ts = frt_a_get_ts(a, rb_intern("random"), text);
792
- ts2 = frt_a_get_ts(a, rb_intern("random"), text);
793
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
794
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 2);
795
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
796
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
797
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
798
- test_token_pi(frt_ts_next(ts), "and", 32, 35, 1);
799
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 2);
800
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2);
801
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2);
802
- test_token_pi(frt_ts_next(ts), "23", 111, 113, 1);
803
- test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1);
804
- test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1);
805
- test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1);
806
- test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1);
803
+ a = frt_standard_analyzer_new_with_words(true, words);
804
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
805
+ ts2 = frt_a_get_ts(a, rb_intern("random"), text, enc);
806
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
807
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 2, enc);
808
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
809
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
810
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
811
+ test_token_pi(frt_ts_next(ts), "and", 32, 35, 1, enc);
812
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 2, enc);
813
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 2, enc);
814
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 2, enc);
815
+ test_token_pi(frt_ts_next(ts), "23", 111, 113, 1, enc);
816
+ test_token_pi(frt_ts_next(ts), "áägç", 117, 124, 1, enc);
817
+ test_token_pi(frt_ts_next(ts), "êëì", 126, 132, 1, enc);
818
+ test_token_pi(frt_ts_next(ts), "úøã", 134, 140, 1, enc);
819
+ test_token_pi(frt_ts_next(ts), "öîí", 142, 148, 1, enc);
807
820
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
808
821
  frt_ts_deref(ts);
809
- test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1);
810
- test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2);
811
- test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1);
812
- test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0);
813
- test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1);
814
- test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1);
815
- test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2);
816
- test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2);
817
- test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2);
818
- test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1);
819
- test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1);
820
- test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1);
821
- test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1);
822
- test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1);
822
+ test_token_pi(frt_ts_next(ts2), "dbalmain@gmail.com", 0, 18, 1, enc);
823
+ test_token_pi(frt_ts_next(ts2), "my", 22, 24, 2, enc);
824
+ test_token_pi(frt_ts_next(ts2), "email", 25, 31, 1, enc);
825
+ test_token_pi(frt_ts_next(ts2), "e", 25, 26, 0, enc);
826
+ test_token_pi(frt_ts_next(ts2), "mail", 27, 31, 1, enc);
827
+ test_token_pi(frt_ts_next(ts2), "and", 32, 35, 1, enc);
828
+ test_token_pi(frt_ts_next(ts2), "address", 40, 47, 2, enc);
829
+ test_token_pi(frt_ts_next(ts2), "www.google.com/results", 55, 85, 2, enc);
830
+ test_token_pi(frt_ts_next(ts2), "123-1235-asd-1234", 93, 110, 2, enc);
831
+ test_token_pi(frt_ts_next(ts2), "23", 111, 113, 1, enc);
832
+ test_token_pi(frt_ts_next(ts2), "áägç", 117, 124, 1, enc);
833
+ test_token_pi(frt_ts_next(ts2), "êëì", 126, 132, 1, enc);
834
+ test_token_pi(frt_ts_next(ts2), "úøã", 134, 140, 1, enc);
835
+ test_token_pi(frt_ts_next(ts2), "öîí", 142, 148, 1, enc);
823
836
  Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
824
837
  ts2->ref_cnt = 3;
825
838
  ts = frt_ts_clone(ts2);
@@ -838,25 +851,32 @@ static void test_mb_legacy_standard_analyzer(TestCase *tc, void *data)
838
851
  static void test_long_word(TestCase *tc, void *data)
839
852
  {
840
853
  FrtToken *tk = frt_tk_new();
841
- FrtAnalyzer *a = frt_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
854
+ FrtAnalyzer *a = frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
842
855
  char text[400] =
843
856
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
844
857
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
845
858
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
846
859
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
847
- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" " two";
848
- FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text);
860
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx two";
861
+ char text_a[400] =
862
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
863
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
864
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
865
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
866
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
867
+ rb_encoding *enc = utf8_encoding;
868
+ FrtTokenStream *ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
849
869
  (void)data;
850
870
 
851
- test_token_pi(frt_ts_next(ts), text, 0, 290, 1); /* text gets truncated anyway */
852
- test_token_pi(frt_ts_next(ts), "two", 291, 294, 1);
871
+ test_token_pi(frt_ts_next(ts), text_a, 0, 290, 1, enc);
872
+ test_token_pi(frt_ts_next(ts), "two", 291, 294, 1, enc);
853
873
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
854
874
  frt_ts_deref(ts);
855
875
  frt_a_deref(a);
856
- a = frt_mb_standard_analyzer_new_with_words(FRT_ENGLISH_STOP_WORDS, true);
857
- ts = frt_a_get_ts(a, rb_intern("random"), text);
858
- test_token_pi(frt_ts_next(ts), text, 0, 290, 1); /* text gets truncated anyway */
859
- test_token_pi(frt_ts_next(ts), "two", 291, 294, 1);
876
+ a = frt_standard_analyzer_new_with_words(true, FRT_ENGLISH_STOP_WORDS);
877
+ ts = frt_a_get_ts(a, rb_intern("random"), text, enc);
878
+ test_token_pi(frt_ts_next(ts), text_a, 0, 290, 1, enc);
879
+ test_token_pi(frt_ts_next(ts), "two", 291, 294, 1, enc);
860
880
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
861
881
  frt_ts_deref(ts);
862
882
  frt_a_deref(a);
@@ -872,22 +892,23 @@ static void test_long_word(TestCase *tc, void *data)
872
892
  static void test_lowercase_filter(TestCase *tc, void *data)
873
893
  {
874
894
  FrtToken *tk = frt_tk_new();
875
- FrtTokenStream *ts = frt_lowercase_filter_new(frt_standard_tokenizer_new());
895
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
896
+ ts = frt_lowercase_filter_new(ts);
876
897
  char text[200] =
877
898
  "DBalmain@gmail.com is My e-mail 52 #$ Address. -23!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234";
878
899
  (void)data;
879
-
880
- ts->reset(ts, text);
881
- test_token(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18);
882
- test_token(frt_ts_next(ts), "is", 19, 21);
883
- test_token(frt_ts_next(ts), "my", 22, 24);
884
- test_token(frt_ts_next(ts), "e-mail", 25, 31);
885
- test_token(frt_ts_next(ts), "52", 32, 34);
886
- test_token(frt_ts_next(ts), "address", 40, 47);
887
- test_token(frt_ts_next(ts), "-23", 49, 52);
888
- test_token(frt_ts_next(ts), "www.google.com/results", 55, 85);
889
- test_token(frt_ts_next(ts), "tnt", 86, 91);
890
- test_token(frt_ts_next(ts), "123-1235-asd-1234", 93, 110);
900
+ rb_encoding *enc = utf8_encoding;
901
+ ts->reset(ts, text, enc);
902
+ test_token(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, enc);
903
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
904
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
905
+ test_token(frt_ts_next(ts), "e-mail", 25, 31, enc);
906
+ test_token(frt_ts_next(ts), "52", 32, 34, enc);
907
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
908
+ test_token(frt_ts_next(ts), "-23", 49, 52, enc);
909
+ test_token(frt_ts_next(ts), "www.google.com/results", 55, 85, enc);
910
+ test_token(frt_ts_next(ts), "tnt", 86, 91, enc);
911
+ test_token(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, enc);
891
912
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
892
913
  frt_tk_destroy(tk);
893
914
  FRT_REF(ts);
@@ -900,31 +921,33 @@ static void test_lowercase_filter(TestCase *tc, void *data)
900
921
  static void test_hyphen_filter(TestCase *tc, void *data)
901
922
  {
902
923
  FrtToken *tk = frt_tk_new();
903
- FrtTokenStream *ts = frt_hyphen_filter_new(frt_lowercase_filter_new(frt_standard_tokenizer_new()));
924
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
925
+ ts = frt_lowercase_filter_new(ts);
926
+ ts = frt_hyphen_filter_new(ts);
904
927
  char text[200] =
905
928
  "DBalmain@gmail.com is My e-mail 52 #$ Address. -23!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 long-hyph-en-at-ed-word";
906
929
  (void)data;
907
-
908
- ts->reset(ts, text);
909
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
910
- test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
911
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 1);
912
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 1);
913
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
914
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
915
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
916
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 1);
917
- test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1);
918
- test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1);
919
- test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1);
920
- test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1);
921
- test_token_pi(frt_ts_next(ts), "longhyphenatedword", 111, 134, 1);
922
- test_token_pi(frt_ts_next(ts), "long", 111, 115, 0);
923
- test_token_pi(frt_ts_next(ts), "hyph", 116, 120, 1);
924
- test_token_pi(frt_ts_next(ts), "en", 121, 123, 1);
925
- test_token_pi(frt_ts_next(ts), "at", 124, 126, 1);
926
- test_token_pi(frt_ts_next(ts), "ed", 127, 129, 1);
927
- test_token_pi(frt_ts_next(ts), "word", 130, 134, 1);
930
+ rb_encoding *enc = utf8_encoding;
931
+ ts->reset(ts, text, enc);
932
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
933
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1, enc);
934
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 1, enc);
935
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 1, enc);
936
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
937
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
938
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
939
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 1, enc);
940
+ test_token_pi(frt_ts_next(ts), "-23", 49, 52, 1, enc);
941
+ test_token_pi(frt_ts_next(ts), "www.google.com/results", 55, 85, 1, enc);
942
+ test_token_pi(frt_ts_next(ts), "tnt", 86, 91, 1, enc);
943
+ test_token_pi(frt_ts_next(ts), "123-1235-asd-1234", 93, 110, 1, enc);
944
+ test_token_pi(frt_ts_next(ts), "longhyphenatedword", 111, 134, 1, enc);
945
+ test_token_pi(frt_ts_next(ts), "long", 111, 115, 0, enc);
946
+ test_token_pi(frt_ts_next(ts), "hyph", 116, 120, 1, enc);
947
+ test_token_pi(frt_ts_next(ts), "en", 121, 123, 1, enc);
948
+ test_token_pi(frt_ts_next(ts), "at", 124, 126, 1, enc);
949
+ test_token_pi(frt_ts_next(ts), "ed", 127, 129, 1, enc);
950
+ test_token_pi(frt_ts_next(ts), "word", 130, 134, 1, enc);
928
951
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
929
952
  frt_tk_destroy(tk);
930
953
  FRT_REF(ts);
@@ -943,14 +966,15 @@ static void test_stop_filter(TestCase *tc, void *data)
943
966
  char text[200] =
944
967
  "one, two, three, four, five, six, seven, eight, nine, ten.";
945
968
  (void)data;
946
-
947
- ts->reset(ts, text);
948
- test_token_pi(frt_ts_next(ts), "two", 5, 8, 2);
949
- test_token_pi(frt_ts_next(ts), "three", 10, 15, 1);
950
- test_token_pi(frt_ts_next(ts), "six", 29, 32, 3);
951
- test_token_pi(frt_ts_next(ts), "eight", 41, 46, 2);
952
- test_token_pi(frt_ts_next(ts), "nine", 48, 52, 1);
953
- test_token_pi(frt_ts_next(ts), "ten", 54, 57, 1);
969
+ rb_encoding *enc = utf8_encoding;
970
+
971
+ ts->reset(ts, text, enc);
972
+ test_token_pi(frt_ts_next(ts), "two", 5, 8, 2, enc);
973
+ test_token_pi(frt_ts_next(ts), "three", 10, 15, 1, enc);
974
+ test_token_pi(frt_ts_next(ts), "six", 29, 32, 3, enc);
975
+ test_token_pi(frt_ts_next(ts), "eight", 41, 46, 2, enc);
976
+ test_token_pi(frt_ts_next(ts), "nine", 48, 52, 1, enc);
977
+ test_token_pi(frt_ts_next(ts), "ten", 54, 57, 1, enc);
954
978
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
955
979
  frt_tk_destroy(tk);
956
980
  FRT_REF(ts);
@@ -974,36 +998,37 @@ static void test_mapping_filter(TestCase *tc, void *data)
974
998
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
975
999
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
976
1000
  (void)data;
1001
+ rb_encoding *enc = utf8_encoding;
977
1002
 
978
1003
  frt_mapping_filter_add(ts, "ne", "hello");
979
1004
  frt_mapping_filter_add(ts, "four", long_word);
980
1005
 
981
- ts->reset(ts, text);
982
- test_token(frt_ts_next(ts), "ohello", 0, 3);
983
- test_token(frt_ts_next(ts), "two", 5, 8);
984
- test_token(frt_ts_next(ts), "three", 10, 15);
985
- test_token(frt_ts_next(ts), long_word, 17, 21);
986
- test_token(frt_ts_next(ts), "five", 23, 27);
987
- test_token(frt_ts_next(ts), "six", 29, 32);
988
- test_token(frt_ts_next(ts), "seven", 34, 39);
989
- test_token(frt_ts_next(ts), "eight", 41, 46);
990
- test_token(frt_ts_next(ts), "nihello", 48, 52);
991
- test_token(frt_ts_next(ts), "ten", 54, 57);
1006
+ ts->reset(ts, text, enc);
1007
+ test_token(frt_ts_next(ts), "ohello", 0, 3, enc);
1008
+ test_token(frt_ts_next(ts), "two", 5, 8, enc);
1009
+ test_token(frt_ts_next(ts), "three", 10, 15, enc);
1010
+ test_token(frt_ts_next(ts), long_word, 17, 21, enc);
1011
+ test_token(frt_ts_next(ts), "five", 23, 27, enc);
1012
+ test_token(frt_ts_next(ts), "six", 29, 32, enc);
1013
+ test_token(frt_ts_next(ts), "seven", 34, 39, enc);
1014
+ test_token(frt_ts_next(ts), "eight", 41, 46, enc);
1015
+ test_token(frt_ts_next(ts), "nihello", 48, 52, enc);
1016
+ test_token(frt_ts_next(ts), "ten", 54, 57, enc);
992
1017
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
993
1018
 
994
1019
  frt_mapping_filter_add(ts, "thr", "start");
995
1020
  frt_mapping_filter_add(ts, "en", "goodbye");
996
- ts->reset(ts, text);
997
- test_token(frt_ts_next(ts), "ohello", 0, 3);
998
- test_token(frt_ts_next(ts), "two", 5, 8);
999
- test_token(frt_ts_next(ts), "startee", 10, 15);
1000
- test_token(frt_ts_next(ts), long_word, 17, 21);
1001
- test_token(frt_ts_next(ts), "five", 23, 27);
1002
- test_token(frt_ts_next(ts), "six", 29, 32);
1003
- test_token(frt_ts_next(ts), "sevgoodbye", 34, 39);
1004
- test_token(frt_ts_next(ts), "eight", 41, 46);
1005
- test_token(frt_ts_next(ts), "nihello", 48, 52);
1006
- test_token(frt_ts_next(ts), "tgoodbye", 54, 57);
1021
+ ts->reset(ts, text, enc);
1022
+ test_token(frt_ts_next(ts), "ohello", 0, 3, enc);
1023
+ test_token(frt_ts_next(ts), "two", 5, 8, enc);
1024
+ test_token(frt_ts_next(ts), "startee", 10, 15, enc);
1025
+ test_token(frt_ts_next(ts), long_word, 17, 21, enc);
1026
+ test_token(frt_ts_next(ts), "five", 23, 27, enc);
1027
+ test_token(frt_ts_next(ts), "six", 29, 32, enc);
1028
+ test_token(frt_ts_next(ts), "sevgoodbye", 34, 39, enc);
1029
+ test_token(frt_ts_next(ts), "eight", 41, 46, enc);
1030
+ test_token(frt_ts_next(ts), "nihello", 48, 52, enc);
1031
+ test_token(frt_ts_next(ts), "tgoodbye", 54, 57, enc);
1007
1032
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1008
1033
  frt_tk_destroy(tk);
1009
1034
  FRT_REF(ts);
@@ -1036,38 +1061,40 @@ static void test_stemmer(TestCase *tc, void *data)
1036
1061
  static void test_stem_filter(TestCase *tc, void *data)
1037
1062
  {
1038
1063
  FrtToken *tk = frt_tk_new();
1039
- FrtTokenStream *ts = frt_stem_filter_new(frt_mb_letter_tokenizer_new(true),
1040
- "english", NULL);
1064
+ FrtTokenStream *ts = frt_letter_tokenizer_new();
1065
+ ts = frt_lowercase_filter_new(ts);
1066
+ ts = frt_stem_filter_new(ts, "english");
1041
1067
  FrtTokenStream *ts2;
1042
1068
  char text[200] = "debate debates debated debating debater";
1043
1069
  char text2[200] = "dêbate dêbates dêbated dêbating dêbater";
1044
1070
  (void)data;
1045
1071
 
1046
- ts->reset(ts, text);
1072
+ rb_encoding *enc = utf8_encoding;
1073
+ ts->reset(ts, text, enc);
1047
1074
  ts2 = frt_ts_clone(ts);
1048
- test_token(frt_ts_next(ts), "debat", 0, 6);
1049
- test_token(frt_ts_next(ts), "debat", 7, 14);
1050
- test_token(frt_ts_next(ts), "debat", 15, 22);
1051
- test_token(frt_ts_next(ts), "debat", 23, 31);
1052
- test_token(frt_ts_next(ts), "debat", 32, 39);
1075
+ test_token(frt_ts_next(ts), "debat", 0, 6, enc);
1076
+ test_token(frt_ts_next(ts), "debat", 7, 14, enc);
1077
+ test_token(frt_ts_next(ts), "debat", 15, 22, enc);
1078
+ test_token(frt_ts_next(ts), "debat", 23, 31, enc);
1079
+ test_token(frt_ts_next(ts), "debat", 32, 39, enc);
1053
1080
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1054
- ts->reset(ts, text2);
1055
- test_token(frt_ts_next(ts), "dêbate", 0, 7);
1056
- test_token(frt_ts_next(ts), "dêbate", 8, 16);
1057
- test_token(frt_ts_next(ts), "dêbate", 17, 25);
1058
- test_token(frt_ts_next(ts), "dêbate", 26, 35);
1059
- test_token(frt_ts_next(ts), "dêbater", 36, 44);
1081
+ ts->reset(ts, text2, enc);
1082
+ test_token(frt_ts_next(ts), "dêbate", 0, 7, enc);
1083
+ test_token(frt_ts_next(ts), "dêbate", 8, 16, enc);
1084
+ test_token(frt_ts_next(ts), "dêbate", 17, 25, enc);
1085
+ test_token(frt_ts_next(ts), "dêbate", 26, 35, enc);
1086
+ test_token(frt_ts_next(ts), "dêbater", 36, 44, enc);
1060
1087
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1061
1088
  FRT_REF(ts);
1062
1089
  Aiequal(2, ts->ref_cnt);
1063
1090
  frt_ts_deref(ts);
1064
1091
  Aiequal(1, ts->ref_cnt);
1065
1092
  frt_ts_deref(ts);
1066
- test_token(frt_ts_next(ts2), "debat", 0, 6);
1067
- test_token(frt_ts_next(ts2), "debat", 7, 14);
1068
- test_token(frt_ts_next(ts2), "debat", 15, 22);
1069
- test_token(frt_ts_next(ts2), "debat", 23, 31);
1070
- test_token(frt_ts_next(ts2), "debat", 32, 39);
1093
+ test_token(frt_ts_next(ts2), "debat", 0, 6, enc);
1094
+ test_token(frt_ts_next(ts2), "debat", 7, 14, enc);
1095
+ test_token(frt_ts_next(ts2), "debat", 15, 22, enc);
1096
+ test_token(frt_ts_next(ts2), "debat", 23, 31, enc);
1097
+ test_token(frt_ts_next(ts2), "debat", 32, 39, enc);
1071
1098
  Assert(frt_ts_next(ts2) == NULL, "Should be no more tokens");
1072
1099
  frt_tk_destroy(tk);
1073
1100
  frt_ts_deref(ts2);
@@ -1080,64 +1107,65 @@ static void test_per_field_analyzer(TestCase *tc, void *data)
1080
1107
  char text[100] = "DBalmain@gmail.com is My E-mail 52 #$ address. 23#!$";
1081
1108
  FrtAnalyzer *pfa = frt_per_field_analyzer_new(frt_standard_analyzer_new(true));
1082
1109
  (void)data;
1110
+ rb_encoding *enc = utf8_encoding;
1083
1111
 
1084
1112
  frt_pfa_add_field(pfa, rb_intern("white"), frt_whitespace_analyzer_new(false));
1085
1113
  frt_pfa_add_field(pfa, rb_intern("white_l"), frt_whitespace_analyzer_new(true));
1086
1114
  frt_pfa_add_field(pfa, rb_intern("letter"), frt_letter_analyzer_new(false));
1087
1115
  frt_pfa_add_field(pfa, rb_intern("letter"), frt_letter_analyzer_new(true));
1088
1116
  frt_pfa_add_field(pfa, rb_intern("letter_u"), frt_letter_analyzer_new(false));
1089
- ts = frt_a_get_ts(pfa, rb_intern("white"), text);
1090
- test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1);
1091
- test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
1092
- test_token_pi(frt_ts_next(ts), "My", 22, 24, 1);
1093
- test_token_pi(frt_ts_next(ts), "E-mail", 25, 31, 1);
1094
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1095
- test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1);
1096
- test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1);
1097
- test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1);
1117
+ ts = frt_a_get_ts(pfa, rb_intern("white"), text, enc);
1118
+ test_token_pi(frt_ts_next(ts), "DBalmain@gmail.com", 0, 18, 1, enc);
1119
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1, enc);
1120
+ test_token_pi(frt_ts_next(ts), "My", 22, 24, 1, enc);
1121
+ test_token_pi(frt_ts_next(ts), "E-mail", 25, 31, 1, enc);
1122
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
1123
+ test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1, enc);
1124
+ test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1, enc);
1125
+ test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1, enc);
1098
1126
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1099
1127
  frt_ts_deref(ts);
1100
- ts = frt_a_get_ts(pfa, rb_intern("white_l"), text);
1101
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
1102
- test_token_pi(frt_ts_next(ts), "is", 19, 21, 1);
1103
- test_token_pi(frt_ts_next(ts), "my", 22, 24, 1);
1104
- test_token_pi(frt_ts_next(ts), "e-mail", 25, 31, 1);
1105
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1106
- test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1);
1107
- test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1);
1108
- test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1);
1128
+ ts = frt_a_get_ts(pfa, rb_intern("white_l"), text, enc);
1129
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
1130
+ test_token_pi(frt_ts_next(ts), "is", 19, 21, 1, enc);
1131
+ test_token_pi(frt_ts_next(ts), "my", 22, 24, 1, enc);
1132
+ test_token_pi(frt_ts_next(ts), "e-mail", 25, 31, 1, enc);
1133
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
1134
+ test_token_pi(frt_ts_next(ts), "#$", 37, 39, 1, enc);
1135
+ test_token_pi(frt_ts_next(ts), "address.", 40, 48, 1, enc);
1136
+ test_token_pi(frt_ts_next(ts), "23#!$", 49, 54, 1, enc);
1109
1137
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1110
1138
  frt_ts_deref(ts);
1111
- ts = frt_a_get_ts(pfa, rb_intern("letter_u"), text);
1112
- test_token(frt_ts_next(ts), "DBalmain", 0, 8);
1113
- test_token(frt_ts_next(ts), "gmail", 9, 14);
1114
- test_token(frt_ts_next(ts), "com", 15, 18);
1115
- test_token(frt_ts_next(ts), "is", 19, 21);
1116
- test_token(frt_ts_next(ts), "My", 22, 24);
1117
- test_token(frt_ts_next(ts), "E", 25, 26);
1118
- test_token(frt_ts_next(ts), "mail", 27, 31);
1119
- test_token(frt_ts_next(ts), "address", 40, 47);
1139
+ ts = frt_a_get_ts(pfa, rb_intern("letter_u"), text, enc);
1140
+ test_token(frt_ts_next(ts), "DBalmain", 0, 8, enc);
1141
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
1142
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
1143
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
1144
+ test_token(frt_ts_next(ts), "My", 22, 24, enc);
1145
+ test_token(frt_ts_next(ts), "E", 25, 26, enc);
1146
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
1147
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
1120
1148
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1121
1149
  frt_ts_deref(ts);
1122
- ts = frt_a_get_ts(pfa, rb_intern("letter"), text);
1123
- test_token(frt_ts_next(ts), "dbalmain", 0, 8);
1124
- test_token(frt_ts_next(ts), "gmail", 9, 14);
1125
- test_token(frt_ts_next(ts), "com", 15, 18);
1126
- test_token(frt_ts_next(ts), "is", 19, 21);
1127
- test_token(frt_ts_next(ts), "my", 22, 24);
1128
- test_token(frt_ts_next(ts), "e", 25, 26);
1129
- test_token(frt_ts_next(ts), "mail", 27, 31);
1130
- test_token(frt_ts_next(ts), "address", 40, 47);
1150
+ ts = frt_a_get_ts(pfa, rb_intern("letter"), text, enc);
1151
+ test_token(frt_ts_next(ts), "dbalmain", 0, 8, enc);
1152
+ test_token(frt_ts_next(ts), "gmail", 9, 14, enc);
1153
+ test_token(frt_ts_next(ts), "com", 15, 18, enc);
1154
+ test_token(frt_ts_next(ts), "is", 19, 21, enc);
1155
+ test_token(frt_ts_next(ts), "my", 22, 24, enc);
1156
+ test_token(frt_ts_next(ts), "e", 25, 26, enc);
1157
+ test_token(frt_ts_next(ts), "mail", 27, 31, enc);
1158
+ test_token(frt_ts_next(ts), "address", 40, 47, enc);
1131
1159
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1132
1160
  frt_ts_deref(ts);
1133
- ts = frt_a_get_ts(pfa, rb_intern("XXX"), text); /* should use default analyzer */
1134
- test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1);
1135
- test_token_pi(frt_ts_next(ts), "email", 25, 31, 3);
1136
- test_token_pi(frt_ts_next(ts), "e", 25, 26, 0);
1137
- test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1);
1138
- test_token_pi(frt_ts_next(ts), "52", 32, 34, 1);
1139
- test_token_pi(frt_ts_next(ts), "address", 40, 47, 1);
1140
- test_token_pi(frt_ts_next(ts), "23", 49, 51, 1);
1161
+ ts = frt_a_get_ts(pfa, rb_intern("XXX"), text, enc); /* should use default analyzer */
1162
+ test_token_pi(frt_ts_next(ts), "dbalmain@gmail.com", 0, 18, 1, enc);
1163
+ test_token_pi(frt_ts_next(ts), "email", 25, 31, 3, enc);
1164
+ test_token_pi(frt_ts_next(ts), "e", 25, 26, 0, enc);
1165
+ test_token_pi(frt_ts_next(ts), "mail", 27, 31, 1, enc);
1166
+ test_token_pi(frt_ts_next(ts), "52", 32, 34, 1, enc);
1167
+ test_token_pi(frt_ts_next(ts), "address", 40, 47, 1, enc);
1168
+ test_token_pi(frt_ts_next(ts), "23", 49, 51, 1, enc);
1141
1169
  Assert(frt_ts_next(ts) == NULL, "Should be no more tokens");
1142
1170
  frt_tk_destroy(tk);
1143
1171
  frt_ts_deref(ts);
@@ -1146,11 +1174,6 @@ static void test_per_field_analyzer(TestCase *tc, void *data)
1146
1174
 
1147
1175
  TestSuite *ts_analysis(TestSuite *suite)
1148
1176
  {
1149
- bool u = false;
1150
- char *original_locale = setlocale(LC_ALL, NULL);
1151
- char *locale = setlocale(LC_ALL, "");
1152
- if (locale && (strstr(locale, "utf") || strstr(locale, "UTF"))) u = true;
1153
-
1154
1177
  suite = ADD_SUITE(suite);
1155
1178
 
1156
1179
  tst_run_test(suite, test_tk, NULL);
@@ -1161,45 +1184,31 @@ TestSuite *ts_analysis(TestSuite *suite)
1161
1184
 
1162
1185
  /* Whitespace */
1163
1186
  tst_run_test(suite, test_whitespace_tokenizer, NULL);
1164
- if (u) {
1165
- tst_run_test(suite, test_mb_whitespace_tokenizer, NULL);
1166
- }
1187
+ tst_run_test(suite, test_mb_whitespace_tokenizer, NULL);
1167
1188
 
1168
1189
  tst_run_test(suite, test_whitespace_analyzer, NULL);
1169
- if (u) {
1170
- tst_run_test(suite, test_mb_whitespace_analyzer, NULL);
1171
- }
1190
+ tst_run_test(suite, test_mb_whitespace_analyzer, NULL);
1172
1191
 
1173
1192
  /* Letter */
1174
1193
  tst_run_test(suite, test_letter_tokenizer, NULL);
1175
- if (u) {
1176
- tst_run_test(suite, test_mb_letter_tokenizer, NULL);
1177
- }
1194
+ tst_run_test(suite, test_mb_letter_tokenizer, NULL);
1178
1195
 
1179
1196
  tst_run_test(suite, test_letter_analyzer, NULL);
1180
- if (u) {
1181
- tst_run_test(suite, test_mb_letter_analyzer, NULL);
1182
- }
1197
+ tst_run_test(suite, test_mb_letter_analyzer, NULL);
1183
1198
 
1184
1199
  /* Standard */
1185
1200
  tst_run_test(suite, test_standard_tokenizer, NULL);
1186
- if (u) {
1187
- tst_run_test(suite, test_mb_standard_tokenizer, NULL);
1188
- }
1201
+ tst_run_test(suite, test_mb_standard_tokenizer, NULL);
1202
+
1189
1203
  tst_run_test(suite, test_standard_analyzer, NULL);
1190
- if (u) {
1191
- tst_run_test(suite, test_mb_standard_analyzer, NULL);
1192
- }
1204
+ tst_run_test(suite, test_mb_standard_analyzer, NULL);
1193
1205
 
1194
1206
  /* LegacyStandard */
1195
1207
  tst_run_test(suite, test_legacy_standard_tokenizer, NULL);
1196
- if (u) {
1197
- tst_run_test(suite, test_mb_legacy_standard_tokenizer, NULL);
1198
- }
1208
+ tst_run_test(suite, test_mb_legacy_standard_tokenizer, NULL);
1209
+
1199
1210
  tst_run_test(suite, test_legacy_standard_analyzer, NULL);
1200
- if (u) {
1201
- tst_run_test(suite, test_mb_legacy_standard_analyzer, NULL);
1202
- }
1211
+ tst_run_test(suite, test_mb_legacy_standard_analyzer, NULL);
1203
1212
 
1204
1213
  tst_run_test(suite, test_long_word, NULL);
1205
1214
 
@@ -1211,11 +1220,9 @@ TestSuite *ts_analysis(TestSuite *suite)
1211
1220
  tst_run_test(suite, test_hyphen_filter, NULL);
1212
1221
  tst_run_test(suite, test_stop_filter, NULL);
1213
1222
  tst_run_test(suite, test_mapping_filter, NULL);
1223
+ tst_run_test(suite, test_stem_filter, NULL);
1224
+
1214
1225
  tst_run_test(suite, test_stemmer, NULL);
1215
- if (u) {
1216
- tst_run_test(suite, test_stem_filter, NULL);
1217
- }
1218
1226
 
1219
- setlocale(LC_ALL, original_locale);
1220
1227
  return suite;
1221
1228
  }