isomorfeus-ferret 0.12.6 → 0.13.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (249) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +85 -16
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_constants.c +15 -0
  9. data/ext/isomorfeus_ferret_ext/brotli_common_constants.h +200 -0
  10. data/ext/isomorfeus_ferret_ext/brotli_common_context.c +156 -0
  11. data/ext/isomorfeus_ferret_ext/brotli_common_context.h +113 -0
  12. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +5914 -0
  13. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.h +64 -0
  14. data/ext/isomorfeus_ferret_ext/brotli_common_platform.c +22 -0
  15. data/ext/isomorfeus_ferret_ext/brotli_common_platform.h +594 -0
  16. data/ext/isomorfeus_ferret_ext/brotli_common_transform.c +291 -0
  17. data/ext/isomorfeus_ferret_ext/brotli_common_transform.h +85 -0
  18. data/ext/isomorfeus_ferret_ext/brotli_common_version.h +26 -0
  19. data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.c +76 -0
  20. data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.h +351 -0
  21. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +2608 -0
  22. data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.c +339 -0
  23. data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.h +121 -0
  24. data/ext/isomorfeus_ferret_ext/brotli_dec_prefix.h +732 -0
  25. data/ext/isomorfeus_ferret_ext/brotli_dec_state.c +159 -0
  26. data/ext/isomorfeus_ferret_ext/brotli_dec_state.h +365 -0
  27. data/ext/isomorfeus_ferret_ext/brotli_decode.h +344 -0
  28. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.c +145 -0
  29. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.h +39 -0
  30. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.c +843 -0
  31. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.h +95 -0
  32. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_inc.h +163 -0
  33. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.c +35 -0
  34. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.h +63 -0
  35. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost_inc.h +127 -0
  36. data/ext/isomorfeus_ferret_ext/brotli_enc_block_encoder_inc.h +34 -0
  37. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.c +194 -0
  38. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.h +51 -0
  39. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter_inc.h +440 -0
  40. data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.c +1314 -0
  41. data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.h +84 -0
  42. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.c +56 -0
  43. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.h +48 -0
  44. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster_inc.h +320 -0
  45. data/ext/isomorfeus_ferret_ext/brotli_enc_command.c +28 -0
  46. data/ext/isomorfeus_ferret_ext/brotli_enc_command.h +190 -0
  47. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.c +790 -0
  48. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.h +61 -0
  49. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.c +645 -0
  50. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.h +54 -0
  51. data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.c +1846 -0
  52. data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.h +25 -0
  53. data/ext/isomorfeus_ferret_ext/brotli_enc_encode.c +1927 -0
  54. data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.c +33 -0
  55. data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.h +43 -0
  56. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.c +503 -0
  57. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.h +122 -0
  58. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode_static.h +539 -0
  59. data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.c +105 -0
  60. data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.h +66 -0
  61. data/ext/isomorfeus_ferret_ext/brotli_enc_find_match_length.h +79 -0
  62. data/ext/isomorfeus_ferret_ext/brotli_enc_hash.h +488 -0
  63. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_composite_inc.h +125 -0
  64. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_forgetful_chain_inc.h +293 -0
  65. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match64_inc.h +267 -0
  66. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_inc.h +262 -0
  67. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_quickly_inc.h +266 -0
  68. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_rolling_inc.h +212 -0
  69. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_to_binary_tree_inc.h +329 -0
  70. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.c +100 -0
  71. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.h +63 -0
  72. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram_inc.h +51 -0
  73. data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.c +175 -0
  74. data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.h +30 -0
  75. data/ext/isomorfeus_ferret_ext/brotli_enc_memory.c +170 -0
  76. data/ext/isomorfeus_ferret_ext/brotli_enc_memory.h +114 -0
  77. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.c +663 -0
  78. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.h +105 -0
  79. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock_inc.h +183 -0
  80. data/ext/isomorfeus_ferret_ext/brotli_enc_params.h +46 -0
  81. data/ext/isomorfeus_ferret_ext/brotli_enc_prefix.h +53 -0
  82. data/ext/isomorfeus_ferret_ext/brotli_enc_quality.h +165 -0
  83. data/ext/isomorfeus_ferret_ext/brotli_enc_ringbuffer.h +167 -0
  84. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.c +486 -0
  85. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.h +40 -0
  86. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict_lut.h +5864 -0
  87. data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.c +85 -0
  88. data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.h +32 -0
  89. data/ext/isomorfeus_ferret_ext/brotli_enc_write_bits.h +87 -0
  90. data/ext/isomorfeus_ferret_ext/brotli_encode.h +448 -0
  91. data/ext/isomorfeus_ferret_ext/brotli_port.h +288 -0
  92. data/ext/isomorfeus_ferret_ext/brotli_types.h +83 -0
  93. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  94. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  95. data/ext/isomorfeus_ferret_ext/bzlib_blocksort.c +1094 -0
  96. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  97. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  98. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  99. data/ext/isomorfeus_ferret_ext/bzlib_huffman.c +205 -0
  100. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  101. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  102. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  103. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  104. data/ext/isomorfeus_ferret_ext/frb_index.c +513 -464
  105. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  106. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  107. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  108. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  109. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  110. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  111. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  112. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  113. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  114. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  115. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  116. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -9
  117. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  118. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  119. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  120. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  121. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +2 -0
  122. data/ext/isomorfeus_ferret_ext/frt_global.c +91 -200
  123. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -18
  124. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  125. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  126. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  127. data/ext/isomorfeus_ferret_ext/frt_index.c +714 -384
  128. data/ext/isomorfeus_ferret_ext/frt_index.h +274 -290
  129. data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
  130. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  131. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  132. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +68 -91
  133. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  134. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  135. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  136. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  137. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  138. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  139. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  140. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  141. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  142. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  143. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  144. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  145. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +46 -84
  146. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  147. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  148. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  149. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  150. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  151. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  152. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  153. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  154. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +22 -112
  155. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  156. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  157. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  158. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  159. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  160. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  161. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  162. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  163. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  164. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  165. data/ext/isomorfeus_ferret_ext/test.c +0 -17
  166. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  167. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  168. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  169. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  170. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  171. data/ext/isomorfeus_ferret_ext/test_fields.c +111 -100
  172. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  173. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  174. data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
  175. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  176. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  177. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  178. data/ext/isomorfeus_ferret_ext/test_index.c +373 -363
  179. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  180. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  181. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  182. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  183. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  184. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  185. data/ext/isomorfeus_ferret_ext/test_search.c +60 -64
  186. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  187. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  188. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  189. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  190. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  191. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  192. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  193. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  194. data/lib/isomorfeus/ferret/version.rb +1 -1
  195. metadata +113 -58
  196. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  197. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  198. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  199. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  200. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  201. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  202. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  203. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  204. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  205. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  206. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  207. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  208. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  209. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  210. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  211. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  212. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  213. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  214. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  215. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  216. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  217. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  218. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  219. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  220. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  221. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  222. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  223. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  224. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  225. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  226. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  227. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  228. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  229. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  230. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  231. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  232. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  233. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  234. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  235. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  236. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  237. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  238. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  239. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  240. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  241. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  242. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  243. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  244. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  245. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  246. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  247. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  248. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  249. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -155,12 +155,15 @@
155
155
 
156
156
  #include <string.h>
157
157
  #include <ctype.h>
158
- #include <wctype.h>
159
158
  #include <assert.h>
160
159
  #include "frt_global.h"
161
160
  #include "frt_except.h"
162
161
  #include "frt_search.h"
163
162
  #include "frt_array.h"
163
+ #include <ruby/encoding.h>
164
+
165
+ extern rb_encoding *utf8_encoding;
166
+ extern int utf8_mbmaxlen;
164
167
 
165
168
  typedef struct Phrase {
166
169
  int size;
@@ -180,7 +183,7 @@ float frt_qp_default_fuzzy_min_sim = 0.5;
180
183
  int frt_qp_default_fuzzy_pre_len = 0;
181
184
 
182
185
 
183
- #line 184 "frt_q_parser.c"
186
+ #line 187 "frt_q_parser.c"
184
187
 
185
188
  # ifndef YY_CAST
186
189
  # ifdef __cplusplus
@@ -237,7 +240,7 @@ extern int yydebug;
237
240
  #if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
238
241
  union YYSTYPE
239
242
  {
240
- #line 113 "frt_q_parser.y"
243
+ #line 116 "frt_q_parser.y"
241
244
 
242
245
  FrtQuery *query;
243
246
  FrtBooleanClause *bcls;
@@ -246,7 +249,7 @@ union YYSTYPE
246
249
  Phrase *phrase;
247
250
  char *str;
248
251
 
249
- #line 250 "frt_q_parser.c"
252
+ #line 253 "frt_q_parser.c"
250
253
 
251
254
  };
252
255
  typedef union YYSTYPE YYSTYPE;
@@ -257,7 +260,7 @@ typedef union YYSTYPE YYSTYPE;
257
260
 
258
261
 
259
262
 
260
- int yyparse (FrtQParser *qp);
263
+ int yyparse (FrtQParser *qp, rb_encoding *encoding);
261
264
 
262
265
 
263
266
 
@@ -312,10 +315,10 @@ typedef enum yysymbol_kind_t yysymbol_kind_t;
312
315
 
313
316
 
314
317
  /* Second part of user prologue. */
315
- #line 121 "frt_q_parser.y"
318
+ #line 124 "frt_q_parser.y"
316
319
 
317
320
  static int yylex(YYSTYPE *lvalp, FrtQParser *qp);
318
- static int yyerror(FrtQParser *qp, char const *msg);
321
+ static int yyerror(FrtQParser *qp, rb_encoding *encoding, char const *msg);
319
322
 
320
323
  #define PHRASE_INIT_CAPA 4
321
324
  static FrtQuery *get_bool_q(FrtBCArray *bca);
@@ -323,29 +326,26 @@ static FrtQuery *get_bool_q(FrtBCArray *bca);
323
326
  static FrtBCArray *first_cls(FrtBooleanClause *boolean_clause);
324
327
  static FrtBCArray *add_and_cls(FrtBCArray *bca, FrtBooleanClause *clause);
325
328
  static FrtBCArray *add_or_cls(FrtBCArray *bca, FrtBooleanClause *clause);
326
- static FrtBCArray *add_default_cls(FrtQParser *qp, FrtBCArray *bca,
327
- FrtBooleanClause *clause);
329
+ static FrtBCArray *add_default_cls(FrtQParser *qp, FrtBCArray *bca, FrtBooleanClause *clause);
328
330
  static void bca_destroy(FrtBCArray *bca);
329
331
 
330
332
  static FrtBooleanClause *get_bool_cls(FrtQuery *q, FrtBCType occur);
331
333
 
332
- static FrtQuery *get_term_q(FrtQParser *qp, FrtSymbol field, char *word);
333
- static FrtQuery *get_fuzzy_q(FrtQParser *qp, FrtSymbol field, char *word,
334
- char *slop);
335
- static FrtQuery *get_wild_q(FrtQParser *qp, FrtSymbol field, char *pattern);
334
+ static FrtQuery *get_term_q(FrtQParser *qp, ID field, char *word, rb_encoding *encoding);
335
+ static FrtQuery *get_fuzzy_q(FrtQParser *qp, ID field, char *word, char *slop, rb_encoding *encoding);
336
+ static FrtQuery *get_wild_q(FrtQParser *qp, ID field, char *pattern, rb_encoding *encoding);
336
337
 
337
338
  static FrtHashSet *first_field(FrtQParser *qp, const char *field_name);
338
339
  static FrtHashSet *add_field(FrtQParser *qp, const char *field_name);
339
340
 
340
- static FrtQuery *get_phrase_q(FrtQParser *qp, Phrase *phrase, char *slop);
341
+ static FrtQuery *get_phrase_q(FrtQParser *qp, Phrase *phrase, char *slop, rb_encoding *encoding);
341
342
 
342
343
  static Phrase *ph_first_word(char *word);
343
344
  static Phrase *ph_add_word(Phrase *self, char *word);
344
345
  static Phrase *ph_add_multi_word(Phrase *self, char *word);
345
346
  static void ph_destroy(Phrase *self);
346
347
 
347
- static FrtQuery *get_r_q(FrtQParser *qp, FrtSymbol field, char *from, char *to,
348
- bool inc_lower, bool inc_upper);
348
+ static FrtQuery *get_r_q(FrtQParser *qp, ID field, char *from, char *to, bool inc_lower, bool inc_upper, rb_encoding *encoding);
349
349
 
350
350
  static void qp_push_fields(FrtQParser *self, FrtHashSet *fields, bool destroy);
351
351
  static void qp_pop_fields(FrtQParser *self);
@@ -359,17 +359,17 @@ static void qp_pop_fields(FrtQParser *self);
359
359
  */
360
360
  #define FLDS(q, func) do {\
361
361
  FRT_TRY {\
362
- FrtSymbol field;\
362
+ ID field;\
363
363
  if (qp->fields->size == 0) {\
364
364
  q = NULL;\
365
365
  } else if (qp->fields->size == 1) {\
366
- field = (FrtSymbol)qp->fields->first->elem;\
366
+ field = (ID)qp->fields->first->elem;\
367
367
  q = func;\
368
368
  } else {\
369
369
  FrtQuery *volatile sq; FrtHashSetEntry *volatile hse;\
370
370
  q = frt_bq_new_max(false, qp->max_clauses);\
371
371
  for (hse = qp->fields->first; hse; hse = hse->next) {\
372
- field = (FrtSymbol)hse->elem;\
372
+ field = (ID)hse->elem;\
373
373
  sq = func;\
374
374
  FRT_TRY\
375
375
  if (sq) frt_bq_add_query_nr(q, sq, FRT_BC_SHOULD);\
@@ -969,7 +969,7 @@ enum { YYENOMEM = -2 };
969
969
  } \
970
970
  else \
971
971
  { \
972
- yyerror (qp, YY_("syntax error: cannot back up")); \
972
+ yyerror (qp, encoding, YY_("syntax error: cannot back up")); \
973
973
  YYERROR; \
974
974
  } \
975
975
  while (0)
@@ -1002,7 +1002,7 @@ do { \
1002
1002
  { \
1003
1003
  YYFPRINTF (stderr, "%s ", Title); \
1004
1004
  yy_symbol_print (stderr, \
1005
- Kind, Value, qp); \
1005
+ Kind, Value, qp, encoding); \
1006
1006
  YYFPRINTF (stderr, "\n"); \
1007
1007
  } \
1008
1008
  } while (0)
@@ -1014,11 +1014,12 @@ do { \
1014
1014
 
1015
1015
  static void
1016
1016
  yy_symbol_value_print (FILE *yyo,
1017
- yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep, FrtQParser *qp)
1017
+ yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep, FrtQParser *qp, rb_encoding *encoding)
1018
1018
  {
1019
1019
  FILE *yyoutput = yyo;
1020
1020
  YY_USE (yyoutput);
1021
1021
  YY_USE (qp);
1022
+ YY_USE (encoding);
1022
1023
  if (!yyvaluep)
1023
1024
  return;
1024
1025
  YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
@@ -1033,12 +1034,12 @@ yy_symbol_value_print (FILE *yyo,
1033
1034
 
1034
1035
  static void
1035
1036
  yy_symbol_print (FILE *yyo,
1036
- yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep, FrtQParser *qp)
1037
+ yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep, FrtQParser *qp, rb_encoding *encoding)
1037
1038
  {
1038
1039
  YYFPRINTF (yyo, "%s %s (",
1039
1040
  yykind < YYNTOKENS ? "token" : "nterm", yysymbol_name (yykind));
1040
1041
 
1041
- yy_symbol_value_print (yyo, yykind, yyvaluep, qp);
1042
+ yy_symbol_value_print (yyo, yykind, yyvaluep, qp, encoding);
1042
1043
  YYFPRINTF (yyo, ")");
1043
1044
  }
1044
1045
 
@@ -1072,7 +1073,7 @@ do { \
1072
1073
 
1073
1074
  static void
1074
1075
  yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp,
1075
- int yyrule, FrtQParser *qp)
1076
+ int yyrule, FrtQParser *qp, rb_encoding *encoding)
1076
1077
  {
1077
1078
  int yylno = yyrline[yyrule];
1078
1079
  int yynrhs = yyr2[yyrule];
@@ -1085,7 +1086,7 @@ yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp,
1085
1086
  YYFPRINTF (stderr, " $%d = ", yyi + 1);
1086
1087
  yy_symbol_print (stderr,
1087
1088
  YY_ACCESSING_SYMBOL (+yyssp[yyi + 1 - yynrhs]),
1088
- &yyvsp[(yyi + 1) - (yynrhs)], qp);
1089
+ &yyvsp[(yyi + 1) - (yynrhs)], qp, encoding);
1089
1090
  YYFPRINTF (stderr, "\n");
1090
1091
  }
1091
1092
  }
@@ -1093,7 +1094,7 @@ yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp,
1093
1094
  # define YY_REDUCE_PRINT(Rule) \
1094
1095
  do { \
1095
1096
  if (yydebug) \
1096
- yy_reduce_print (yyssp, yyvsp, Rule, qp); \
1097
+ yy_reduce_print (yyssp, yyvsp, Rule, qp, encoding); \
1097
1098
  } while (0)
1098
1099
 
1099
1100
  /* Nonzero means print parse trace. It is left uninitialized so that
@@ -1134,10 +1135,11 @@ int yydebug;
1134
1135
 
1135
1136
  static void
1136
1137
  yydestruct (const char *yymsg,
1137
- yysymbol_kind_t yykind, YYSTYPE *yyvaluep, FrtQParser *qp)
1138
+ yysymbol_kind_t yykind, YYSTYPE *yyvaluep, FrtQParser *qp, rb_encoding *encoding)
1138
1139
  {
1139
1140
  YY_USE (yyvaluep);
1140
1141
  YY_USE (qp);
1142
+ YY_USE (encoding);
1141
1143
  if (!yymsg)
1142
1144
  yymsg = "Deleting";
1143
1145
  YY_SYMBOL_PRINT (yymsg, yykind, yyvaluep, yylocationp);
@@ -1148,67 +1150,67 @@ yydestruct (const char *yymsg,
1148
1150
  case YYSYMBOL_bool_q: /* bool_q */
1149
1151
  #line 221 "frt_q_parser.y"
1150
1152
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1151
- #line 1152 "frt_q_parser.c"
1153
+ #line 1154 "frt_q_parser.c"
1152
1154
  break;
1153
1155
 
1154
1156
  case YYSYMBOL_bool_clss: /* bool_clss */
1155
1157
  #line 223 "frt_q_parser.y"
1156
1158
  { if (((*yyvaluep).bclss) && qp->destruct) bca_destroy(((*yyvaluep).bclss)); }
1157
- #line 1158 "frt_q_parser.c"
1159
+ #line 1160 "frt_q_parser.c"
1158
1160
  break;
1159
1161
 
1160
1162
  case YYSYMBOL_bool_cls: /* bool_cls */
1161
1163
  #line 222 "frt_q_parser.y"
1162
1164
  { if (((*yyvaluep).bcls) && qp->destruct) frt_bc_deref(((*yyvaluep).bcls)); }
1163
- #line 1164 "frt_q_parser.c"
1165
+ #line 1166 "frt_q_parser.c"
1164
1166
  break;
1165
1167
 
1166
1168
  case YYSYMBOL_boosted_q: /* boosted_q */
1167
1169
  #line 221 "frt_q_parser.y"
1168
1170
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1169
- #line 1170 "frt_q_parser.c"
1171
+ #line 1172 "frt_q_parser.c"
1170
1172
  break;
1171
1173
 
1172
1174
  case YYSYMBOL_q: /* q */
1173
1175
  #line 221 "frt_q_parser.y"
1174
1176
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1175
- #line 1176 "frt_q_parser.c"
1177
+ #line 1178 "frt_q_parser.c"
1176
1178
  break;
1177
1179
 
1178
1180
  case YYSYMBOL_term_q: /* term_q */
1179
1181
  #line 221 "frt_q_parser.y"
1180
1182
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1181
- #line 1182 "frt_q_parser.c"
1183
+ #line 1184 "frt_q_parser.c"
1182
1184
  break;
1183
1185
 
1184
1186
  case YYSYMBOL_wild_q: /* wild_q */
1185
1187
  #line 221 "frt_q_parser.y"
1186
1188
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1187
- #line 1188 "frt_q_parser.c"
1189
+ #line 1190 "frt_q_parser.c"
1188
1190
  break;
1189
1191
 
1190
1192
  case YYSYMBOL_field_q: /* field_q */
1191
1193
  #line 221 "frt_q_parser.y"
1192
1194
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1193
- #line 1194 "frt_q_parser.c"
1195
+ #line 1196 "frt_q_parser.c"
1194
1196
  break;
1195
1197
 
1196
1198
  case YYSYMBOL_phrase_q: /* phrase_q */
1197
1199
  #line 221 "frt_q_parser.y"
1198
1200
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1199
- #line 1200 "frt_q_parser.c"
1201
+ #line 1202 "frt_q_parser.c"
1200
1202
  break;
1201
1203
 
1202
1204
  case YYSYMBOL_ph_words: /* ph_words */
1203
1205
  #line 224 "frt_q_parser.y"
1204
1206
  { if (((*yyvaluep).phrase) && qp->destruct) ph_destroy(((*yyvaluep).phrase)); }
1205
- #line 1206 "frt_q_parser.c"
1207
+ #line 1208 "frt_q_parser.c"
1206
1208
  break;
1207
1209
 
1208
1210
  case YYSYMBOL_range_q: /* range_q */
1209
1211
  #line 221 "frt_q_parser.y"
1210
1212
  { if (((*yyvaluep).query) && qp->destruct) frt_q_deref(((*yyvaluep).query)); }
1211
- #line 1212 "frt_q_parser.c"
1213
+ #line 1214 "frt_q_parser.c"
1212
1214
  break;
1213
1215
 
1214
1216
  default:
@@ -1227,7 +1229,7 @@ yydestruct (const char *yymsg,
1227
1229
  `----------*/
1228
1230
 
1229
1231
  int
1230
- yyparse (FrtQParser *qp)
1232
+ yyparse (FrtQParser *qp, rb_encoding *encoding)
1231
1233
  {
1232
1234
  /* Lookahead token kind. */
1233
1235
  int yychar;
@@ -1484,269 +1486,269 @@ yyreduce:
1484
1486
  case 2: /* bool_q: %empty */
1485
1487
  #line 226 "frt_q_parser.y"
1486
1488
  { qp->result = (yyval.query) = NULL; }
1487
- #line 1488 "frt_q_parser.c"
1489
+ #line 1490 "frt_q_parser.c"
1488
1490
  break;
1489
1491
 
1490
1492
  case 3: /* bool_q: bool_clss */
1491
1493
  #line 227 "frt_q_parser.y"
1492
1494
  { T qp->result = (yyval.query) = get_bool_q((yyvsp[0].bclss)); E }
1493
- #line 1494 "frt_q_parser.c"
1495
+ #line 1496 "frt_q_parser.c"
1494
1496
  break;
1495
1497
 
1496
1498
  case 4: /* bool_clss: bool_cls */
1497
1499
  #line 229 "frt_q_parser.y"
1498
1500
  { T (yyval.bclss) = first_cls((yyvsp[0].bcls)); E }
1499
- #line 1500 "frt_q_parser.c"
1501
+ #line 1502 "frt_q_parser.c"
1500
1502
  break;
1501
1503
 
1502
1504
  case 5: /* bool_clss: bool_clss AND bool_cls */
1503
1505
  #line 230 "frt_q_parser.y"
1504
1506
  { T (yyval.bclss) = add_and_cls((yyvsp[-2].bclss), (yyvsp[0].bcls)); E }
1505
- #line 1506 "frt_q_parser.c"
1507
+ #line 1508 "frt_q_parser.c"
1506
1508
  break;
1507
1509
 
1508
1510
  case 6: /* bool_clss: bool_clss OR bool_cls */
1509
1511
  #line 231 "frt_q_parser.y"
1510
1512
  { T (yyval.bclss) = add_or_cls((yyvsp[-2].bclss), (yyvsp[0].bcls)); E }
1511
- #line 1512 "frt_q_parser.c"
1513
+ #line 1514 "frt_q_parser.c"
1512
1514
  break;
1513
1515
 
1514
1516
  case 7: /* bool_clss: bool_clss bool_cls */
1515
1517
  #line 232 "frt_q_parser.y"
1516
1518
  { T (yyval.bclss) = add_default_cls(qp, (yyvsp[-1].bclss), (yyvsp[0].bcls)); E }
1517
- #line 1518 "frt_q_parser.c"
1519
+ #line 1520 "frt_q_parser.c"
1518
1520
  break;
1519
1521
 
1520
1522
  case 8: /* bool_cls: REQ boosted_q */
1521
1523
  #line 234 "frt_q_parser.y"
1522
1524
  { T (yyval.bcls) = get_bool_cls((yyvsp[0].query), FRT_BC_MUST); E }
1523
- #line 1524 "frt_q_parser.c"
1525
+ #line 1526 "frt_q_parser.c"
1524
1526
  break;
1525
1527
 
1526
1528
  case 9: /* bool_cls: NOT boosted_q */
1527
1529
  #line 235 "frt_q_parser.y"
1528
1530
  { T (yyval.bcls) = get_bool_cls((yyvsp[0].query), FRT_BC_MUST_NOT); E }
1529
- #line 1530 "frt_q_parser.c"
1531
+ #line 1532 "frt_q_parser.c"
1530
1532
  break;
1531
1533
 
1532
1534
  case 10: /* bool_cls: boosted_q */
1533
1535
  #line 236 "frt_q_parser.y"
1534
1536
  { T (yyval.bcls) = get_bool_cls((yyvsp[0].query), FRT_BC_SHOULD); E }
1535
- #line 1536 "frt_q_parser.c"
1537
+ #line 1538 "frt_q_parser.c"
1536
1538
  break;
1537
1539
 
1538
1540
  case 12: /* boosted_q: q '^' QWRD */
1539
1541
  #line 239 "frt_q_parser.y"
1540
1542
  { T if ((yyvsp[-2].query)) sscanf((yyvsp[0].str),"%f",&((yyvsp[-2].query)->boost)); (yyval.query)=(yyvsp[-2].query); E }
1541
- #line 1542 "frt_q_parser.c"
1543
+ #line 1544 "frt_q_parser.c"
1542
1544
  break;
1543
1545
 
1544
1546
  case 14: /* q: '(' ')' */
1545
1547
  #line 242 "frt_q_parser.y"
1546
1548
  { T (yyval.query) = frt_bq_new_max(true, qp->max_clauses); E }
1547
- #line 1548 "frt_q_parser.c"
1549
+ #line 1550 "frt_q_parser.c"
1548
1550
  break;
1549
1551
 
1550
1552
  case 15: /* q: '(' bool_clss ')' */
1551
1553
  #line 243 "frt_q_parser.y"
1552
1554
  { T (yyval.query) = get_bool_q((yyvsp[-1].bclss)); E }
1553
- #line 1554 "frt_q_parser.c"
1555
+ #line 1556 "frt_q_parser.c"
1554
1556
  break;
1555
1557
 
1556
1558
  case 20: /* term_q: QWRD */
1557
1559
  #line 249 "frt_q_parser.y"
1558
- { FLDS((yyval.query), get_term_q(qp, field, (yyvsp[0].str))); Y}
1559
- #line 1560 "frt_q_parser.c"
1560
+ { FLDS((yyval.query), get_term_q(qp, field, (yyvsp[0].str), encoding)); Y}
1561
+ #line 1562 "frt_q_parser.c"
1560
1562
  break;
1561
1563
 
1562
1564
  case 21: /* term_q: QWRD '~' QWRD */
1563
1565
  #line 250 "frt_q_parser.y"
1564
- { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[-2].str), (yyvsp[0].str))); Y}
1565
- #line 1566 "frt_q_parser.c"
1566
+ { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[-2].str), (yyvsp[0].str), encoding)); Y}
1567
+ #line 1568 "frt_q_parser.c"
1566
1568
  break;
1567
1569
 
1568
1570
  case 22: /* term_q: QWRD '~' */
1569
1571
  #line 251 "frt_q_parser.y"
1570
- { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[-1].str), NULL)); Y}
1571
- #line 1572 "frt_q_parser.c"
1572
+ { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[-1].str), NULL, encoding)); Y}
1573
+ #line 1574 "frt_q_parser.c"
1572
1574
  break;
1573
1575
 
1574
1576
  case 23: /* wild_q: WILD_STR */
1575
1577
  #line 253 "frt_q_parser.y"
1576
- { FLDS((yyval.query), get_wild_q(qp, field, (yyvsp[0].str))); Y}
1577
- #line 1578 "frt_q_parser.c"
1578
+ { FLDS((yyval.query), get_wild_q(qp, field, (yyvsp[0].str), encoding)); Y}
1579
+ #line 1580 "frt_q_parser.c"
1578
1580
  break;
1579
1581
 
1580
1582
  case 24: /* $@1: %empty */
1581
1583
  #line 255 "frt_q_parser.y"
1582
1584
  { qp_pop_fields(qp); }
1583
- #line 1584 "frt_q_parser.c"
1585
+ #line 1586 "frt_q_parser.c"
1584
1586
  break;
1585
1587
 
1586
1588
  case 25: /* field_q: field ':' q $@1 */
1587
1589
  #line 256 "frt_q_parser.y"
1588
1590
  { (yyval.query) = (yyvsp[-1].query); }
1589
- #line 1590 "frt_q_parser.c"
1591
+ #line 1592 "frt_q_parser.c"
1590
1592
  break;
1591
1593
 
1592
1594
  case 26: /* $@2: %empty */
1593
1595
  #line 257 "frt_q_parser.y"
1594
1596
  { qp_push_fields(qp, qp->all_fields, false); }
1595
- #line 1596 "frt_q_parser.c"
1597
+ #line 1598 "frt_q_parser.c"
1596
1598
  break;
1597
1599
 
1598
1600
  case 27: /* $@3: %empty */
1599
1601
  #line 257 "frt_q_parser.y"
1600
1602
  { qp_pop_fields(qp); }
1601
- #line 1602 "frt_q_parser.c"
1603
+ #line 1604 "frt_q_parser.c"
1602
1604
  break;
1603
1605
 
1604
1606
  case 28: /* field_q: '*' $@2 ':' q $@3 */
1605
1607
  #line 258 "frt_q_parser.y"
1606
1608
  { (yyval.query) = (yyvsp[-1].query); }
1607
- #line 1608 "frt_q_parser.c"
1609
+ #line 1610 "frt_q_parser.c"
1608
1610
  break;
1609
1611
 
1610
1612
  case 29: /* field: QWRD */
1611
1613
  #line 260 "frt_q_parser.y"
1612
1614
  { (yyval.hashset) = first_field(qp, (yyvsp[0].str)); }
1613
- #line 1614 "frt_q_parser.c"
1615
+ #line 1616 "frt_q_parser.c"
1614
1616
  break;
1615
1617
 
1616
1618
  case 30: /* field: field '|' QWRD */
1617
1619
  #line 261 "frt_q_parser.y"
1618
1620
  { (yyval.hashset) = add_field(qp, (yyvsp[0].str));}
1619
- #line 1620 "frt_q_parser.c"
1621
+ #line 1622 "frt_q_parser.c"
1620
1622
  break;
1621
1623
 
1622
1624
  case 31: /* phrase_q: '"' ph_words '"' */
1623
1625
  #line 263 "frt_q_parser.y"
1624
- { (yyval.query) = get_phrase_q(qp, (yyvsp[-1].phrase), NULL); }
1625
- #line 1626 "frt_q_parser.c"
1626
+ { (yyval.query) = get_phrase_q(qp, (yyvsp[-1].phrase), NULL, encoding); }
1627
+ #line 1628 "frt_q_parser.c"
1626
1628
  break;
1627
1629
 
1628
1630
  case 32: /* phrase_q: '"' ph_words '"' '~' QWRD */
1629
1631
  #line 264 "frt_q_parser.y"
1630
- { (yyval.query) = get_phrase_q(qp, (yyvsp[-3].phrase), (yyvsp[0].str)); }
1631
- #line 1632 "frt_q_parser.c"
1632
+ { (yyval.query) = get_phrase_q(qp, (yyvsp[-3].phrase), (yyvsp[0].str), encoding); }
1633
+ #line 1634 "frt_q_parser.c"
1632
1634
  break;
1633
1635
 
1634
1636
  case 33: /* phrase_q: '"' '"' */
1635
1637
  #line 265 "frt_q_parser.y"
1636
1638
  { (yyval.query) = NULL; }
1637
- #line 1638 "frt_q_parser.c"
1639
+ #line 1640 "frt_q_parser.c"
1638
1640
  break;
1639
1641
 
1640
1642
  case 34: /* phrase_q: '"' '"' '~' QWRD */
1641
1643
  #line 266 "frt_q_parser.y"
1642
1644
  { (yyval.query) = NULL; (void)(yyvsp[0].str);}
1643
- #line 1644 "frt_q_parser.c"
1645
+ #line 1646 "frt_q_parser.c"
1644
1646
  break;
1645
1647
 
1646
1648
  case 35: /* ph_words: QWRD */
1647
1649
  #line 268 "frt_q_parser.y"
1648
1650
  { (yyval.phrase) = ph_first_word((yyvsp[0].str)); }
1649
- #line 1650 "frt_q_parser.c"
1651
+ #line 1652 "frt_q_parser.c"
1650
1652
  break;
1651
1653
 
1652
1654
  case 36: /* ph_words: '<' '>' */
1653
1655
  #line 269 "frt_q_parser.y"
1654
1656
  { (yyval.phrase) = ph_first_word(NULL); }
1655
- #line 1656 "frt_q_parser.c"
1657
+ #line 1658 "frt_q_parser.c"
1656
1658
  break;
1657
1659
 
1658
1660
  case 37: /* ph_words: ph_words QWRD */
1659
1661
  #line 270 "frt_q_parser.y"
1660
1662
  { (yyval.phrase) = ph_add_word((yyvsp[-1].phrase), (yyvsp[0].str)); }
1661
- #line 1662 "frt_q_parser.c"
1663
+ #line 1664 "frt_q_parser.c"
1662
1664
  break;
1663
1665
 
1664
1666
  case 38: /* ph_words: ph_words '<' '>' */
1665
1667
  #line 271 "frt_q_parser.y"
1666
1668
  { (yyval.phrase) = ph_add_word((yyvsp[-2].phrase), NULL); }
1667
- #line 1668 "frt_q_parser.c"
1669
+ #line 1670 "frt_q_parser.c"
1668
1670
  break;
1669
1671
 
1670
1672
  case 39: /* ph_words: ph_words '|' QWRD */
1671
1673
  #line 272 "frt_q_parser.y"
1672
1674
  { (yyval.phrase) = ph_add_multi_word((yyvsp[-2].phrase), (yyvsp[0].str)); }
1673
- #line 1674 "frt_q_parser.c"
1675
+ #line 1676 "frt_q_parser.c"
1674
1676
  break;
1675
1677
 
1676
1678
  case 40: /* range_q: '[' QWRD QWRD ']' */
1677
1679
  #line 274 "frt_q_parser.y"
1678
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), true, true)); Y}
1679
- #line 1680 "frt_q_parser.c"
1680
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), true, true, encoding)); Y}
1681
+ #line 1682 "frt_q_parser.c"
1680
1682
  break;
1681
1683
 
1682
1684
  case 41: /* range_q: '[' QWRD QWRD '}' */
1683
1685
  #line 275 "frt_q_parser.y"
1684
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), true, false)); Y}
1685
- #line 1686 "frt_q_parser.c"
1686
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), true, false, encoding)); Y}
1687
+ #line 1688 "frt_q_parser.c"
1686
1688
  break;
1687
1689
 
1688
1690
  case 42: /* range_q: '{' QWRD QWRD ']' */
1689
1691
  #line 276 "frt_q_parser.y"
1690
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), false, true)); Y}
1691
- #line 1692 "frt_q_parser.c"
1692
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), false, true, encoding)); Y}
1693
+ #line 1694 "frt_q_parser.c"
1692
1694
  break;
1693
1695
 
1694
1696
  case 43: /* range_q: '{' QWRD QWRD '}' */
1695
1697
  #line 277 "frt_q_parser.y"
1696
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), false, false)); Y}
1697
- #line 1698 "frt_q_parser.c"
1698
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-2].str), (yyvsp[-1].str), false, false, encoding)); Y}
1699
+ #line 1700 "frt_q_parser.c"
1698
1700
  break;
1699
1701
 
1700
1702
  case 44: /* range_q: '<' QWRD '}' */
1701
1703
  #line 278 "frt_q_parser.y"
1702
- { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[-1].str), false, false)); Y}
1703
- #line 1704 "frt_q_parser.c"
1704
+ { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[-1].str), false, false, encoding)); Y}
1705
+ #line 1706 "frt_q_parser.c"
1704
1706
  break;
1705
1707
 
1706
1708
  case 45: /* range_q: '<' QWRD ']' */
1707
1709
  #line 279 "frt_q_parser.y"
1708
- { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[-1].str), false, true)); Y}
1709
- #line 1710 "frt_q_parser.c"
1710
+ { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[-1].str), false, true, encoding)); Y}
1711
+ #line 1712 "frt_q_parser.c"
1710
1712
  break;
1711
1713
 
1712
1714
  case 46: /* range_q: '[' QWRD '>' */
1713
1715
  #line 280 "frt_q_parser.y"
1714
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-1].str), NULL,true, false)); Y}
1715
- #line 1716 "frt_q_parser.c"
1716
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-1].str), NULL,true, false, encoding)); Y}
1717
+ #line 1718 "frt_q_parser.c"
1716
1718
  break;
1717
1719
 
1718
1720
  case 47: /* range_q: '{' QWRD '>' */
1719
1721
  #line 281 "frt_q_parser.y"
1720
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-1].str), NULL,false, false)); Y}
1721
- #line 1722 "frt_q_parser.c"
1722
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[-1].str), NULL,false, false, encoding)); Y}
1723
+ #line 1724 "frt_q_parser.c"
1722
1724
  break;
1723
1725
 
1724
1726
  case 48: /* range_q: '<' QWRD */
1725
1727
  #line 282 "frt_q_parser.y"
1726
- { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[0].str), false, false)); Y}
1727
- #line 1728 "frt_q_parser.c"
1728
+ { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[0].str), false, false, encoding)); Y}
1729
+ #line 1730 "frt_q_parser.c"
1728
1730
  break;
1729
1731
 
1730
1732
  case 49: /* range_q: '<' '=' QWRD */
1731
1733
  #line 283 "frt_q_parser.y"
1732
- { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[0].str), false, true)); Y}
1733
- #line 1734 "frt_q_parser.c"
1734
+ { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[0].str), false, true, encoding)); Y}
1735
+ #line 1736 "frt_q_parser.c"
1734
1736
  break;
1735
1737
 
1736
1738
  case 50: /* range_q: '>' '=' QWRD */
1737
1739
  #line 284 "frt_q_parser.y"
1738
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[0].str), NULL,true, false)); Y}
1739
- #line 1740 "frt_q_parser.c"
1740
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[0].str), NULL,true, false, encoding)); Y}
1741
+ #line 1742 "frt_q_parser.c"
1740
1742
  break;
1741
1743
 
1742
1744
  case 51: /* range_q: '>' QWRD */
1743
1745
  #line 285 "frt_q_parser.y"
1744
- { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[0].str), NULL,false, false)); Y}
1745
- #line 1746 "frt_q_parser.c"
1746
+ { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[0].str), NULL,false, false, encoding)); Y}
1747
+ #line 1748 "frt_q_parser.c"
1746
1748
  break;
1747
1749
 
1748
1750
 
1749
- #line 1750 "frt_q_parser.c"
1751
+ #line 1752 "frt_q_parser.c"
1750
1752
 
1751
1753
  default: break;
1752
1754
  }
@@ -1793,7 +1795,7 @@ yyerrlab:
1793
1795
  if (!yyerrstatus)
1794
1796
  {
1795
1797
  ++yynerrs;
1796
- yyerror (qp, YY_("syntax error"));
1798
+ yyerror (qp, encoding, YY_("syntax error"));
1797
1799
  }
1798
1800
 
1799
1801
  if (yyerrstatus == 3)
@@ -1810,7 +1812,7 @@ yyerrlab:
1810
1812
  else
1811
1813
  {
1812
1814
  yydestruct ("Error: discarding",
1813
- yytoken, &yylval, qp);
1815
+ yytoken, &yylval, qp, encoding);
1814
1816
  yychar = YYEMPTY;
1815
1817
  }
1816
1818
  }
@@ -1866,7 +1868,7 @@ yyerrlab1:
1866
1868
 
1867
1869
 
1868
1870
  yydestruct ("Error: popping",
1869
- YY_ACCESSING_SYMBOL (yystate), yyvsp, qp);
1871
+ YY_ACCESSING_SYMBOL (yystate), yyvsp, qp, encoding);
1870
1872
  YYPOPSTACK (1);
1871
1873
  yystate = *yyssp;
1872
1874
  YY_STACK_PRINT (yyss, yyssp);
@@ -1904,7 +1906,7 @@ yyabortlab:
1904
1906
  | yyexhaustedlab -- YYNOMEM (memory exhaustion) comes here. |
1905
1907
  `-----------------------------------------------------------*/
1906
1908
  yyexhaustedlab:
1907
- yyerror (qp, YY_("memory exhausted"));
1909
+ yyerror (qp, encoding, YY_("memory exhausted"));
1908
1910
  yyresult = 2;
1909
1911
  goto yyreturnlab;
1910
1912
 
@@ -1919,7 +1921,7 @@ yyreturnlab:
1919
1921
  user semantic actions for why this is necessary. */
1920
1922
  yytoken = YYTRANSLATE (yychar);
1921
1923
  yydestruct ("Cleanup: discarding lookahead",
1922
- yytoken, &yylval, qp);
1924
+ yytoken, &yylval, qp, encoding);
1923
1925
  }
1924
1926
  /* Do not reclaim the symbols of the rule whose action triggered
1925
1927
  this YYABORT or YYACCEPT. */
@@ -1928,7 +1930,7 @@ yyreturnlab:
1928
1930
  while (yyssp != yyss)
1929
1931
  {
1930
1932
  yydestruct ("Cleanup: popping",
1931
- YY_ACCESSING_SYMBOL (+*yyssp), yyvsp, qp);
1933
+ YY_ACCESSING_SYMBOL (+*yyssp), yyvsp, qp, encoding);
1932
1934
  YYPOPSTACK (1);
1933
1935
  }
1934
1936
  #ifndef yyoverflow
@@ -2102,8 +2104,9 @@ static int yylex(YYSTYPE *lvalp, FrtQParser *qp)
2102
2104
  * It is responsible for clearing any memory that was allocated during the
2103
2105
  * parsing process.
2104
2106
  */
2105
- static int yyerror(FrtQParser *qp, char const *msg)
2107
+ static int yyerror(FrtQParser *qp, rb_encoding *encoding, char const *msg)
2106
2108
  {
2109
+ (void)encoding;
2107
2110
  qp->destruct = true;
2108
2111
  if (!qp->handle_parse_errors) {
2109
2112
  char buf[1024];
@@ -2133,22 +2136,21 @@ static int yyerror(FrtQParser *qp, char const *msg)
2133
2136
  * This method returns the query parser for a particular field and sets it up
2134
2137
  * with the text to be tokenized.
2135
2138
  */
2136
- static FrtTokenStream *get_cached_ts(FrtQParser *qp, FrtSymbol field, char *text)
2137
- {
2139
+ static FrtTokenStream *get_cached_ts(FrtQParser *qp, ID field, char *text, rb_encoding *encoding) {
2138
2140
  FrtTokenStream *ts;
2139
2141
  if (frt_hs_exists(qp->tokenized_fields, (void *)field)) {
2140
2142
  ts = (FrtTokenStream *)frt_h_get(qp->ts_cache, (void *)field);
2141
2143
  if (!ts) {
2142
- ts = frt_a_get_ts(qp->analyzer, field, text);
2144
+ ts = frt_a_get_ts(qp->analyzer, field, text, encoding);
2143
2145
  frt_h_set(qp->ts_cache, (void *)field, ts);
2144
2146
  }
2145
2147
  else {
2146
- ts->reset(ts, text);
2148
+ ts->reset(ts, text, encoding);
2147
2149
  }
2148
2150
  }
2149
2151
  else {
2150
2152
  ts = qp->non_tokenizer;
2151
- ts->reset(ts, text);
2153
+ ts->reset(ts, text, encoding);
2152
2154
  }
2153
2155
  return ts;
2154
2156
  }
@@ -2305,11 +2307,10 @@ static FrtBooleanClause *get_bool_cls(FrtQuery *q, FrtBCType occur)
2305
2307
  * what we want as it will match any documents containing the same email
2306
2308
  * address and tokenized with the same tokenizer.
2307
2309
  */
2308
- static FrtQuery *get_term_q(FrtQParser *qp, FrtSymbol field, char *word)
2309
- {
2310
+ static FrtQuery *get_term_q(FrtQParser *qp, ID field, char *word, rb_encoding *encoding) {
2310
2311
  FrtQuery *q;
2311
2312
  FrtToken *token;
2312
- FrtTokenStream *stream = get_cached_ts(qp, field, word);
2313
+ FrtTokenStream *stream = get_cached_ts(qp, field, word, encoding);
2313
2314
 
2314
2315
  if ((token = frt_ts_next(stream)) == NULL) {
2315
2316
  q = NULL;
@@ -2343,11 +2344,10 @@ static FrtQuery *get_term_q(FrtQParser *qp, FrtSymbol field, char *word)
2343
2344
  * will be used. If there are any more tokens after tokenization, they will be
2344
2345
  * ignored.
2345
2346
  */
2346
- static FrtQuery *get_fuzzy_q(FrtQParser *qp, FrtSymbol field, char *word, char *slop_str)
2347
- {
2347
+ static FrtQuery *get_fuzzy_q(FrtQParser *qp, ID field, char *word, char *slop_str, rb_encoding *encoding) {
2348
2348
  FrtQuery *q;
2349
2349
  FrtToken *token;
2350
- FrtTokenStream *stream = get_cached_ts(qp, field, word);
2350
+ FrtTokenStream *stream = get_cached_ts(qp, field, word, encoding);
2351
2351
 
2352
2352
  if ((token = frt_ts_next(stream)) == NULL) {
2353
2353
  q = NULL;
@@ -2365,31 +2365,20 @@ static FrtQuery *get_fuzzy_q(FrtQParser *qp, FrtSymbol field, char *word, char *
2365
2365
  }
2366
2366
 
2367
2367
  /**
2368
- * Downcase a string taking locale into account and works for multibyte
2369
- * character sets.
2368
+ * Downcase a string taking encoding into account and works for multibyte character sets.
2370
2369
  */
2371
- static char *lower_str(char *str)
2372
- {
2373
- const int max_len = (int)strlen(str) + 1;
2374
- int cnt;
2375
- wchar_t *wstr = FRT_ALLOC_N(wchar_t, max_len);
2376
- if ((cnt = mbstowcs(wstr, str, max_len)) > 0) {
2377
- wchar_t *w = wstr;
2378
- while (*w) {
2379
- *w = towlower(*w);
2380
- w++;
2381
- }
2382
- wcstombs(str, wstr, max_len);
2383
- }
2384
- else {
2385
- char *s = str;
2386
- while (*s) {
2387
- *s = tolower(*s);
2388
- s++;
2389
- }
2390
- }
2391
- free(wstr);
2392
- str[max_len] = '\0';
2370
+ static char *lower_str(char *str, int len, rb_encoding *enc) {
2371
+ OnigCaseFoldType fold_type = ONIGENC_CASE_DOWNCASE;
2372
+ const int max_len = len + 20; // CASE_MAPPING_ADDITIONAL_LENGTH
2373
+ char *buf = FRT_ALLOC_N(char, max_len);
2374
+ char *buf_end = buf + max_len + 19;
2375
+ const OnigUChar *t = (const OnigUChar *)str;
2376
+
2377
+ len = enc->case_map(&fold_type, &t, (const OnigUChar *)(str + len), (OnigUChar *)buf, (OnigUChar *)buf_end, enc);
2378
+ memcpy(str, buf, len);
2379
+ str[len] = '\0';
2380
+ free(buf);
2381
+
2393
2382
  return str;
2394
2383
  }
2395
2384
 
@@ -2402,8 +2391,7 @@ static char *lower_str(char *str)
2402
2391
  * optimized to a MatchAllQuery if the pattern is '*' or a PrefixQuery if the
2403
2392
  * only wild char (*, ?) in the pattern is a '*' at the end of the pattern.
2404
2393
  */
2405
- static FrtQuery *get_wild_q(FrtQParser *qp, FrtSymbol field, char *pattern)
2406
- {
2394
+ static FrtQuery *get_wild_q(FrtQParser *qp, ID field, char *pattern, rb_encoding *encoding) {
2407
2395
  FrtQuery *q;
2408
2396
  bool is_prefix = false;
2409
2397
  char *p;
@@ -2411,7 +2399,7 @@ static FrtQuery *get_wild_q(FrtQParser *qp, FrtSymbol field, char *pattern)
2411
2399
 
2412
2400
  if (qp->wild_lower
2413
2401
  && (!qp->tokenized_fields || frt_hs_exists(qp->tokenized_fields, (void *)field))) {
2414
- lower_str(pattern);
2402
+ lower_str(pattern, len, encoding);
2415
2403
  }
2416
2404
 
2417
2405
  /* simplify the wildcard query to a prefix query if possible. Basically a
@@ -2446,9 +2434,8 @@ static FrtQuery *get_wild_q(FrtQParser *qp, FrtSymbol field, char *pattern)
2446
2434
  /**
2447
2435
  * Adds another field to the top of the FieldStack.
2448
2436
  */
2449
- static FrtHashSet *add_field(FrtQParser *qp, const char *field_name)
2450
- {
2451
- FrtSymbol field = rb_intern(field_name);
2437
+ static FrtHashSet *add_field(FrtQParser *qp, const char *field_name) {
2438
+ ID field = rb_intern(field_name);
2452
2439
  if (qp->allow_any_fields || frt_hs_exists(qp->all_fields, (void *)field)) {
2453
2440
  frt_hs_add(qp->fields, (void *)field);
2454
2441
  }
@@ -2574,8 +2561,7 @@ static Phrase *ph_add_multi_word(Phrase *self, char *word)
2574
2561
  * This problem can easily be solved by using the StandardTokenizer or any
2575
2562
  * custom tokenizer which will leave dbalmain@gmail.com as a single token.
2576
2563
  */
2577
- static FrtQuery *get_phrase_query(FrtQParser *qp, FrtSymbol field, Phrase *phrase, char *slop_str)
2578
- {
2564
+ static FrtQuery *get_phrase_query(FrtQParser *qp, ID field, Phrase *phrase, char *slop_str, rb_encoding *encoding) {
2579
2565
  const int pos_cnt = phrase->size;
2580
2566
  FrtQuery *q = NULL;
2581
2567
 
@@ -2583,7 +2569,7 @@ static FrtQuery *get_phrase_query(FrtQParser *qp, FrtSymbol field, Phrase *phras
2583
2569
  char **words = phrase->positions[0].terms;
2584
2570
  const int word_count = frt_ary_size(words);
2585
2571
  if (word_count == 1) {
2586
- q = get_term_q(qp, field, words[0]);
2572
+ q = get_term_q(qp, field, words[0], encoding);
2587
2573
  }
2588
2574
  else {
2589
2575
  int i;
@@ -2592,7 +2578,7 @@ static FrtQuery *get_phrase_query(FrtQParser *qp, FrtSymbol field, Phrase *phras
2592
2578
  char *last_word = NULL;
2593
2579
 
2594
2580
  for (i = 0; i < word_count; i++) {
2595
- token = frt_ts_next(get_cached_ts(qp, field, words[i]));
2581
+ token = frt_ts_next(get_cached_ts(qp, field, words[i], encoding));
2596
2582
  if (token) {
2597
2583
  free(words[i]);
2598
2584
  last_word = words[i] = frt_estrdup(token->text);
@@ -2644,7 +2630,7 @@ static FrtQuery *get_phrase_query(FrtQParser *qp, FrtSymbol field, Phrase *phras
2644
2630
  pos_inc += phrase->positions[i].pos + 1; /* Actually holds pos_inc*/
2645
2631
 
2646
2632
  if (word_count == 1) {
2647
- stream = get_cached_ts(qp, field, words[0]);
2633
+ stream = get_cached_ts(qp, field, words[0], encoding);
2648
2634
  while ((token = frt_ts_next(stream))) {
2649
2635
  if (token->pos_inc) {
2650
2636
  frt_phq_add_term(q, token->text,
@@ -2661,7 +2647,7 @@ static FrtQuery *get_phrase_query(FrtQParser *qp, FrtSymbol field, Phrase *phras
2661
2647
  bool added_position = false;
2662
2648
 
2663
2649
  for (j = 0; j < word_count; j++) {
2664
- stream = get_cached_ts(qp, field, words[j]);
2650
+ stream = get_cached_ts(qp, field, words[j], encoding);
2665
2651
  if ((token = frt_ts_next(stream))) {
2666
2652
  if (!added_position) {
2667
2653
  frt_phq_add_term(q, token->text,
@@ -2685,10 +2671,10 @@ static FrtQuery *get_phrase_query(FrtQParser *qp, FrtSymbol field, Phrase *phras
2685
2671
  * the query parser as the all PhraseQuery didn't work well for this. Once the
2686
2672
  * PhraseQuery has been built the Phrase object needs to be destroyed.
2687
2673
  */
2688
- static FrtQuery *get_phrase_q(FrtQParser *qp, Phrase *phrase, char *slop_str)
2674
+ static FrtQuery *get_phrase_q(FrtQParser *qp, Phrase *phrase, char *slop_str, rb_encoding *encoding)
2689
2675
  {
2690
2676
  FrtQuery *volatile q = NULL;
2691
- FLDS(q, get_phrase_query(qp, field, phrase, slop_str));
2677
+ FLDS(q, get_phrase_query(qp, field, phrase, slop_str, encoding));
2692
2678
  ph_destroy(phrase);
2693
2679
  return q;
2694
2680
  }
@@ -2699,29 +2685,26 @@ static FrtQuery *get_phrase_q(FrtQParser *qp, Phrase *phrase, char *slop_str)
2699
2685
  * Just like with WildCardQuery, RangeQuery needs to downcase its terms if the
2700
2686
  * tokenizer also downcased its terms.
2701
2687
  */
2702
- static FrtQuery *get_r_q(FrtQParser *qp, FrtSymbol field, char *from, char *to, bool inc_lower, bool inc_upper)
2703
- {
2688
+ static FrtQuery *get_r_q(FrtQParser *qp, ID field, char *from, char *to, bool inc_lower, bool inc_upper, rb_encoding *encoding) {
2704
2689
  FrtQuery *rq;
2705
2690
  if (qp->wild_lower
2706
2691
  && (!qp->tokenized_fields || frt_hs_exists(qp->tokenized_fields, (void *)field))) {
2707
- if (from) {
2708
- lower_str(from);
2709
- }
2710
- if (to) {
2711
- lower_str(to);
2712
- }
2692
+ if (from)
2693
+ lower_str(from, strlen(from), encoding);
2694
+ if (to)
2695
+ lower_str(to, strlen(to), encoding);
2713
2696
  }
2714
2697
  /*
2715
2698
  * terms don't get tokenized as it doesn't really make sense to do so for
2716
2699
  * range queries.
2717
2700
 
2718
2701
  if (from) {
2719
- FrtTokenStream *stream = get_cached_ts(qp, field, from);
2702
+ FrtTokenStream *stream = get_cached_ts(qp, field, from, encoding);
2720
2703
  FrtToken *token = frt_ts_next(stream);
2721
2704
  from = token ? frt_estrdup(token->text) : NULL;
2722
2705
  }
2723
2706
  if (to) {
2724
- FrtTokenStream *stream = get_cached_ts(qp, field, to);
2707
+ FrtTokenStream *stream = get_cached_ts(qp, field, to, encoding);
2725
2708
  FrtToken *token = frt_ts_next(stream);
2726
2709
  to = token ? frt_estrdup(token->text) : NULL;
2727
2710
  }
@@ -2789,20 +2772,16 @@ void frt_qp_destroy(FrtQParser *self)
2789
2772
  assert(NULL == self->fields_top);
2790
2773
 
2791
2774
  frt_h_destroy(self->ts_cache);
2792
- frt_tk_destroy(self->non_tokenizer);
2775
+ frt_ts_deref(self->non_tokenizer);
2793
2776
  frt_a_deref(self->analyzer);
2794
2777
  free(self);
2795
2778
  }
2796
2779
 
2797
- /**
2798
- * Creates a new QueryParser setting all boolean parameters to their defaults.
2799
- * If +def_fields+ is NULL then +all_fields+ is used in place of +def_fields+.
2800
- * Not also that this method ensures that all fields that exist in
2801
- * +def_fields+ must also exist in +all_fields+. This should make sense.
2802
- */
2803
- FrtQParser *frt_qp_new(FrtAnalyzer *analyzer)
2804
- {
2805
- FrtQParser *self = FRT_ALLOC(FrtQParser);
2780
+ FrtQParser *frt_qp_alloc() {
2781
+ return FRT_ALLOC(FrtQParser);
2782
+ }
2783
+
2784
+ FrtQParser *frt_qp_init(FrtQParser *self, FrtAnalyzer *analyzer) {
2806
2785
  self->or_default = true;
2807
2786
  self->wild_lower = true;
2808
2787
  self->clean_str = false;
@@ -2830,8 +2809,18 @@ FrtQParser *frt_qp_new(FrtAnalyzer *analyzer)
2830
2809
  return self;
2831
2810
  }
2832
2811
 
2833
- void frt_qp_add_field(FrtQParser *self, FrtSymbol field, bool is_default, bool is_tokenized)
2834
- {
2812
+ /**
2813
+ * Creates a new QueryParser setting all boolean parameters to their defaults.
2814
+ * If +def_fields+ is NULL then +all_fields+ is used in place of +def_fields+.
2815
+ * Not also that this method ensures that all fields that exist in
2816
+ * +def_fields+ must also exist in +all_fields+. This should make sense.
2817
+ */
2818
+ FrtQParser *frt_qp_new(FrtAnalyzer *analyzer) {
2819
+ FrtQParser *self = frt_qp_alloc();
2820
+ return frt_qp_init(self, analyzer);
2821
+ }
2822
+
2823
+ void frt_qp_add_field(FrtQParser *self, ID field, bool is_default, bool is_tokenized) {
2835
2824
  frt_hs_add(self->all_fields, (void *)field);
2836
2825
  if (is_default) {
2837
2826
  frt_hs_add(self->def_fields, (void *)field);
@@ -2961,12 +2950,12 @@ char *frt_qp_clean_str(char *str)
2961
2950
  * analyzer. It then turns these tokens (if any) into a boolean query. If it
2962
2951
  * fails to find any tokens, this method will return NULL.
2963
2952
  */
2964
- static FrtQuery *qp_get_bad_query(FrtQParser *qp, char *str)
2953
+ static FrtQuery *qp_get_bad_query(FrtQParser *qp, char *str, rb_encoding *encoding)
2965
2954
  {
2966
2955
  FrtQuery *volatile q = NULL;
2967
2956
  qp->recovering = true;
2968
2957
  assert(qp->fields_top->next == NULL);
2969
- FLDS(q, get_term_q(qp, field, str));
2958
+ FLDS(q, get_term_q(qp, field, str, encoding));
2970
2959
  return q;
2971
2960
  }
2972
2961
 
@@ -2978,40 +2967,63 @@ static FrtQuery *qp_get_bad_query(FrtQParser *qp, char *str)
2978
2967
  * and turns them into a boolean query on the default fields.
2979
2968
  */
2980
2969
 
2981
- FrtQuery *qp_parse(FrtQParser *self, char *qstr)
2970
+ FrtQuery *qp_parse(FrtQParser *self, char *query_string, rb_encoding *encoding)
2982
2971
  {
2983
2972
  FrtQuery *result = NULL;
2973
+ char *qstr;
2974
+ unsigned char *dp_start = NULL;
2975
+
2984
2976
  frt_mutex_lock(&self->mutex);
2985
2977
  /* if qp->fields_top->next is not NULL we have a left over field-stack
2986
2978
  * object that was not popped during the last query parse */
2987
2979
  assert(NULL == self->fields_top->next);
2988
2980
 
2981
+ /* encode query_string to utf8 for futher processing unless it is utf8 encoded */
2982
+ if (encoding == utf8_encoding) {
2983
+ qstr = query_string;
2984
+ } else {
2985
+ /* assume query is sbc encoded und encoding to utf results in maximum utf mbc expansion */
2986
+ const unsigned char *sp = (unsigned char *)query_string;
2987
+ int query_string_len = strlen(query_string);
2988
+ int dp_length = query_string_len * utf8_mbmaxlen + 1;
2989
+ unsigned char *dp = FRT_ALLOC_N(unsigned char, dp_length);
2990
+ dp_start = dp;
2991
+ rb_econv_t *ec = rb_econv_open(rb_enc_name(encoding), rb_enc_name(utf8_encoding), RUBY_ECONV_INVALID_REPLACE);
2992
+ assert(ec != NULL);
2993
+ rb_econv_convert(ec, &sp, (unsigned char *)query_string + query_string_len, &dp, (unsigned char *)dp + dp_length - 1, 0);
2994
+ rb_econv_close(ec);
2995
+ *dp = '\0';
2996
+ qstr = (char *)dp_start;
2997
+ }
2998
+
2989
2999
  self->recovering = self->destruct = false;
3000
+
2990
3001
  if (self->clean_str) {
2991
3002
  self->qstrp = self->qstr = frt_qp_clean_str(qstr);
2992
- }
2993
- else {
3003
+ } else {
2994
3004
  self->qstrp = self->qstr = qstr;
2995
3005
  }
2996
3006
  self->fields = self->def_fields;
2997
3007
  self->result = NULL;
2998
3008
 
2999
- if (0 == yyparse(self)) {
3009
+ if (0 == yyparse(self, encoding))
3000
3010
  result = self->result;
3001
- }
3011
+
3002
3012
  if (!result && self->handle_parse_errors) {
3003
3013
  self->destruct = false;
3004
- result = qp_get_bad_query(self, self->qstr);
3014
+ result = qp_get_bad_query(self, self->qstr, encoding);
3005
3015
  }
3006
- if (self->destruct && !self->handle_parse_errors) {
3016
+ if (self->destruct && !self->handle_parse_errors)
3007
3017
  FRT_RAISE(FRT_PARSE_ERROR, frt_xmsg_buffer);
3008
- }
3009
- if (!result) {
3018
+
3019
+ if (!result)
3010
3020
  result = frt_bq_new(false);
3011
- }
3012
- if (self->clean_str) {
3021
+
3022
+ if (self->clean_str)
3013
3023
  free(self->qstr);
3014
- }
3024
+ if (dp_start)
3025
+ free(dp_start);
3026
+
3015
3027
  frt_mutex_unlock(&self->mutex);
3016
3028
  return result;
3017
3029
  }