ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_qparser.c CHANGED
@@ -4,14 +4,16 @@
4
4
  static VALUE cQueryParser;
5
5
  VALUE cQueryParseException;
6
6
 
7
- VALUE rhandle_parse_errors_key;
8
- VALUE rallow_any_fields_key;
9
- VALUE rwild_lower_key;
10
- VALUE roccur_default_key;
11
- VALUE rdefault_slop_key;
12
- VALUE rclean_str_key;
13
- VALUE rfields_key;
14
- extern VALUE ranalyzer_key;
7
+ extern VALUE sym_analyzer;
8
+ static VALUE sym_wild_card_downcase;
9
+ static VALUE sym_all_fields;
10
+ static VALUE sym_default_field;
11
+ static VALUE sym_validate_fields;
12
+ static VALUE sym_or_default;
13
+ static VALUE sym_default_slop;
14
+ static VALUE sym_handle_parse_errors;
15
+ static VALUE sym_clean_string;
16
+ static VALUE sym_max_clauses;
15
17
 
16
18
  extern VALUE frt_get_analyzer(Analyzer *a);
17
19
  extern VALUE frt_get_q(Query *q);
@@ -26,163 +28,231 @@ extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
26
28
  static void
27
29
  frt_qp_free(void *p)
28
30
  {
29
- QParser *qp = (QParser *)p;
30
- object_del(p);
31
- qp_destroy(qp);
31
+ object_del(p);
32
+ qp_destroy((QParser *)p);
32
33
  }
33
34
 
34
35
  static void
35
36
  frt_qp_mark(void *p)
36
37
  {
37
- QParser *qp = (QParser *)p;
38
- frt_gc_mark(qp->analyzer);
38
+ frt_gc_mark(((QParser *)p)->analyzer);
39
39
  }
40
40
 
41
- HashSet *
41
+ static HashSet *
42
42
  frt_get_fields(VALUE rfields)
43
43
  {
44
- VALUE rval;
45
- HashSet *fields = hs_str_create(&free);
46
- char *s, *p, *str;
44
+ VALUE rval;
45
+ HashSet *fields = hs_new_str(&free);
46
+ char *s, *p, *str;
47
47
 
48
- if (TYPE(rfields) == T_ARRAY) {
49
- int i;
50
- for (i = 0; i < RARRAY(rfields)->len; i++) {
51
- rval = rb_obj_as_string(RARRAY(rfields)->ptr[i]);
52
- hs_add(fields, estrdup(RSTRING(rval)->ptr));
53
- }
54
- } else {
55
- rval = rb_obj_as_string(rfields);
56
- if (strcmp("*", RSTRING(rval)->ptr) == 0) {
57
- hs_destroy(fields);
58
- fields = NULL;
48
+ if (TYPE(rfields) == T_ARRAY) {
49
+ int i;
50
+ for (i = 0; i < RARRAY(rfields)->len; i++) {
51
+ rval = rb_obj_as_string(RARRAY(rfields)->ptr[i]);
52
+ hs_add(fields, estrdup(RSTRING(rval)->ptr));
53
+ }
59
54
  } else {
60
- s = str = estrdup(RSTRING(rval)->ptr);
61
- while ((p = strchr(s, '|')) != '\0') {
62
- *p = '\0';
63
- hs_add(fields, estrdup(s));
64
- s = p + 1;
65
- }
66
- hs_add(fields, estrdup(s));
67
- free(str);
55
+ rval = rb_obj_as_string(rfields);
56
+ if (strcmp("*", RSTRING(rval)->ptr) == 0) {
57
+ hs_destroy(fields);
58
+ fields = NULL;
59
+ } else {
60
+ s = str = estrdup(RSTRING(rval)->ptr);
61
+ while ((p = strchr(s, '|')) != '\0') {
62
+ *p = '\0';
63
+ hs_add(fields, estrdup(s));
64
+ s = p + 1;
65
+ }
66
+ hs_add(fields, estrdup(s));
67
+ free(str);
68
+ }
68
69
  }
69
- }
70
- return fields;
70
+ return fields;
71
71
  }
72
72
 
73
+ /*
74
+ * call-seq:
75
+ * QueryParser.new(options = {}) -> QueryParser
76
+ *
77
+ * Create a new QueryParser. The QueryParser is used to convert string
78
+ * queries into Query objects. The options are;
79
+ *
80
+ * === Options
81
+ *
82
+ * :default_field:: Default: "*" (all fields). The default field to
83
+ * search when no field is specified in the search
84
+ * string. It can also be an array of fields.
85
+ * :analyzer:: Default: StandardAnalyzer. Analyzer used by the
86
+ * query parser to parse query terms
87
+ * :wild_card_downcase:: Default: true. Specifies whether wild-card queries
88
+ * should be downcased or not since they are not
89
+ * passed through the parser
90
+ * :all_fields:: Default: []. Lets the query parser know what
91
+ * fields are available for searching, particularly
92
+ * when the "*" is specified as the search field
93
+ * :validate_fields:: Default: false. Set to true if you want an
94
+ * exception to be raised if there is an attempt to
95
+ * search a non-existent field
96
+ * :or_default:: Default: true. Use "OR" as the default boolean
97
+ * operator
98
+ * :default_slop:: Default: 0. Default slop to use in PhraseQuery
99
+ * :handle_parse_errors:: Default: true. QueryParser will quietly handle all
100
+ * parsing errors internally. If you'd like to handle
101
+ * them yourself, set this parameter to false.
102
+ * :clean_string:: Default: true. QueryParser will do a quick
103
+ * once-over the query string to make sure that quotes
104
+ * and brackets match up and special characters are
105
+ * escaped
106
+ * :max_clauses:: Default: 512. the maximum number of clauses
107
+ * allowed in boolean queries and the maximum number
108
+ * of terms allowed in multi, prefix, wild-card or
109
+ * fuzzy queries when those queries are generated by
110
+ * rewriting other queries
111
+ */
73
112
  static VALUE
74
113
  frt_qp_init(int argc, VALUE *argv, VALUE self)
75
114
  {
76
- VALUE rdef_field, roptions;
77
- VALUE rval;
78
- Analyzer *analyzer = NULL;
79
-
80
- HashSet *all_fields = hs_str_create(&free);
81
- HashSet *def_fields = NULL;
82
- QParser *qp;
83
-
84
- rb_scan_args(argc, argv, "02", &rdef_field, &roptions);
85
- /* process default_field */
86
- if (argc > 0) {
87
- def_fields = frt_get_fields(rdef_field);
88
- }
89
-
90
- if (argc == 2) {
91
- if (Qnil != (rval = rb_hash_aref(roptions, ranalyzer_key))) {
92
- analyzer = frt_get_cwrapped_analyzer(rval);
93
- }
94
- if (Qnil != (rval = rb_hash_aref(roptions, rfields_key))) {
95
- all_fields = frt_get_fields(rval);
96
- }
97
- }
98
- if (all_fields == NULL) {
99
- all_fields = hs_str_create(&free);
100
- }
101
-
102
- if (!analyzer) {
103
- analyzer = mb_standard_analyzer_create(true);
104
- }
105
-
106
- qp = qp_create(all_fields, def_fields, analyzer);
107
- qp->allow_any_fields = true;
108
- qp->clean_str = true;
109
- /* handle options */
110
- if (argc == 2) {
111
- if (Qnil != (rval = rb_hash_aref(roptions, rhandle_parse_errors_key))) {
112
- qp->handle_parse_errors = RTEST(rval);
113
- }
114
- if (Qnil != (rval = rb_hash_aref(roptions, rallow_any_fields_key))) {
115
- qp->allow_any_fields = RTEST(rval);
116
- }
117
- if (Qnil != (rval = rb_hash_aref(roptions, rwild_lower_key))) {
118
- qp->wild_lower = RTEST(rval);
115
+ VALUE roptions;
116
+ VALUE rval;
117
+ Analyzer *analyzer = NULL;
118
+ bool has_options = false;
119
+
120
+ HashSet *all_fields = NULL;
121
+ HashSet *def_fields = NULL;
122
+ QParser *qp;
123
+
124
+ if (rb_scan_args(argc, argv, "01", &roptions) > 0) {
125
+ if (TYPE(roptions) == T_HASH) {
126
+ has_options = true;
127
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_default_field))) {
128
+ def_fields = frt_get_fields(rval);
129
+ }
130
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_analyzer))) {
131
+ analyzer = frt_get_cwrapped_analyzer(rval);
132
+ }
133
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_all_fields))) {
134
+ all_fields = frt_get_fields(rval);
135
+ }
136
+ } else {
137
+ def_fields = frt_get_fields(roptions);
138
+ }
119
139
  }
120
- if (Qnil != (rval = rb_hash_aref(roptions, roccur_default_key))) {
121
- qp->or_default = (FIX2INT(rval) == BC_MUST) ? false : true;
140
+ if (all_fields == NULL) {
141
+ all_fields = hs_new_str(&free);
122
142
  }
123
- if (Qnil != (rval = rb_hash_aref(roptions, rdefault_slop_key))) {
124
- qp->def_slop = FIX2INT(rval);
143
+
144
+ if (!analyzer) {
145
+ analyzer = mb_standard_analyzer_new(true);
125
146
  }
126
- if (Qnil != (rval = rb_hash_aref(roptions, rclean_str_key))) {
127
- qp->clean_str = RTEST(rval);
147
+
148
+ qp = qp_new(all_fields, def_fields, analyzer);
149
+ qp->allow_any_fields = true;
150
+ qp->clean_str = true;
151
+ /* handle options */
152
+ if (argc > 0) {
153
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
154
+ qp->handle_parse_errors = RTEST(rval);
155
+ }
156
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_validate_fields))) {
157
+ qp->allow_any_fields = !RTEST(rval);
158
+ }
159
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_wild_card_downcase))) {
160
+ qp->wild_lower = RTEST(rval);
161
+ }
162
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_or_default))) {
163
+ qp->or_default = (FIX2INT(rval) == BC_MUST) ? false : true;
164
+ }
165
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_default_slop))) {
166
+ qp->def_slop = FIX2INT(rval);
167
+ }
168
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_clean_string))) {
169
+ qp->clean_str = RTEST(rval);
170
+ }
171
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
172
+ qp->max_clauses = FIX2INT(rval);
173
+ }
128
174
  }
129
- }
130
- Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
131
- object_add(qp, self);
132
- return self;
175
+ Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
176
+ object_add(qp, self);
177
+ return self;
133
178
  }
134
179
 
135
180
  #define GET_QP QParser *qp = (QParser *)DATA_PTR(self)
181
+ /*
182
+ * call-seq:
183
+ * query_parser.parse(query_string) -> Query
184
+ *
185
+ * Parse a query string returning a Query object if parsing was successful.
186
+ * Will raise a QueryParseException if unsuccessful.
187
+ */
136
188
  static VALUE
137
189
  frt_qp_parse(VALUE self, VALUE rstr)
138
190
  {
139
- char *msg = NULL;
140
- volatile VALUE rq;
141
- GET_QP;
142
- rstr = rb_obj_as_string(rstr);
143
- TRY
144
- rq = frt_get_q(qp_parse(qp, RSTRING(rstr)->ptr));
145
- break;
146
- default:
147
- msg = xcontext.msg;
148
- HANDLED();
149
- XENDTRY
150
-
151
- if (msg) {
152
- rb_raise(cQueryParseException, msg);
153
- }
154
-
155
- return rq;
191
+ const char *msg = NULL;
192
+ volatile VALUE rq;
193
+ GET_QP;
194
+ rstr = rb_obj_as_string(rstr);
195
+ TRY
196
+ rq = frt_get_q(qp_parse(qp, RSTRING(rstr)->ptr));
197
+ break;
198
+ default:
199
+ msg = xcontext.msg;
200
+ HANDLED();
201
+ XENDTRY
202
+
203
+ if (msg) {
204
+ rb_raise(cQueryParseException, msg);
205
+ }
206
+
207
+ return rq;
156
208
  }
157
209
 
210
+ /*
211
+ * call-seq:
212
+ * query_parser.fields -> Array of Symbols
213
+ *
214
+ * Returns the list of all fields that the QueryParser knows about.
215
+ */
158
216
  static VALUE
159
217
  frt_qp_get_fields(VALUE self)
160
218
  {
161
- GET_QP;
162
- int i;
163
- HashSet *fields = qp->all_fields;
164
- VALUE rfields = rb_ary_new();
219
+ GET_QP;
220
+ int i;
221
+ HashSet *fields = qp->all_fields;
222
+ VALUE rfields = rb_ary_new();
165
223
 
166
- for (i = 0; i < fields->size; i++) {
167
- rb_ary_push(rfields, rb_str_new2((char *)fields->elems[i]));
168
- }
224
+ for (i = 0; i < fields->size; i++) {
225
+ rb_ary_push(rfields, ID2SYM(rb_intern((char *)fields->elems[i])));
226
+ }
169
227
 
170
- return rfields;
228
+ return rfields;
171
229
  }
172
230
 
231
+ /*
232
+ * call-seq:
233
+ * query_parser.fields = fields -> self
234
+ *
235
+ * Set the list of fields. These fields are expanded for searches on "*".
236
+ */
173
237
  static VALUE
174
238
  frt_qp_set_fields(VALUE self, VALUE rfields)
175
239
  {
176
- GET_QP;
177
- HashSet *fields = frt_get_fields(rfields);
240
+ GET_QP;
241
+ HashSet *fields = frt_get_fields(rfields);
178
242
 
179
- if (fields == NULL) {
180
- fields = hs_str_create(&free);
181
- }
182
- hs_destroy(qp->all_fields);
183
- qp->all_fields = fields;
243
+ if (qp->def_fields == qp->all_fields) {
244
+ qp->def_fields = NULL;
245
+ }
246
+ if (fields == NULL) {
247
+ fields = hs_new_str(&free);
248
+ }
249
+ hs_destroy(qp->all_fields);
250
+ qp->all_fields = fields;
251
+ if (qp->def_fields == NULL) {
252
+ qp->def_fields = fields;
253
+ }
184
254
 
185
- return self;
255
+ return self;
186
256
  }
187
257
 
188
258
  /****************************************************************************
@@ -191,28 +261,245 @@ frt_qp_set_fields(VALUE self, VALUE rfields)
191
261
  *
192
262
  ****************************************************************************/
193
263
 
264
+ /* rdoc hack
265
+ extern VALUE mFerret = rb_define_module("Ferret");
266
+ extern VALUE cQueryParser = rb_define_module_under(mFerret, "QueryParser");
267
+ */
268
+
269
+ /*
270
+ * Document-class: Ferret::QueryParser::QueryParseException
271
+ *
272
+ * == Summary
273
+ *
274
+ * Exception raised when there is an error parsing the query string passed to
275
+ * QueryParser.
276
+ */
277
+ void
278
+ Init_QueryParseException(void)
279
+ {
280
+ cQueryParseException = rb_define_class_under(cQueryParser,
281
+ "QueryParseException",
282
+ rb_eStandardError);
283
+ }
284
+
285
+ /*
286
+ * Document-class: Ferret::QueryParser
287
+ *
288
+ * == Summary
289
+ *
290
+ * The QueryParser is used to transform user submitted query strings into
291
+ * Query objects. Ferret uses its own Query Language known from now on as
292
+ * Ferret Query Language or FQL.
293
+ *
294
+ * == Ferret Query Language
295
+ *
296
+ * === Preamble
297
+ *
298
+ * The following characters are special characters in FQL;
299
+ *
300
+ * :, (, ), [, ], {, }, !, +, ", ~, ^, -, |, <, >, =, *, ?, \
301
+ *
302
+ * If you want to use one of these characters in one of your terms you need
303
+ * to escape it with a \ character. \ escapes itself. The exception to this
304
+ * rule is within Phrases which are strings surrounded by double quotes (and
305
+ * will be explained further below in the section on PhraseQueries). In
306
+ * Phrases, only ", | and <> have special meaning and need to be escaped if
307
+ * you want the literal value. <> is escaped \<\>.
308
+ *
309
+ * In the following examples I have only written the query string. This would
310
+ * be parsed like;
311
+ *
312
+ * query = query_parser.parse("pet:(dog AND cat)")
313
+ * puts query # => "+pet:dog +pet:cat"
314
+ *
315
+ * === TermQuery
316
+ *
317
+ * A term query is the most basic query of all and is what most of the other
318
+ * queries are built upon. The term consists of a single word. eg;
319
+ *
320
+ * 'term'
321
+ *
322
+ * Note that the analyzer will be run on the term and if it splits the term
323
+ * in two then it will be turned into a phrase query. For example, with the
324
+ * plain Ferret::Analysis::Analyzer, the following;
325
+ *
326
+ * 'dave12balmain'
327
+ *
328
+ * is equivalent to;
329
+ *
330
+ * '"dave balmain"'
331
+ *
332
+ * Which we will explain now...
333
+ *
334
+ * === PhraseQuery
335
+ *
336
+ * A phrase query is a string of terms surrounded by double quotes. For
337
+ * example you could write;
338
+ *
339
+ * '"quick brown fox"'
340
+ *
341
+ * But if a "fast" fox is just as good as a quick one you could use the |
342
+ * character to specify alternate terms.
343
+ *
344
+ * '"quick|speedy|fast brown fox"'
345
+ *
346
+ * What if we don't care what colour the fox is. We can use the <> to specify
347
+ * a place setter. eg;
348
+ *
349
+ * '"quick|speedy|fast <> fox"'
350
+ *
351
+ * This will match any word in between quick and fox. Alternatively we could
352
+ * set the "slop" for the phrase which allows a certain variation in the
353
+ * match of the phrase. The slop for a phrase is an integer indicating how
354
+ * many positions you are allowed to move the terms to get a match. Read more
355
+ * about the slop factor in Ferret::Search::PhraseQuery. To set the slop
356
+ * factor for a phrase you can type;
357
+ *
358
+ * '"big house"~2'
359
+ *
360
+ * This would match "big house", "big red house", "big red brick house" and
361
+ * even "house big". That's right, you don't need to have the terms in order
362
+ * if you allow some slop in your phrases. (See Ferret::Search::Spans if you
363
+ * need a phrase type query with ordered terms.)
364
+ *
365
+ * These basic queries will be run on the default field which is set when you
366
+ * create the query_parser. But what if you want to search a different field.
367
+ * You'll be needing a ...
368
+ *
369
+ * === FieldQuery
370
+ *
371
+ * A field query is any query prefixed by <fieldname>:. For example, to
372
+ * search for all instances of the term "ski" in field "sport", you'd write;
373
+ *
374
+ * 'sport:ski'
375
+ * Or we can apply a field to a phrase;
376
+ *
377
+ * 'sport:"skiing is fun"'
378
+ *
379
+ * Now we have a few types of queries, we'll be needing to glue them together
380
+ * with a ...
381
+ *
382
+ * === BooleanQuery
383
+ *
384
+ * There are a couple of ways of writing boolean queries. Firstly you can
385
+ * specify which terms are required, optional or required not to exist (not).
386
+ *
387
+ * * '+' or "REQ" can be used to indicate a required query. "REQ" must be
388
+ * surrounded by white space.
389
+ * * '-', '!' or "NOT" are used to indicate a query that is required to be
390
+ * false. "NOT" must be surrounded by white space.
391
+ * * all other queries are optional if the above symbols are used.
392
+ *
393
+ * Some examples;
394
+ *
395
+ * '+sport:ski -sport:snowboard sport:toboggan'
396
+ * '+ingredient:chocolate +ingredient:strawberries -ingredient:wheat'
397
+ *
398
+ * You may also use the boolean operators "AND", "&&", "OR" and "||". eg;
399
+ *
400
+ * 'sport:ski AND NOT sport:snowboard OR sport:toboggan'
401
+ * 'ingredient:chocolate AND ingredient:strawberries AND NOT ingredient:wheat'
402
+ *
403
+ * You can set the default operator when you create the query parser.
404
+ *
405
+ * === RangeQuery
406
+ *
407
+ * A range query finds all documents with terms between the two query terms.
408
+ * This can be very useful in particular for dates. eg;
409
+ *
410
+ * 'date:[20050725 20050905]' # all dates >= 20050725 and <= 20050905
411
+ * 'date:[20050725 20050905}' # all dates >= 20050725 and < 20050905
412
+ * 'date:{20050725 20050905]' # all dates > 20050725 and <= 20050905
413
+ * 'date:{20050725 20050905}' # all dates > 20050725 and < 20050905
414
+ *
415
+ * You can also do open ended queries like this;
416
+ *
417
+ * 'date:[20050725>' # all dates >= 20050725
418
+ * 'date:{20050725>' # all dates > 20050725
419
+ * 'date:<20050905]' # all dates <= 20050905
420
+ * 'date:<20050905}' # all dates < 20050905
421
+ *
422
+ * Or like this;
423
+ *
424
+ * 'date: >= 20050725'
425
+ * 'date: > 20050725'
426
+ * 'date: <= 20050905'
427
+ * 'date: < 20050905'
428
+ *
429
+ * If you prefer the above style you could also use a boolean query like this;
430
+ *
431
+ * 'date:( >= 20050725 AND <= 20050905)'
432
+ *
433
+ * But the RangeQuery-only solution shown first will be faster.
434
+ *
435
+ * === WildQuery
436
+ *
437
+ * A wild query is a query using the pattern matching characters * and ?. *
438
+ * matches 0 or more characters while ? matches a single character. This type
439
+ * of query can be really useful for matching hierarchical categories for
440
+ * example. Let's say we had this structure;
441
+ *
442
+ * /sport/skiing
443
+ * /sport/cycling
444
+ * /coding1/ruby
445
+ * /coding1/c
446
+ * /coding2/python
447
+ * /coding2/perl
448
+ *
449
+ * If you wanted all categories with programming languages you could use the
450
+ * query;
451
+ *
452
+ * 'category:/coding?/?*'
453
+ *
454
+ * Note that this query can be quite expensive if not used carefully. In the
455
+ * example above there would be no problem but you should be careful not to use
456
+ * the wild characters at the beginning of the query as it'll have to iterate
457
+ * through every term in that field. Having said that, some fields like the
458
+ * category field above will only have a small number of distinct terms so
459
+ * this could be ok.
460
+ *
461
+ * === FuzzyQuery
462
+ *
463
+ * This is like the sloppy phrase query above, except you are now adding slop
464
+ * to a term. Basically it measures the Levenshtein distance between two
465
+ * terms and if the value is below the slop threshold the term is a match.
466
+ * This time though the slop must be a float between 0 and 1.0, 1.0 being a
467
+ * perfect match and 0 being far from a match. The default is set to 0.5 so
468
+ * you don't need to give a slop value if you don't want to. You can set the
469
+ * default in the Ferret::Search::FuzzyQuery class. Here are a couple of
470
+ * examples;
471
+ *
472
+ * 'content:ferret~'
473
+ * 'content:Ostralya~0.4'
474
+ *
475
+ * Note that this query can be quite expensive. If you'd like to use this
476
+ * query, you may want to set a minimum prefix length in the FuzzyQuery
477
+ * class. This can substantially reduce the number of terms that the query
478
+ * will iterate over.
479
+ *
480
+ */
194
481
  void
195
- Init_qparser(void)
482
+ Init_QueryParser(void)
196
483
  {
197
- /* hash keys */
198
- rhandle_parse_errors_key = ID2SYM(rb_intern("handle_parse_errors"));
199
- rallow_any_fields_key = ID2SYM(rb_intern("allow_any_fields"));
200
- rwild_lower_key = ID2SYM(rb_intern("wild_lower"));
201
- roccur_default_key = ID2SYM(rb_intern("occur_default"));
202
- rdefault_slop_key = ID2SYM(rb_intern("default_slop"));
203
- rclean_str_key = ID2SYM(rb_intern("clean_string"));
204
- rfields_key = ID2SYM(rb_intern("fields"));
205
-
206
- /* QueryParser */
207
- cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
208
- rb_define_alloc_func(cQueryParser, frt_data_alloc);
209
-
210
- rb_define_method(cQueryParser, "initialize", frt_qp_init, -1);
211
- rb_define_method(cQueryParser, "parse", frt_qp_parse, 1);
212
- rb_define_method(cQueryParser, "fields", frt_qp_get_fields, 0);
213
- rb_define_method(cQueryParser, "fields=", frt_qp_set_fields, 1);
214
-
215
- /* QueryParseException */
216
- cQueryParseException = rb_define_class_under(cQueryParser,
217
- "QueryParseException", rb_eStandardError);
484
+ /* hash keys */
485
+ sym_wild_card_downcase = ID2SYM(rb_intern("wild_card_downcase"));
486
+ sym_all_fields = ID2SYM(rb_intern("fields"));
487
+ sym_default_field = ID2SYM(rb_intern("default_field"));
488
+ sym_validate_fields = ID2SYM(rb_intern("validate_fields"));
489
+ sym_or_default = ID2SYM(rb_intern("or_default"));
490
+ sym_default_slop = ID2SYM(rb_intern("default_slop"));
491
+ sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
492
+ sym_clean_string = ID2SYM(rb_intern("clean_string"));
493
+ sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
494
+
495
+ /* QueryParser */
496
+ cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
497
+ rb_define_alloc_func(cQueryParser, frt_data_alloc);
498
+
499
+ rb_define_method(cQueryParser, "initialize", frt_qp_init, -1);
500
+ rb_define_method(cQueryParser, "parse", frt_qp_parse, 1);
501
+ rb_define_method(cQueryParser, "fields", frt_qp_get_fields, 0);
502
+ rb_define_method(cQueryParser, "fields=", frt_qp_set_fields, 1);
503
+
504
+ Init_QueryParseException();
218
505
  }