sdsykes-ferret 0.11.6.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (195) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
@@ -0,0 +1,585 @@
1
+ #include "ferret.h"
2
+ #include "search.h"
3
+
4
+ static VALUE cQueryParser;
5
+ VALUE cQueryParseException;
6
+
7
+ extern VALUE sym_analyzer;
8
+ static VALUE sym_wild_card_downcase;
9
+ static VALUE sym_fields;
10
+ static VALUE sym_all_fields;
11
+ static VALUE sym_tkz_fields;
12
+ static VALUE sym_default_field;
13
+ static VALUE sym_validate_fields;
14
+ static VALUE sym_or_default;
15
+ static VALUE sym_default_slop;
16
+ static VALUE sym_handle_parse_errors;
17
+ static VALUE sym_clean_string;
18
+ static VALUE sym_max_clauses;
19
+ static VALUE sym_use_keywords;
20
+
21
+ extern VALUE frt_get_analyzer(Analyzer *a);
22
+ extern VALUE frt_get_q(Query *q);
23
+ extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
24
+
25
+ /****************************************************************************
26
+ *
27
+ * QueryParser Methods
28
+ *
29
+ ****************************************************************************/
30
+
31
+ static void
32
+ frt_qp_free(void *p)
33
+ {
34
+ object_del(p);
35
+ qp_destroy((QParser *)p);
36
+ }
37
+
38
+ static void
39
+ frt_qp_mark(void *p)
40
+ {
41
+ frt_gc_mark(((QParser *)p)->analyzer);
42
+ }
43
+
44
+ static HashSet *
45
+ frt_get_fields(VALUE rfields)
46
+ {
47
+ VALUE rval;
48
+ HashSet *fields;
49
+ char *s, *p, *str;
50
+
51
+ if (rfields == Qnil) return NULL;
52
+
53
+ fields = hs_new_str(&free);
54
+ if (TYPE(rfields) == T_ARRAY) {
55
+ int i;
56
+ for (i = 0; i < RARRAY_LEN(rfields); i++) {
57
+ rval = rb_obj_as_string(RARRAY_PTR(rfields)[i]);
58
+ hs_add(fields, nstrdup(rval));
59
+ }
60
+ } else {
61
+ rval = rb_obj_as_string(rfields);
62
+ if (strcmp("*", rs2s(rval)) == 0) {
63
+ hs_destroy(fields);
64
+ fields = NULL;
65
+ } else {
66
+ s = str = nstrdup(rval);
67
+ while ((p = strchr(s, '|')) != '\0') {
68
+ *p = '\0';
69
+ hs_add(fields, estrdup(s));
70
+ s = p + 1;
71
+ }
72
+ hs_add(fields, estrdup(s));
73
+ free(str);
74
+ }
75
+ }
76
+ return fields;
77
+ }
78
+
79
+ /*
80
+ * call-seq:
81
+ * QueryParser.new(options = {}) -> QueryParser
82
+ *
83
+ * Create a new QueryParser. The QueryParser is used to convert string
84
+ * queries into Query objects. The options are;
85
+ *
86
+ * === Options
87
+ *
88
+ * :default_field:: Default: "*" (all fields). The default field to
89
+ * search when no field is specified in the search
90
+ * string. It can also be an array of fields.
91
+ * :analyzer:: Default: StandardAnalyzer. Analyzer used by the
92
+ * query parser to parse query terms
93
+ * :wild_card_downcase:: Default: true. Specifies whether wild-card queries
94
+ * and range queries should be downcased or not since
95
+ * they are not passed through the parser
96
+ * :fields:: Default: []. Lets the query parser know what
97
+ * fields are available for searching, particularly
98
+ * when the "*" is specified as the search field
99
+ * :tokenized_fields:: Default: :fields. Lets the query parser know which
100
+ * fields are tokenized so it knows which fields to
101
+ * run the analyzer over.
102
+ * :validate_fields:: Default: false. Set to true if you want an
103
+ * exception to be raised if there is an attempt to
104
+ * search a non-existent field
105
+ * :or_default:: Default: true. Use "OR" as the default boolean
106
+ * operator
107
+ * :default_slop:: Default: 0. Default slop to use in PhraseQuery
108
+ * :handle_parse_errors:: Default: true. QueryParser will quietly handle all
109
+ * parsing errors internally. If you'd like to handle
110
+ * them yourself, set this parameter to false.
111
+ * :clean_string:: Default: true. QueryParser will do a quick
112
+ * once-over the query string make sure that quotes
113
+ * and brackets match up and special characters are
114
+ * escaped
115
+ * :max_clauses:: Default: 512. the maximum number of clauses
116
+ * allowed in boolean queries and the maximum number
117
+ * of terms allowed in multi, prefix, wild-card or
118
+ * fuzzy queries when those queries are generated by
119
+ * rewriting other queries
120
+ * :use_keywords: Default: true. By default AND, OR, NOT and REQ are
121
+ * keywords used by the query parser. Sometimes this
122
+ * is undesirable. For example, if your application
123
+ * allows searching for US states by their
124
+ * abbreviation, then OR will be a common query
125
+ * string. By setting :use_keywords to false, OR will
126
+ * no longer be a keyword allowing searches for the
127
+ * state of Oregon. You will still be able to use
128
+ * boolean queries by using the + and - characters.
129
+ */
130
+ static VALUE
131
+ frt_qp_init(int argc, VALUE *argv, VALUE self)
132
+ {
133
+ VALUE roptions = Qnil;
134
+ VALUE rval;
135
+ Analyzer *analyzer = NULL;
136
+ bool has_options = false;
137
+
138
+ HashSet *all_fields = NULL;
139
+ HashSet *tkz_fields = NULL;
140
+ HashSet *def_fields = NULL;
141
+ QParser *qp;
142
+
143
+ if (rb_scan_args(argc, argv, "01", &roptions) > 0) {
144
+ if (TYPE(roptions) == T_HASH) {
145
+ has_options = true;
146
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_default_field))) {
147
+ def_fields = frt_get_fields(rval);
148
+ }
149
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_analyzer))) {
150
+ analyzer = frt_get_cwrapped_analyzer(rval);
151
+ }
152
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_all_fields))) {
153
+ all_fields = frt_get_fields(rval);
154
+ }
155
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_fields))) {
156
+ all_fields = frt_get_fields(rval);
157
+ }
158
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_tkz_fields))) {
159
+ tkz_fields = frt_get_fields(rval);
160
+ }
161
+ } else {
162
+ def_fields = frt_get_fields(roptions);
163
+ roptions = Qnil;
164
+ }
165
+ }
166
+ if (all_fields == NULL) {
167
+ all_fields = hs_new_str(&free);
168
+ }
169
+
170
+ if (!analyzer) {
171
+ analyzer = mb_standard_analyzer_new(true);
172
+ }
173
+
174
+ qp = qp_new(all_fields, def_fields, tkz_fields, analyzer);
175
+ qp->allow_any_fields = true;
176
+ qp->clean_str = true;
177
+ qp->handle_parse_errors = true;
178
+ /* handle options */
179
+ if (roptions != Qnil) {
180
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
181
+ qp->handle_parse_errors = RTEST(rval);
182
+ }
183
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_validate_fields))) {
184
+ qp->allow_any_fields = !RTEST(rval);
185
+ }
186
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_wild_card_downcase))) {
187
+ qp->wild_lower = RTEST(rval);
188
+ }
189
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_or_default))) {
190
+ qp->or_default = RTEST(rval);
191
+ }
192
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_default_slop))) {
193
+ qp->def_slop = FIX2INT(rval);
194
+ }
195
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_clean_string))) {
196
+ qp->clean_str = RTEST(rval);
197
+ }
198
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
199
+ qp->max_clauses = FIX2INT(rval);
200
+ }
201
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_use_keywords))) {
202
+ qp->use_keywords = RTEST(rval);
203
+ }
204
+ }
205
+ Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
206
+ object_add(qp, self);
207
+ return self;
208
+ }
209
+
210
+ #define GET_QP QParser *qp = (QParser *)DATA_PTR(self)
211
+ /*
212
+ * call-seq:
213
+ * query_parser.parse(query_string) -> Query
214
+ *
215
+ * Parse a query string returning a Query object if parsing was successful.
216
+ * Will raise a QueryParseException if unsuccessful.
217
+ */
218
+ static VALUE
219
+ frt_qp_parse(VALUE self, VALUE rstr)
220
+ {
221
+ const char *msg = NULL;
222
+ volatile VALUE rq;
223
+ GET_QP;
224
+ rstr = rb_obj_as_string(rstr);
225
+ TRY
226
+ rq = frt_get_q(qp_parse(qp, rs2s(rstr)));
227
+ break;
228
+ default:
229
+ msg = xcontext.msg;
230
+ HANDLED();
231
+ XENDTRY
232
+
233
+ if (msg) {
234
+ rb_raise(cQueryParseException, msg);
235
+ }
236
+
237
+ return rq;
238
+ }
239
+
240
+ /*
241
+ * call-seq:
242
+ * query_parser.fields -> Array of Symbols
243
+ *
244
+ * Returns the list of all fields that the QueryParser knows about.
245
+ */
246
+ static VALUE
247
+ frt_qp_get_fields(VALUE self)
248
+ {
249
+ GET_QP;
250
+ int i;
251
+ HashSet *fields = qp->all_fields;
252
+ VALUE rfields = rb_ary_new();
253
+
254
+ for (i = 0; i < fields->size; i++) {
255
+ rb_ary_push(rfields, ID2SYM(rb_intern((char *)fields->elems[i])));
256
+ }
257
+
258
+ return rfields;
259
+ }
260
+
261
+ /*
262
+ * call-seq:
263
+ * query_parser.fields = fields -> self
264
+ *
265
+ * Set the list of fields. These fields are expanded for searches on "*".
266
+ */
267
+ static VALUE
268
+ frt_qp_set_fields(VALUE self, VALUE rfields)
269
+ {
270
+ GET_QP;
271
+ HashSet *fields = frt_get_fields(rfields);
272
+
273
+ if (qp->def_fields == qp->all_fields) {
274
+ qp->def_fields = NULL;
275
+ }
276
+ if (fields == NULL) {
277
+ fields = hs_new_str(&free);
278
+ }
279
+ hs_destroy(qp->all_fields);
280
+ qp->all_fields = fields;
281
+ if (qp->def_fields == NULL) {
282
+ qp->def_fields = fields;
283
+ }
284
+
285
+ return self;
286
+ }
287
+
288
+ /*
289
+ * call-seq:
290
+ * query_parser.tokenized_fields -> Array of Symbols
291
+ *
292
+ * Returns the list of all tokenized_fields that the QueryParser knows about.
293
+ */
294
+ static VALUE
295
+ frt_qp_get_tkz_fields(VALUE self)
296
+ {
297
+ GET_QP;
298
+ int i;
299
+ HashSet *fields = qp->tokenized_fields;
300
+ if (fields) {
301
+ VALUE rfields = rb_ary_new();
302
+
303
+ for (i = 0; i < fields->size; i++) {
304
+ rb_ary_push(rfields, ID2SYM(rb_intern((char *)fields->elems[i])));
305
+ }
306
+
307
+ return rfields;
308
+ }
309
+ else {
310
+ return Qnil;
311
+ }
312
+ }
313
+
314
+ /*
315
+ * call-seq:
316
+ * query_parser.tokenized_fields = fields -> self
317
+ *
318
+ * Set the list of tokenized_fields. These tokenized_fields are tokenized in
319
+ * the queries. If this is set to Qnil then all fields will be tokenized.
320
+ */
321
+ static VALUE
322
+ frt_qp_set_tkz_fields(VALUE self, VALUE rfields)
323
+ {
324
+ GET_QP;
325
+ if (qp->tokenized_fields) hs_destroy(qp->tokenized_fields);
326
+ qp->tokenized_fields = frt_get_fields(rfields);
327
+ return self;
328
+ }
329
+
330
+ /****************************************************************************
331
+ *
332
+ * Init function
333
+ *
334
+ ****************************************************************************/
335
+
336
+ /* rdoc hack
337
+ extern VALUE mFerret = rb_define_module("Ferret");
338
+ extern VALUE cQueryParser = rb_define_module_under(mFerret, "QueryParser");
339
+ */
340
+
341
+ /*
342
+ * Document-class: Ferret::QueryParser::QueryParseException
343
+ *
344
+ * == Summary
345
+ *
346
+ * Exception raised when there is an error parsing the query string passed to
347
+ * QueryParser.
348
+ */
349
+ void
350
+ Init_QueryParseException(void)
351
+ {
352
+ cQueryParseException = rb_define_class_under(cQueryParser,
353
+ "QueryParseException",
354
+ rb_eStandardError);
355
+ }
356
+
357
+ /*
358
+ * Document-class: Ferret::QueryParser
359
+ *
360
+ * == Summary
361
+ *
362
+ * The QueryParser is used to transform user submitted query strings into
363
+ * QueryObjects. Ferret uses its own Query Language, known from now on as
364
+ * Ferret Query Language or FQL.
365
+ *
366
+ * == Ferret Query Language
367
+ *
368
+ * === Preamble
369
+ *
370
+ * The following characters are special characters in FQL;
371
+ *
372
+ * :, (, ), [, ], {, }, !, +, ", ~, ^, -, |, <, >, =, *, ?, \
373
+ *
374
+ * If you want to use one of these characters in one of your terms you need
375
+ * to escape it with a \ character. \ escapes itself. The exception to this
376
+ * rule is within Phrases, which are strings surrounded by double quotes (and
377
+ * will be explained further below in the section on PhraseQueries). In
378
+ * Phrases, only ", | and <> have special meaning and need to be escaped if
379
+ * you want the literal value. <> is escaped \<\>.
380
+ *
381
+ * In the following examples I have only written the query string. This would
382
+ * be parsed like;
383
+ *
384
+ * query = query_parser.parse("pet:(dog AND cat)")
385
+ * puts query # => "+pet:dog +pet:cat"
386
+ *
387
+ * === TermQuery
388
+ *
389
+ * A term query is the most basic query of all and is what most of the other
390
+ * queries are built upon. The term consists of a single word. eg;
391
+ *
392
+ * 'term'
393
+ *
394
+ * Note that the analyzer will be run on the term and if it splits the term
395
+ * in two then it will be turned into a phrase query. For example, with the
396
+ * plain Ferret::Analysis::Analyzer, the following;
397
+ *
398
+ * 'dave12balmain'
399
+ *
400
+ * is equivalent to;
401
+ *
402
+ * '"dave balmain"'
403
+ *
404
+ * Which we will explain now...
405
+ *
406
+ * === PhraseQuery
407
+ *
408
+ * A phrase query is a string of terms surrounded by double quotes. For
409
+ * example you could write;
410
+ *
411
+ * '"quick brown fox"'
412
+ *
413
+ * But if a "fast" fox is just as good as a quick one you could use the |
414
+ * character to specify alternate terms.
415
+ *
416
+ * '"quick|speedy|fast brown fox"'
417
+ *
418
+ * What if we don't care what colour the fox is. We can use the <> to specify
419
+ * a place setter. eg;
420
+ *
421
+ * '"quick|speedy|fast <> fox"'
422
+ *
423
+ * This will match any word in between quick and fox. Alternatively we could
424
+ * set the "slop" for the phrase which allows a certain variation in the
425
+ * match of the phrase. The slop for a phrase is an integer indicating how
426
+ * many positions you are allowed to move the terms to get a match. Read more
427
+ * about the slop factor in Ferret::Search::PhraseQuery. To set the slop
428
+ * factor for a phrase you can type;
429
+ *
430
+ * '"big house"~2'
431
+ *
432
+ * This would match "big house", "big red house", "big red brick house" and
433
+ * even "house big". That's right, you don't need to have the terms in order
434
+ * if you allow some slop in your phrases. (See Ferret::Search::Spans if you
435
+ * need a phrase type query with ordered terms.)
436
+ *
437
+ * These basic queries will be run on the default field which is set when you
438
+ * create the query_parser. But what if you want to search a different field.
439
+ * You'll be needing a ...
440
+ *
441
+ * === FieldQuery
442
+ *
443
+ * A field query is any field prefixed by <fieldname>:. For example, to
444
+ * search for all instances of the term "ski" in field "sport", you'd write;
445
+ *
446
+ * 'sport:ski'
447
+ * Or we can apply a field to phrase;
448
+ *
449
+ * 'sport:"skiing is fun"'
450
+ *
451
+ * Now we have a few types of queries, we'll be needing to glue them together
452
+ * with a ...
453
+ *
454
+ * === BooleanQuery
455
+ *
456
+ * There are a couple of ways of writing boolean queries. Firstly you can
457
+ * specify which terms are required, optional or required not to exist (not).
458
+ *
459
+ * * '+' or "REQ" can be used to indicate a required query. "REQ" must be
460
+ * surrounded by white space.
461
+ * * '-', '!' or "NOT" are used to indicate a query that is required to be
462
+ * false. "NOT" must be surrounded by white space.
463
+ * * all other queries are optional if the above symbols are used.
464
+ *
465
+ * Some examples;
466
+ *
467
+ * '+sport:ski -sport:snowboard sport:toboggan'
468
+ * '+ingredient:chocolate +ingredient:strawberries -ingredient:wheat'
469
+ *
470
+ * You may also use the boolean operators "AND", "&&", "OR" and "||". eg;
471
+ *
472
+ * 'sport:ski AND NOT sport:snowboard OR sport:toboggan'
473
+ * 'ingredient:chocolate AND ingredient:strawberries AND NOT ingredient:wheat'
474
+ *
475
+ * You can set the default operator when you create the query parser.
476
+ *
477
+ * === RangeQuery
478
+ *
479
+ * A range query finds all documents with terms between the two query terms.
480
+ * This can be very useful in particular for dates. eg;
481
+ *
482
+ * 'date:[20050725 20050905]' # all dates >= 20050725 and <= 20050905
483
+ * 'date:[20050725 20050905}' # all dates >= 20050725 and < 20050905
484
+ * 'date:{20050725 20050905]' # all dates > 20050725 and <= 20050905
485
+ * 'date:{20050725 20050905}' # all dates > 20050725 and < 20050905
486
+ *
487
+ * You can also do open ended queries like this;
488
+ *
489
+ * 'date:[20050725>' # all dates >= 20050725
490
+ * 'date:{20050725>' # all dates > 20050725
491
+ * 'date:<20050905]' # all dates <= 20050905
492
+ * 'date:<20050905}' # all dates < 20050905
493
+ *
494
+ * Or like this;
495
+ *
496
+ * 'date: >= 20050725'
497
+ * 'date: > 20050725'
498
+ * 'date: <= 20050905'
499
+ * 'date: < 20050905'
500
+ *
501
+ * If you prefer the above style you could use a boolean query but like this;
502
+ *
503
+ * 'date:( >= 20050725 AND <= 20050905)'
504
+ *
505
+ * But the RangeQuery-only solution shown first will be faster.
506
+ *
507
+ * === WildQuery
508
+ *
509
+ * A wild query is a query using the pattern matching characters * and ?. *
510
+ * matches 0 or more characters while ? matches a single character. This type
511
+ * of query can be really useful for matching hierarchical categories for
512
+ * example. Let's say we had this structure;
513
+ *
514
+ * /sport/skiing
515
+ * /sport/cycling
516
+ * /coding1/ruby
517
+ * /coding1/c
518
+ * /coding2/python
519
+ * /coding2/perl
520
+ *
521
+ * If you wanted all categories with programming languages you could use the
522
+ * query;
523
+ *
524
+ * 'category:/coding?/?*'
525
+ *
526
+ * Note that this query can be quite expensive if not used carefully. In the
527
+ * example above there would be no problem but you should be careful not to use
528
+ * the wild characters at the beginning of the query as it'll have to iterate
529
+ * through every term in that field. Having said that, some fields like the
530
+ * category field above will only have a small number of distinct fields so
531
+ * this could be okay.
532
+ *
533
+ * === FuzzyQuery
534
+ *
535
+ * This is like the sloppy phrase query above, except you are now adding slop
536
+ * to a term. Basically it measures the Levenshtein distance between two
537
+ * terms and if the value is below the slop threshold the term is a match.
538
+ * This time though the slop must be a float between 0 and 1.0, 1.0 being a
539
+ * perfect match and 0 being far from a match. The default is set to 0.5 so
540
+ * you don't need to give a slop value if you don't want to. You can set the
541
+ * default in the Ferret::Search::FuzzyQuery class. Here are a couple of
542
+ * examples;
543
+ *
544
+ * 'content:ferret~'
545
+ * 'content:Ostralya~0.4'
546
+ *
547
+ * Note that this query can be quite expensive. If you'd like to use this
548
+ * query, you may want to set a minimum prefix length in the FuzzyQuery
549
+ * class. This can substantially reduce the number of terms that the query
550
+ * will iterate over.
551
+ *
552
+ */
553
+ void
554
+ Init_QueryParser(void)
555
+ {
556
+ /* hash keys */
557
+ sym_wild_card_downcase = ID2SYM(rb_intern("wild_card_downcase"));
558
+ sym_fields = ID2SYM(rb_intern("fields"));
559
+ sym_all_fields = ID2SYM(rb_intern("all_fields"));
560
+ sym_tkz_fields = ID2SYM(rb_intern("tokenized_fields"));
561
+ sym_default_field = ID2SYM(rb_intern("default_field"));
562
+ sym_validate_fields = ID2SYM(rb_intern("validate_fields"));
563
+ sym_or_default = ID2SYM(rb_intern("or_default"));
564
+ sym_default_slop = ID2SYM(rb_intern("default_slop"));
565
+ sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
566
+ sym_clean_string = ID2SYM(rb_intern("clean_string"));
567
+ sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
568
+ sym_use_keywords = ID2SYM(rb_intern("use_keywords"));
569
+
570
+ /* QueryParser */
571
+ cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
572
+ rb_define_alloc_func(cQueryParser, frt_data_alloc);
573
+
574
+ rb_define_method(cQueryParser, "initialize", frt_qp_init, -1);
575
+ rb_define_method(cQueryParser, "parse", frt_qp_parse, 1);
576
+ rb_define_method(cQueryParser, "fields", frt_qp_get_fields, 0);
577
+ rb_define_method(cQueryParser, "fields=", frt_qp_set_fields, 1);
578
+ rb_define_method(cQueryParser, "tokenized_fields",
579
+ frt_qp_get_tkz_fields, 0);
580
+ rb_define_method(cQueryParser, "tokenized_fields=",
581
+ frt_qp_set_tkz_fields, 1);
582
+
583
+ Init_QueryParseException();
584
+ }
585
+