ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -58,6 +58,14 @@
58
58
  /* Using locations. */
59
59
  #define YYLSP_NEEDED 0
60
60
 
61
+ /* Substitute the variable and function names. */
62
+ #define yyparse frt_parse
63
+ #define yylex frt_lex
64
+ #define yyerror frt_error
65
+ #define yylval frt_lval
66
+ #define yychar frt_char
67
+ #define yydebug frt_debug
68
+ #define yynerrs frt_nerrs
61
69
 
62
70
 
63
71
  /* Tokens. */
@@ -90,14 +98,17 @@
90
98
 
91
99
 
92
100
  /* Copy the first part of user declarations. */
93
- #line 1 "src/q_parser.y"
101
+ #line 84 "src/q_parser.y"
94
102
 
95
103
  #include <string.h>
96
104
  #include <ctype.h>
97
105
  #include <wctype.h>
106
+ #include <assert.h>
98
107
  #include "except.h"
99
108
  #include "search.h"
100
109
  #include "array.h"
110
+ #include "symbol.h"
111
+ #include "internal.h"
101
112
 
102
113
  typedef struct Phrase {
103
114
  int size;
@@ -138,7 +149,7 @@ int qp_default_fuzzy_pre_len = 0;
138
149
 
139
150
  #if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
140
151
  typedef union YYSTYPE
141
- #line 27 "src/q_parser.y"
152
+ #line 113 "src/q_parser.y"
142
153
  {
143
154
  Query *query;
144
155
  BooleanClause *bcls;
@@ -148,7 +159,7 @@ typedef union YYSTYPE
148
159
  char *str;
149
160
  }
150
161
  /* Line 187 of yacc.c. */
151
- #line 152 "y.tab.c"
162
+ #line 163 "src/q_parser.c"
152
163
  YYSTYPE;
153
164
  # define yystype YYSTYPE /* obsolescent; will be withdrawn */
154
165
  # define YYSTYPE_IS_DECLARED 1
@@ -158,7 +169,7 @@ typedef union YYSTYPE
158
169
 
159
170
 
160
171
  /* Copy the second part of user declarations. */
161
- #line 35 "src/q_parser.y"
172
+ #line 121 "src/q_parser.y"
162
173
 
163
174
  static int yylex(YYSTYPE *lvalp, QParser *qp);
164
175
  static int yyerror(QParser *qp, char const *msg);
@@ -169,17 +180,19 @@ static Query *get_bool_q(BCArray *bca);
169
180
  static BCArray *first_cls(BooleanClause *boolean_clause);
170
181
  static BCArray *add_and_cls(BCArray *bca, BooleanClause *clause);
171
182
  static BCArray *add_or_cls(BCArray *bca, BooleanClause *clause);
172
- static BCArray *add_default_cls(QParser *qp, BCArray *bca, BooleanClause *clause);
183
+ static BCArray *add_default_cls(QParser *qp, BCArray *bca,
184
+ BooleanClause *clause);
173
185
  static void bca_destroy(BCArray *bca);
174
186
 
175
- static BooleanClause *get_bool_cls(Query *q, unsigned int occur);
187
+ static BooleanClause *get_bool_cls(Query *q, BCType occur);
176
188
 
177
- static Query *get_term_q(QParser *qp, char *field, char *word);
178
- static Query *get_fuzzy_q(QParser *qp, char *field, char *word, char *slop);
179
- static Query *get_wild_q(QParser *qp, char *field, char *pattern);
189
+ static Query *get_term_q(QParser *qp, Symbol field, char *word);
190
+ static Query *get_fuzzy_q(QParser *qp, Symbol field, char *word,
191
+ char *slop);
192
+ static Query *get_wild_q(QParser *qp, Symbol field, char *pattern);
180
193
 
181
- static HashSet *first_field(QParser *qp, char *field);
182
- static HashSet *add_field(QParser *qp, char *field);
194
+ static HashSet *first_field(QParser *qp, const char *field);
195
+ static HashSet *add_field(QParser *qp, const char *field);
183
196
 
184
197
  static Query *get_phrase_q(QParser *qp, Phrase *phrase, char *slop);
185
198
 
@@ -188,22 +201,32 @@ static Phrase *ph_add_word(Phrase *self, char *word);
188
201
  static Phrase *ph_add_multi_word(Phrase *self, char *word);
189
202
  static void ph_destroy(Phrase *self);
190
203
 
191
- static Query *get_r_q(QParser *qp, char *field, char *from, char *to,
204
+ static Query *get_r_q(QParser *qp, Symbol field, char *from, char *to,
192
205
  bool inc_lower, bool inc_upper);
193
206
 
207
+ static void qp_push_fields(QParser *self, HashSet *fields, bool destroy);
208
+ static void qp_pop_fields(QParser *self);
209
+
210
+ /**
211
+ * +FLDS+ calls +func+ for all fields on top of the field stack. +func+
212
+ * must return a query. If there is more than one field on top of FieldStack
213
+ * then +FLDS+ will combine all the queries returned by +func+ into a single
214
+ * BooleanQuery which it then assigns to +q+. If there is only one field, the
215
+ * return value of +func+ is assigned to +q+ directly.
216
+ */
194
217
  #define FLDS(q, func) do {\
195
218
  TRY {\
196
- char *field;\
219
+ Symbol field;\
197
220
  if (qp->fields->size == 0) {\
198
221
  q = NULL;\
199
222
  } else if (qp->fields->size == 1) {\
200
- field = (char *)qp->fields->elems[0];\
223
+ field = (Symbol)qp->fields->first->elem;\
201
224
  q = func;\
202
225
  } else {\
203
- int i;Query *sq;\
226
+ Query *volatile sq; HashSetEntry *volatile hse;\
204
227
  q = bq_new_max(false, qp->max_clauses);\
205
- for (i = 0; i < qp->fields->size; i++) {\
206
- field = (char *)qp->fields->elems[i];\
228
+ for (hse = qp->fields->first; hse; hse = hse->next) {\
229
+ field = (Symbol)hse->elem;\
207
230
  sq = func;\
208
231
  TRY\
209
232
  if (sq) bq_add_query_nr(q, sq, BC_SHOULD);\
@@ -234,7 +257,7 @@ static Query *get_r_q(QParser *qp, char *field, char *from, char *to,
234
257
 
235
258
 
236
259
  /* Line 216 of yacc.c. */
237
- #line 238 "y.tab.c"
260
+ #line 261 "src/q_parser.c"
238
261
 
239
262
  #ifdef short
240
263
  # undef short
@@ -535,14 +558,14 @@ static const yytype_int8 yyrhs[] =
535
558
  };
536
559
 
537
560
  /* YYRLINE[YYN] -- source line where rule number YYN was defined. */
538
- static const yytype_uint8 yyrline[] =
561
+ static const yytype_uint16 yyrline[] =
539
562
  {
540
- 0, 128, 128, 129, 131, 132, 133, 134, 136, 137,
541
- 138, 140, 141, 143, 144, 145, 146, 147, 148, 149,
542
- 151, 152, 153, 155, 157, 157, 159, 159, 159, 162,
543
- 163, 165, 166, 167, 168, 170, 171, 172, 173, 174,
544
- 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
545
- 186, 187
563
+ 0, 226, 226, 227, 229, 230, 231, 232, 234, 235,
564
+ 236, 238, 239, 241, 242, 243, 244, 245, 246, 247,
565
+ 249, 250, 251, 253, 255, 255, 257, 257, 257, 260,
566
+ 261, 263, 264, 265, 266, 268, 269, 270, 271, 272,
567
+ 274, 275, 276, 277, 278, 279, 280, 281, 282, 283,
568
+ 284, 285
546
569
  };
547
570
  #endif
548
571
 
@@ -1201,59 +1224,59 @@ yydestruct (yymsg, yytype, yyvaluep, qp)
1201
1224
  switch (yytype)
1202
1225
  {
1203
1226
  case 27: /* "bool_q" */
1204
- #line 123 "src/q_parser.y"
1227
+ #line 221 "src/q_parser.y"
1205
1228
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1206
- #line 1207 "y.tab.c"
1229
+ #line 1230 "src/q_parser.c"
1207
1230
  break;
1208
1231
  case 28: /* "bool_clss" */
1209
- #line 125 "src/q_parser.y"
1232
+ #line 223 "src/q_parser.y"
1210
1233
  { if ((yyvaluep->bclss) && qp->destruct) bca_destroy((yyvaluep->bclss)); };
1211
- #line 1212 "y.tab.c"
1234
+ #line 1235 "src/q_parser.c"
1212
1235
  break;
1213
1236
  case 29: /* "bool_cls" */
1214
- #line 124 "src/q_parser.y"
1237
+ #line 222 "src/q_parser.y"
1215
1238
  { if ((yyvaluep->bcls) && qp->destruct) bc_deref((yyvaluep->bcls)); };
1216
- #line 1217 "y.tab.c"
1239
+ #line 1240 "src/q_parser.c"
1217
1240
  break;
1218
1241
  case 30: /* "boosted_q" */
1219
- #line 123 "src/q_parser.y"
1242
+ #line 221 "src/q_parser.y"
1220
1243
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1221
- #line 1222 "y.tab.c"
1244
+ #line 1245 "src/q_parser.c"
1222
1245
  break;
1223
1246
  case 31: /* "q" */
1224
- #line 123 "src/q_parser.y"
1247
+ #line 221 "src/q_parser.y"
1225
1248
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1226
- #line 1227 "y.tab.c"
1249
+ #line 1250 "src/q_parser.c"
1227
1250
  break;
1228
1251
  case 32: /* "term_q" */
1229
- #line 123 "src/q_parser.y"
1252
+ #line 221 "src/q_parser.y"
1230
1253
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1231
- #line 1232 "y.tab.c"
1254
+ #line 1255 "src/q_parser.c"
1232
1255
  break;
1233
1256
  case 33: /* "wild_q" */
1234
- #line 123 "src/q_parser.y"
1257
+ #line 221 "src/q_parser.y"
1235
1258
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1236
- #line 1237 "y.tab.c"
1259
+ #line 1260 "src/q_parser.c"
1237
1260
  break;
1238
1261
  case 34: /* "field_q" */
1239
- #line 123 "src/q_parser.y"
1262
+ #line 221 "src/q_parser.y"
1240
1263
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1241
- #line 1242 "y.tab.c"
1264
+ #line 1265 "src/q_parser.c"
1242
1265
  break;
1243
1266
  case 39: /* "phrase_q" */
1244
- #line 123 "src/q_parser.y"
1267
+ #line 221 "src/q_parser.y"
1245
1268
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1246
- #line 1247 "y.tab.c"
1269
+ #line 1270 "src/q_parser.c"
1247
1270
  break;
1248
1271
  case 40: /* "ph_words" */
1249
- #line 126 "src/q_parser.y"
1272
+ #line 224 "src/q_parser.y"
1250
1273
  { if ((yyvaluep->phrase) && qp->destruct) ph_destroy((yyvaluep->phrase)); };
1251
- #line 1252 "y.tab.c"
1274
+ #line 1275 "src/q_parser.c"
1252
1275
  break;
1253
1276
  case 41: /* "range_q" */
1254
- #line 123 "src/q_parser.y"
1277
+ #line 221 "src/q_parser.y"
1255
1278
  { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); };
1256
- #line 1257 "y.tab.c"
1279
+ #line 1280 "src/q_parser.c"
1257
1280
  break;
1258
1281
 
1259
1282
  default:
@@ -1562,228 +1585,228 @@ yyreduce:
1562
1585
  switch (yyn)
1563
1586
  {
1564
1587
  case 2:
1565
- #line 128 "src/q_parser.y"
1588
+ #line 226 "src/q_parser.y"
1566
1589
  { qp->result = (yyval.query) = NULL; }
1567
1590
  break;
1568
1591
 
1569
1592
  case 3:
1570
- #line 129 "src/q_parser.y"
1593
+ #line 227 "src/q_parser.y"
1571
1594
  { T qp->result = (yyval.query) = get_bool_q((yyvsp[(1) - (1)].bclss)); E }
1572
1595
  break;
1573
1596
 
1574
1597
  case 4:
1575
- #line 131 "src/q_parser.y"
1598
+ #line 229 "src/q_parser.y"
1576
1599
  { T (yyval.bclss) = first_cls((yyvsp[(1) - (1)].bcls)); E }
1577
1600
  break;
1578
1601
 
1579
1602
  case 5:
1580
- #line 132 "src/q_parser.y"
1603
+ #line 230 "src/q_parser.y"
1581
1604
  { T (yyval.bclss) = add_and_cls((yyvsp[(1) - (3)].bclss), (yyvsp[(3) - (3)].bcls)); E }
1582
1605
  break;
1583
1606
 
1584
1607
  case 6:
1585
- #line 133 "src/q_parser.y"
1608
+ #line 231 "src/q_parser.y"
1586
1609
  { T (yyval.bclss) = add_or_cls((yyvsp[(1) - (3)].bclss), (yyvsp[(3) - (3)].bcls)); E }
1587
1610
  break;
1588
1611
 
1589
1612
  case 7:
1590
- #line 134 "src/q_parser.y"
1613
+ #line 232 "src/q_parser.y"
1591
1614
  { T (yyval.bclss) = add_default_cls(qp, (yyvsp[(1) - (2)].bclss), (yyvsp[(2) - (2)].bcls)); E }
1592
1615
  break;
1593
1616
 
1594
1617
  case 8:
1595
- #line 136 "src/q_parser.y"
1618
+ #line 234 "src/q_parser.y"
1596
1619
  { T (yyval.bcls) = get_bool_cls((yyvsp[(2) - (2)].query), BC_MUST); E }
1597
1620
  break;
1598
1621
 
1599
1622
  case 9:
1600
- #line 137 "src/q_parser.y"
1623
+ #line 235 "src/q_parser.y"
1601
1624
  { T (yyval.bcls) = get_bool_cls((yyvsp[(2) - (2)].query), BC_MUST_NOT); E }
1602
1625
  break;
1603
1626
 
1604
1627
  case 10:
1605
- #line 138 "src/q_parser.y"
1628
+ #line 236 "src/q_parser.y"
1606
1629
  { T (yyval.bcls) = get_bool_cls((yyvsp[(1) - (1)].query), BC_SHOULD); E }
1607
1630
  break;
1608
1631
 
1609
1632
  case 12:
1610
- #line 141 "src/q_parser.y"
1633
+ #line 239 "src/q_parser.y"
1611
1634
  { T if ((yyvsp[(1) - (3)].query)) sscanf((yyvsp[(3) - (3)].str),"%f",&((yyvsp[(1) - (3)].query)->boost)); (yyval.query)=(yyvsp[(1) - (3)].query); E }
1612
1635
  break;
1613
1636
 
1614
1637
  case 14:
1615
- #line 144 "src/q_parser.y"
1638
+ #line 242 "src/q_parser.y"
1616
1639
  { T (yyval.query) = bq_new_max(true, qp->max_clauses); E }
1617
1640
  break;
1618
1641
 
1619
1642
  case 15:
1620
- #line 145 "src/q_parser.y"
1643
+ #line 243 "src/q_parser.y"
1621
1644
  { T (yyval.query) = get_bool_q((yyvsp[(2) - (3)].bclss)); E }
1622
1645
  break;
1623
1646
 
1624
1647
  case 20:
1625
- #line 151 "src/q_parser.y"
1648
+ #line 249 "src/q_parser.y"
1626
1649
  { FLDS((yyval.query), get_term_q(qp, field, (yyvsp[(1) - (1)].str))); Y}
1627
1650
  break;
1628
1651
 
1629
1652
  case 21:
1630
- #line 152 "src/q_parser.y"
1653
+ #line 250 "src/q_parser.y"
1631
1654
  { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[(1) - (3)].str), (yyvsp[(3) - (3)].str))); Y}
1632
1655
  break;
1633
1656
 
1634
1657
  case 22:
1635
- #line 153 "src/q_parser.y"
1658
+ #line 251 "src/q_parser.y"
1636
1659
  { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[(1) - (2)].str), NULL)); Y}
1637
1660
  break;
1638
1661
 
1639
1662
  case 23:
1640
- #line 155 "src/q_parser.y"
1663
+ #line 253 "src/q_parser.y"
1641
1664
  { FLDS((yyval.query), get_wild_q(qp, field, (yyvsp[(1) - (1)].str))); Y}
1642
1665
  break;
1643
1666
 
1644
1667
  case 24:
1645
- #line 157 "src/q_parser.y"
1646
- { qp->fields = qp->def_fields; }
1668
+ #line 255 "src/q_parser.y"
1669
+ { qp_pop_fields(qp); }
1647
1670
  break;
1648
1671
 
1649
1672
  case 25:
1650
- #line 158 "src/q_parser.y"
1673
+ #line 256 "src/q_parser.y"
1651
1674
  { (yyval.query) = (yyvsp[(3) - (4)].query); }
1652
1675
  break;
1653
1676
 
1654
1677
  case 26:
1655
- #line 159 "src/q_parser.y"
1656
- { qp->fields = qp->all_fields; }
1678
+ #line 257 "src/q_parser.y"
1679
+ { qp_push_fields(qp, qp->all_fields, false); }
1657
1680
  break;
1658
1681
 
1659
1682
  case 27:
1660
- #line 159 "src/q_parser.y"
1661
- {qp->fields = qp->def_fields;}
1683
+ #line 257 "src/q_parser.y"
1684
+ { qp_pop_fields(qp); }
1662
1685
  break;
1663
1686
 
1664
1687
  case 28:
1665
- #line 160 "src/q_parser.y"
1688
+ #line 258 "src/q_parser.y"
1666
1689
  { (yyval.query) = (yyvsp[(4) - (5)].query); }
1667
1690
  break;
1668
1691
 
1669
1692
  case 29:
1670
- #line 162 "src/q_parser.y"
1693
+ #line 260 "src/q_parser.y"
1671
1694
  { (yyval.hashset) = first_field(qp, (yyvsp[(1) - (1)].str)); }
1672
1695
  break;
1673
1696
 
1674
1697
  case 30:
1675
- #line 163 "src/q_parser.y"
1698
+ #line 261 "src/q_parser.y"
1676
1699
  { (yyval.hashset) = add_field(qp, (yyvsp[(3) - (3)].str));}
1677
1700
  break;
1678
1701
 
1679
1702
  case 31:
1680
- #line 165 "src/q_parser.y"
1703
+ #line 263 "src/q_parser.y"
1681
1704
  { (yyval.query) = get_phrase_q(qp, (yyvsp[(2) - (3)].phrase), NULL); }
1682
1705
  break;
1683
1706
 
1684
1707
  case 32:
1685
- #line 166 "src/q_parser.y"
1708
+ #line 264 "src/q_parser.y"
1686
1709
  { (yyval.query) = get_phrase_q(qp, (yyvsp[(2) - (5)].phrase), (yyvsp[(5) - (5)].str)); }
1687
1710
  break;
1688
1711
 
1689
1712
  case 33:
1690
- #line 167 "src/q_parser.y"
1713
+ #line 265 "src/q_parser.y"
1691
1714
  { (yyval.query) = NULL; }
1692
1715
  break;
1693
1716
 
1694
1717
  case 34:
1695
- #line 168 "src/q_parser.y"
1718
+ #line 266 "src/q_parser.y"
1696
1719
  { (yyval.query) = NULL; (void)(yyvsp[(4) - (4)].str);}
1697
1720
  break;
1698
1721
 
1699
1722
  case 35:
1700
- #line 170 "src/q_parser.y"
1723
+ #line 268 "src/q_parser.y"
1701
1724
  { (yyval.phrase) = ph_first_word((yyvsp[(1) - (1)].str)); }
1702
1725
  break;
1703
1726
 
1704
1727
  case 36:
1705
- #line 171 "src/q_parser.y"
1728
+ #line 269 "src/q_parser.y"
1706
1729
  { (yyval.phrase) = ph_first_word(NULL); }
1707
1730
  break;
1708
1731
 
1709
1732
  case 37:
1710
- #line 172 "src/q_parser.y"
1733
+ #line 270 "src/q_parser.y"
1711
1734
  { (yyval.phrase) = ph_add_word((yyvsp[(1) - (2)].phrase), (yyvsp[(2) - (2)].str)); }
1712
1735
  break;
1713
1736
 
1714
1737
  case 38:
1715
- #line 173 "src/q_parser.y"
1738
+ #line 271 "src/q_parser.y"
1716
1739
  { (yyval.phrase) = ph_add_word((yyvsp[(1) - (3)].phrase), NULL); }
1717
1740
  break;
1718
1741
 
1719
1742
  case 39:
1720
- #line 174 "src/q_parser.y"
1743
+ #line 272 "src/q_parser.y"
1721
1744
  { (yyval.phrase) = ph_add_multi_word((yyvsp[(1) - (3)].phrase), (yyvsp[(3) - (3)].str)); }
1722
1745
  break;
1723
1746
 
1724
1747
  case 40:
1725
- #line 176 "src/q_parser.y"
1748
+ #line 274 "src/q_parser.y"
1726
1749
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str), (yyvsp[(3) - (4)].str), true, true)); Y}
1727
1750
  break;
1728
1751
 
1729
1752
  case 41:
1730
- #line 177 "src/q_parser.y"
1753
+ #line 275 "src/q_parser.y"
1731
1754
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str), (yyvsp[(3) - (4)].str), true, false)); Y}
1732
1755
  break;
1733
1756
 
1734
1757
  case 42:
1735
- #line 178 "src/q_parser.y"
1758
+ #line 276 "src/q_parser.y"
1736
1759
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str), (yyvsp[(3) - (4)].str), false, true)); Y}
1737
1760
  break;
1738
1761
 
1739
1762
  case 43:
1740
- #line 179 "src/q_parser.y"
1763
+ #line 277 "src/q_parser.y"
1741
1764
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str), (yyvsp[(3) - (4)].str), false, false)); Y}
1742
1765
  break;
1743
1766
 
1744
1767
  case 44:
1745
- #line 180 "src/q_parser.y"
1768
+ #line 278 "src/q_parser.y"
1746
1769
  { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(2) - (3)].str), false, false)); Y}
1747
1770
  break;
1748
1771
 
1749
1772
  case 45:
1750
- #line 181 "src/q_parser.y"
1773
+ #line 279 "src/q_parser.y"
1751
1774
  { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(2) - (3)].str), false, true)); Y}
1752
1775
  break;
1753
1776
 
1754
1777
  case 46:
1755
- #line 182 "src/q_parser.y"
1778
+ #line 280 "src/q_parser.y"
1756
1779
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (3)].str), NULL,true, false)); Y}
1757
1780
  break;
1758
1781
 
1759
1782
  case 47:
1760
- #line 183 "src/q_parser.y"
1783
+ #line 281 "src/q_parser.y"
1761
1784
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (3)].str), NULL,false, false)); Y}
1762
1785
  break;
1763
1786
 
1764
1787
  case 48:
1765
- #line 184 "src/q_parser.y"
1788
+ #line 282 "src/q_parser.y"
1766
1789
  { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(2) - (2)].str), false, false)); Y}
1767
1790
  break;
1768
1791
 
1769
1792
  case 49:
1770
- #line 185 "src/q_parser.y"
1793
+ #line 283 "src/q_parser.y"
1771
1794
  { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(3) - (3)].str), false, true)); Y}
1772
1795
  break;
1773
1796
 
1774
1797
  case 50:
1775
- #line 186 "src/q_parser.y"
1798
+ #line 284 "src/q_parser.y"
1776
1799
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(3) - (3)].str), NULL,true, false)); Y}
1777
1800
  break;
1778
1801
 
1779
1802
  case 51:
1780
- #line 187 "src/q_parser.y"
1803
+ #line 285 "src/q_parser.y"
1781
1804
  { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (2)].str), NULL,false, false)); Y}
1782
1805
  break;
1783
1806
 
1784
1807
 
1785
1808
  /* Line 1267 of yacc.c. */
1786
- #line 1787 "y.tab.c"
1809
+ #line 1810 "src/q_parser.c"
1787
1810
  default: break;
1788
1811
  }
1789
1812
  YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
@@ -1997,12 +2020,22 @@ yyreturn:
1997
2020
  }
1998
2021
 
1999
2022
 
2000
- #line 189 "src/q_parser.y"
2023
+ #line 287 "src/q_parser.y"
2001
2024
 
2002
2025
 
2003
- const char *special_char = "&:()[]{}!\"~^|<>=*?+-";
2004
- const char *not_word = " \t()[]{}!\"~^|<>=";
2026
+ static const char *special_char = "&:()[]{}!\"~^|<>=*?+-";
2027
+ static const char *not_word = " \t()[]{}!\"~^|<>=";
2005
2028
 
2029
+ /**
2030
+ * +get_word+ gets the next query-word from the query string. A query-word is
2031
+ * basically a string of non-special or escaped special characters. It is
2032
+ * Analyzer agnostic. It is up to the get_*_q methods to tokenize the word and
2033
+ * turn it into a +Query+. See the documentation for each get_*_q method to
2034
+ * see how it handles tokenization.
2035
+ *
2036
+ * Note that +get_word+ is also responsible for returning field names and
2037
+ * matching the special tokens 'AND', 'NOT', 'REQ' and 'OR'.
2038
+ */
2006
2039
  static int get_word(YYSTYPE *lvalp, QParser *qp)
2007
2040
  {
2008
2041
  bool is_wild = false;
@@ -2046,8 +2079,10 @@ static int get_word(YYSTYPE *lvalp, QParser *qp)
2046
2079
  default:
2047
2080
  *bufp++ = c;
2048
2081
  }
2049
- /* we've exceeded the static buffer. switch to the dynamic
2050
- one. */
2082
+ /* we've exceeded the static buffer. switch to the dynamic one. The
2083
+ * dynamic buffer is allocated enough space to hold the whole query
2084
+ * string so its capacity doesn't need to be checked again once
2085
+ * allocated. */
2051
2086
  if (!qp->dynbuf && ((bufp - buf) == MAX_WORD_SIZE)) {
2052
2087
  qp->dynbuf = ALLOC_AND_ZERO_N(char, strlen(qp->qstr) + 1);
2053
2088
  strncpy(qp->dynbuf, buf, MAX_WORD_SIZE);
@@ -2057,8 +2092,8 @@ static int get_word(YYSTYPE *lvalp, QParser *qp)
2057
2092
  }
2058
2093
  get_word_done:
2059
2094
  qp->qstrp--;
2060
- /* check for keywords. There are only four so we have a bit of a hack which
2061
- * just checks for all of them. */
2095
+ /* check for keywords. There are only four so we have a bit of a hack
2096
+ * which just checks for all of them. */
2062
2097
  *bufp = '\0';
2063
2098
  len = (int)(bufp - buf);
2064
2099
  if (qp->use_keywords) {
@@ -2078,6 +2113,33 @@ get_word_done:
2078
2113
  return QWRD;
2079
2114
  }
2080
2115
 
2116
+ /**
2117
+ * +yylex+ is the lexing method called by the QueryParser. It breaks the
2118
+ * query up into special characters;
2119
+ *
2120
+ * ( "&:()[]{}!\"~^|<>=*?+-" )
2121
+ *
2122
+ * and tokens;
2123
+ *
2124
+ * - QWRD
2125
+ * - WILD_STR
2126
+ * - AND['AND', '&&']
2127
+ * - OR['OR', '||']
2128
+ * - REQ['REQ', '+']
2129
+ * - NOT['NOT', '-', '~']
2130
+ *
2131
+ * QWRD tokens are query word tokens which are made up of characters other
2132
+ * than the special characters. They can also contain special characters when
2133
+ * escaped with a backslash '\'. WILD_STR is the same as QWRD except that it
2134
+ * may also contain '?' and '*' characters.
2135
+ *
2136
+ * If any of the special chars are seen they will usually be returned straight
2137
+ * away. The exceptions are the wild chars '*' and '?', and '&' which will be
2138
+ * treated as a plain old word character unless followed by another '&'.
2139
+ *
2140
+ * If no special characters or tokens are found then yylex delegates to
2141
+ * +get_word+ which will fetch the next query-word.
2142
+ */
2081
2143
  static int yylex(YYSTYPE *lvalp, QParser *qp)
2082
2144
  {
2083
2145
  char c, nc;
@@ -2116,6 +2178,11 @@ static int yylex(YYSTYPE *lvalp, QParser *qp)
2116
2178
  return get_word(lvalp, qp);
2117
2179
  }
2118
2180
 
2181
+ /**
2182
+ * yyerror gets called if there is a parse error with the yacc parser.
2183
+ * It is responsible for clearing any memory that was allocated during the
2184
+ * parsing process.
2185
+ */
2119
2186
  static int yyerror(QParser *qp, char const *msg)
2120
2187
  {
2121
2188
  qp->destruct = true;
@@ -2131,19 +2198,30 @@ static int yyerror(QParser *qp, char const *msg)
2131
2198
  "couldn't parse query ``%s''. Error message "
2132
2199
  " was %s", buf, (char *)msg);
2133
2200
  }
2201
+ while (qp->fields_top->next != NULL) {
2202
+ qp_pop_fields(qp);
2203
+ }
2134
2204
  return 0;
2135
2205
  }
2136
2206
 
2137
2207
  #define BQ(query) ((BooleanQuery *)(query))
2138
2208
 
2139
- static TokenStream *get_cached_ts(QParser *qp, char *field, char *text)
2209
+ /**
2210
+ * The QueryParser caches a tokenizer for each field so that it doesn't need
2211
+ * to allocate a new tokenizer for each term in the query. This would be quite
2212
+ * expensive as tokenizers use quite a large hunk of memory.
2213
+ *
2214
+ * This method returns the tokenizer for a particular field and sets it up
2215
+ * with the text to be tokenized.
2216
+ */
2217
+ static TokenStream *get_cached_ts(QParser *qp, Symbol field, char *text)
2140
2218
  {
2141
2219
  TokenStream *ts;
2142
- if (!qp->tokenized_fields || hs_exists(qp->tokenized_fields, field)) {
2143
- ts = h_get(qp->ts_cache, field);
2220
+ if (hs_exists(qp->tokenized_fields, field)) {
2221
+ ts = (TokenStream *)h_get(qp->ts_cache, field);
2144
2222
  if (!ts) {
2145
2223
  ts = a_get_ts(qp->analyzer, field, text);
2146
- h_set(qp->ts_cache, estrdup(field), ts);
2224
+ h_set(qp->ts_cache, field, ts);
2147
2225
  }
2148
2226
  else {
2149
2227
  ts->reset(ts, text);
@@ -2156,16 +2234,11 @@ static TokenStream *get_cached_ts(QParser *qp, char *field, char *text)
2156
2234
  return ts;
2157
2235
  }
2158
2236
 
2159
- static char *get_cached_field(HashTable *field_cache, const char *field)
2160
- {
2161
- char *cached_field = h_get(field_cache, field);
2162
- if (!cached_field) {
2163
- cached_field = estrdup(field);
2164
- h_set(field_cache, cached_field, cached_field);
2165
- }
2166
- return cached_field;
2167
- }
2168
-
2237
+ /**
2238
+ * Turns a BooleanClause array into a BooleanQuery. It will optimize the query
2239
+ * if 0 or 1 clauses are present to NULL or the actual query in the clause
2240
+ * respectively.
2241
+ */
2169
2242
  static Query *get_bool_q(BCArray *bca)
2170
2243
  {
2171
2244
  Query *q;
@@ -2201,6 +2274,10 @@ static Query *get_bool_q(BCArray *bca)
2201
2274
  return q;
2202
2275
  }
2203
2276
 
2277
+ /**
2278
+ * Base method for appending BooleanClauses to a BooleanClause array. This
2279
+ * method doesn't care about the type of clause (MUST, SHOULD, MUST_NOT).
2280
+ */
2204
2281
  static void bca_add_clause(BCArray *bca, BooleanClause *clause)
2205
2282
  {
2206
2283
  if (bca->size >= bca->capa) {
@@ -2211,6 +2288,10 @@ static void bca_add_clause(BCArray *bca, BooleanClause *clause)
2211
2288
  bca->size++;
2212
2289
  }
2213
2290
 
2291
+ /**
2292
+ * Add the first clause to a BooleanClause array. This method is also
2293
+ * responsible for allocating a new BooleanClause array.
2294
+ */
2214
2295
  static BCArray *first_cls(BooleanClause *clause)
2215
2296
  {
2216
2297
  BCArray *bca = ALLOC_AND_ZERO(BCArray);
@@ -2222,6 +2303,12 @@ static BCArray *first_cls(BooleanClause *clause)
2222
2303
  return bca;
2223
2304
  }
2224
2305
 
2306
+ /**
2307
+ * Add AND clause to the BooleanClause array. This means that it will set the
2308
+ * clause being added and the previously added clause from SHOULD clauses to
2309
+ * MUST clauses. (If they are currently MUST_NOT clauses they stay as they
2310
+ * are.)
2311
+ */
2225
2312
  static BCArray *add_and_cls(BCArray *bca, BooleanClause *clause)
2226
2313
  {
2227
2314
  if (clause) {
@@ -2238,6 +2325,9 @@ static BCArray *add_and_cls(BCArray *bca, BooleanClause *clause)
2238
2325
  return bca;
2239
2326
  }
2240
2327
 
2328
+ /**
2329
+ * Add SHOULD clause to the BooleanClause array.
2330
+ */
2241
2331
  static BCArray *add_or_cls(BCArray *bca, BooleanClause *clause)
2242
2332
  {
2243
2333
  if (clause) {
@@ -2246,6 +2336,10 @@ static BCArray *add_or_cls(BCArray *bca, BooleanClause *clause)
2246
2336
  return bca;
2247
2337
  }
2248
2338
 
2339
+ /**
2340
+ * Add AND or OR clause to the BooleanClause array, depending on the default
2341
+ * clause type.
2342
+ */
2249
2343
  static BCArray *add_default_cls(QParser *qp, BCArray *bca,
2250
2344
  BooleanClause *clause)
2251
2345
  {
@@ -2258,6 +2352,9 @@ static BCArray *add_default_cls(QParser *qp, BCArray *bca,
2258
2352
  return bca;
2259
2353
  }
2260
2354
 
2355
+ /**
2356
+ * destroy array of BooleanClauses
2357
+ */
2261
2358
  static void bca_destroy(BCArray *bca)
2262
2359
  {
2263
2360
  int i;
@@ -2268,7 +2365,10 @@ static void bca_destroy(BCArray *bca)
2268
2365
  free(bca);
2269
2366
  }
2270
2367
 
2271
- static BooleanClause *get_bool_cls(Query *q, unsigned int occur)
2368
+ /**
2369
+ * Turn a query into a BooleanClause for addition to a BooleanQuery.
2370
+ */
2371
+ static BooleanClause *get_bool_cls(Query *q, BCType occur)
2272
2372
  {
2273
2373
  if (q) {
2274
2374
  return bc_new(q, occur);
@@ -2278,7 +2378,15 @@ static BooleanClause *get_bool_cls(Query *q, unsigned int occur)
2278
2378
  }
2279
2379
  }
2280
2380
 
2281
- static Query *get_term_q(QParser *qp, char *field, char *word)
2381
+ /**
2382
+ * Create a TermQuery. The word will be tokenized and if the tokenization
2383
+ * produces more than one token, a PhraseQuery will be returned. For example,
2384
+ * if the word is dbalmain@gmail.com and a LetterTokenizer is used then a
2385
+ * PhraseQuery "dbalmain gmail com" will be returned which is actually exactly
2386
+ * what we want as it will match any documents containing the same email
2387
+ * address and tokenized with the same tokenizer.
2388
+ */
2389
+ static Query *get_term_q(QParser *qp, Symbol field, char *word)
2282
2390
  {
2283
2391
  Query *q;
2284
2392
  Token *token;
@@ -2311,7 +2419,13 @@ static Query *get_term_q(QParser *qp, char *field, char *word)
2311
2419
  return q;
2312
2420
  }
2313
2421
 
2314
- static Query *get_fuzzy_q(QParser *qp, char *field, char *word, char *slop_str)
2422
+ /**
2423
+ * Create a FuzzyQuery. The word will be tokenized and only the first token
2424
+ * will be used. If there are any more tokens after tokenization, they will be
2425
+ * ignored.
2426
+ */
2427
+ static Query *get_fuzzy_q(QParser *qp, Symbol field, char *word,
2428
+ char *slop_str)
2315
2429
  {
2316
2430
  Query *q;
2317
2431
  Token *token;
@@ -2332,6 +2446,10 @@ static Query *get_fuzzy_q(QParser *qp, char *field, char *word, char *slop_str)
2332
2446
  return q;
2333
2447
  }
2334
2448
 
2449
+ /**
2450
+ * Downcase a string taking locale into account and works for multibyte
2451
+ * character sets.
2452
+ */
2335
2453
  static char *lower_str(char *str)
2336
2454
  {
2337
2455
  const int max_len = (int)strlen(str) + 1;
@@ -2357,7 +2475,16 @@ static char *lower_str(char *str)
2357
2475
  return str;
2358
2476
  }
2359
2477
 
2360
- static Query *get_wild_q(QParser *qp, char *field, char *pattern)
2478
+ /**
2479
+ * Create a WildCardQuery. No tokenization will be performed on the pattern
2480
+ * but the pattern will be downcased if +qp->wild_lower+ is set to true and
2481
+ * the field in question is a tokenized field.
2482
+ *
2483
+ * Note: this method will not always return a WildCardQuery. It could be
2484
+ * optimized to a MatchAllQuery if the pattern is '*' or a PrefixQuery if the
2485
+ * only wild char (*, ?) in the pattern is a '*' at the end of the pattern.
2486
+ */
2487
+ static Query *get_wild_q(QParser *qp, Symbol field, char *pattern)
2361
2488
  {
2362
2489
  Query *q;
2363
2490
  bool is_prefix = false;
@@ -2398,22 +2525,32 @@ static Query *get_wild_q(QParser *qp, char *field, char *pattern)
2398
2525
  return q;
2399
2526
  }
2400
2527
 
2401
- static HashSet *add_field(QParser *qp, char *field)
2528
+ /**
2529
+ * Adds another field to the top of the FieldStack.
2530
+ */
2531
+ static HashSet *add_field(QParser *qp, const char *field_name)
2402
2532
  {
2533
+ Symbol field = intern(field_name);
2403
2534
  if (qp->allow_any_fields || hs_exists(qp->all_fields, field)) {
2404
- hs_add(qp->fields, get_cached_field(qp->field_cache, field));
2535
+ hs_add(qp->fields, field);
2405
2536
  }
2406
2537
  return qp->fields;
2407
2538
  }
2408
2539
 
2409
- static HashSet *first_field(QParser *qp, char *field)
2540
+ /**
2541
+ * The method gets called when a field modifier ("field1|field2:") is seen. It
2542
+ * will push a new FieldStack object onto the stack and add +field+ to its
2543
+ * fields set.
2544
+ */
2545
+ static HashSet *first_field(QParser *qp, const char *field)
2410
2546
  {
2411
- qp->fields = qp->fields_buf;
2412
- qp->fields->size = 0;
2413
- h_clear(qp->fields->ht);
2547
+ qp_push_fields(qp, hs_new_ptr(NULL), true);
2414
2548
  return add_field(qp, field);
2415
2549
  }
2416
2550
 
2551
+ /**
2552
+ * Destroy a phrase object freeing all allocated memory.
2553
+ */
2417
2554
  static void ph_destroy(Phrase *self)
2418
2555
  {
2419
2556
  int i;
@@ -2425,6 +2562,9 @@ static void ph_destroy(Phrase *self)
2425
2562
  }
2426
2563
 
2427
2564
 
2565
+ /**
2566
+ * Allocate a new Phrase object
2567
+ */
2428
2568
  static Phrase *ph_new()
2429
2569
  {
2430
2570
  Phrase *self = ALLOC_AND_ZERO(Phrase);
@@ -2433,6 +2573,10 @@ static Phrase *ph_new()
2433
2573
  return self;
2434
2574
  }
2435
2575
 
2576
+ /**
2577
+ * Add the first word to the phrase. This method is also in charge of
2578
+ * allocating a new Phrase object.
2579
+ */
2436
2580
  static Phrase *ph_first_word(char *word)
2437
2581
  {
2438
2582
  Phrase *self = ph_new();
@@ -2444,6 +2588,9 @@ static Phrase *ph_first_word(char *word)
2444
2588
  return self;
2445
2589
  }
2446
2590
 
2591
+ /**
2592
+ * Add a new word to the Phrase
2593
+ */
2447
2594
  static Phrase *ph_add_word(Phrase *self, char *word)
2448
2595
  {
2449
2596
  if (word) {
@@ -2466,6 +2613,10 @@ static Phrase *ph_add_word(Phrase *self, char *word)
2466
2613
  return self;
2467
2614
  }
2468
2615
 
2616
+ /**
2617
+ * Adds a word to the Phrase object in the same position as the previous word
2618
+ * added to the Phrase. This will later be turned into a multi-PhraseQuery.
2619
+ */
2469
2620
  static Phrase *ph_add_multi_word(Phrase *self, char *word)
2470
2621
  {
2471
2622
  const int index = self->size - 1;
@@ -2477,7 +2628,35 @@ static Phrase *ph_add_multi_word(Phrase *self, char *word)
2477
2628
  return self;
2478
2629
  }
2479
2630
 
2480
- static Query *get_phrase_query(QParser *qp, char *field,
2631
+ /**
2632
+ * Build a phrase query for a single field. It might seem like a better idea
2633
+ * to build the PhraseQuery once and duplicate it for each field but this
2634
+ * would be buggy in the case of PerFieldAnalyzers in which case a different
2635
+ * tokenizer could be used for each field.
2636
+ *
2637
+ * Note that the query object returned by this method is not always a
2638
+ * PhraseQuery. If there is only one term in the query then the query is
2639
+ * simplified to a TermQuery. If there are multiple terms but only a single
2640
+ * position, then a MultiTermQuery is returned.
2641
+ *
2642
+ * Note that each word in the query gets tokenized. Unlike get_term_q, if the
2643
+ * word gets tokenized into more than one token, the rest of the tokens are
2644
+ * ignored. For example, if you have the phrase;
2645
+ *
2646
+ * "email: dbalmain@gmail.com"
2647
+ *
2648
+ * the Phrase object will contain two positions with the words 'email:' and
2649
+ * 'dbalmain@gmail.com'. Now, if you are using a LetterTokenizer then the
2650
+ * second word will be tokenized into the tokens ['dbalmain', 'gmail', 'com']
2651
+ * and only the first token will be used, so the resulting phrase query will
2652
+ * actually look like this;
2653
+ *
2654
+ * "email dbalmain"
2655
+ *
2656
+ * This problem can easily be solved by using the StandardTokenizer or any
2657
+ * custom tokenizer which will leave dbalmain@gmail.com as a single token.
2658
+ */
2659
+ static Query *get_phrase_query(QParser *qp, Symbol field,
2481
2660
  Phrase *phrase, char *slop_str)
2482
2661
  {
2483
2662
  const int pos_cnt = phrase->size;
@@ -2497,13 +2676,14 @@ static Query *get_phrase_query(QParser *qp, char *field,
2497
2676
 
2498
2677
  for (i = 0; i < word_count; i++) {
2499
2678
  token = ts_next(get_cached_ts(qp, field, words[i]));
2500
- free(words[i]);
2501
2679
  if (token) {
2680
+ free(words[i]);
2502
2681
  last_word = words[i] = estrdup(token->text);
2503
2682
  ++term_cnt;
2504
2683
  }
2505
2684
  else {
2506
- words[i] = estrdup("");
2685
+ /* empty words will later be ignored */
2686
+ words[i][0] = '\0';
2507
2687
  }
2508
2688
  }
2509
2689
 
@@ -2517,6 +2697,7 @@ static Query *get_phrase_query(QParser *qp, char *field,
2517
2697
  default:
2518
2698
  q = multi_tq_new_conf(field, term_cnt, 0.0);
2519
2699
  for (i = 0; i < word_count; i++) {
2700
+ /* ignore empty words */
2520
2701
  if (words[i][0]) {
2521
2702
  multi_tq_add_term(q, words[i]);
2522
2703
  }
@@ -2582,19 +2763,31 @@ static Query *get_phrase_query(QParser *qp, char *field,
2582
2763
  return q;
2583
2764
  }
2584
2765
 
2766
+ /**
2767
+ * Get a phrase query from the Phrase object. The Phrase object is built up by
2768
+ * the query parser as the actual PhraseQuery didn't work well for this. Once the
2769
+ * PhraseQuery has been built the Phrase object needs to be destroyed.
2770
+ */
2585
2771
  static Query *get_phrase_q(QParser *qp, Phrase *phrase, char *slop_str)
2586
2772
  {
2587
- Query *q = NULL;
2773
+ Query *volatile q = NULL;
2588
2774
  FLDS(q, get_phrase_query(qp, field, phrase, slop_str));
2589
2775
  ph_destroy(phrase);
2590
2776
  return q;
2591
2777
  }
2592
2778
 
2593
- static Query *get_r_q(QParser *qp, char *field, char *from, char *to,
2779
+ /**
2780
+ * Gets a RangeQuery object.
2781
+ *
2782
+ * Just like with WildCardQuery, RangeQuery needs to downcase its terms if the
2783
+ * tokenizer also downcased its terms.
2784
+ */
2785
+ static Query *get_r_q(QParser *qp, Symbol field, char *from, char *to,
2594
2786
  bool inc_lower, bool inc_upper)
2595
2787
  {
2596
2788
  Query *rq;
2597
- if (qp->wild_lower) {
2789
+ if (qp->wild_lower
2790
+ && (!qp->tokenized_fields || hs_exists(qp->tokenized_fields, field))) {
2598
2791
  if (from) {
2599
2792
  lower_str(from);
2600
2793
  }
@@ -2603,6 +2796,9 @@ static Query *get_r_q(QParser *qp, char *field, char *from, char *to,
2603
2796
  }
2604
2797
  }
2605
2798
  /*
2799
+ * terms don't get tokenized as it doesn't really make sense to do so for
2800
+ * range queries.
2801
+
2606
2802
  if (from) {
2607
2803
  TokenStream *stream = get_cached_ts(qp, field, from);
2608
2804
  Token *token = ts_next(stream);
@@ -2615,34 +2811,81 @@ static Query *get_r_q(QParser *qp, char *field, char *from, char *to,
2615
2811
  }
2616
2812
  */
2617
2813
 
2618
- rq = rq_new(field, from, to, inc_lower, inc_upper);
2814
+ rq = qp->use_typed_range_query ?
2815
+ trq_new(field, from, to, inc_lower, inc_upper) :
2816
+ rq_new(field, from, to, inc_lower, inc_upper);
2619
2817
  return rq;
2620
2818
  }
2621
2819
 
2622
- void qp_destroy(QParser *self)
2820
+ /**
2821
+ * Every time the query parser sees a new field modifier ("field1|field2:")
2822
+ * it pushes a new FieldStack object onto the stack and sets its fields to the
2823
+ * fields specified in the fields modifier. If the field modifier is '*',
2824
+ * fs->fields is set to all_fields. fs->fields is set to +qp->def_fields+ at
2825
+ * the bottom of the stack (ie the very first set of fields pushed onto the
2826
+ * stack).
2827
+ */
2828
+ static void qp_push_fields(QParser *self, HashSet *fields, bool destroy)
2623
2829
  {
2624
- if (self->close_def_fields) {
2625
- hs_destroy(self->def_fields);
2830
+ FieldStack *fs = ALLOC(FieldStack);
2831
+
2832
+ fs->next = self->fields_top;
2833
+ fs->fields = fields;
2834
+ fs->destroy = destroy;
2835
+
2836
+ self->fields_top = fs;
2837
+ self->fields = fields;
2838
+ }
2839
+
2840
+ /**
2841
+ * Pops the top of the fields stack and frees any memory used by it. This will
2842
+ * get called when a query modified by a field modifier ("field1|field2:") has
2843
+ * been fully parsed and the field specifier no longer applies.
2844
+ */
2845
+ static void qp_pop_fields(QParser *self)
2846
+ {
2847
+ FieldStack *fs = self->fields_top;
2848
+
2849
+ if (fs->destroy) {
2850
+ hs_destroy(fs->fields);
2851
+ }
2852
+ self->fields_top = fs->next;
2853
+ if (self->fields_top) {
2854
+ self->fields = self->fields_top->fields;
2626
2855
  }
2627
- if (self->tokenized_fields) {
2856
+ free(fs);
2857
+ }
2858
+
2859
+ /**
2860
+ * Free all memory allocated by the QueryParser.
2861
+ */
2862
+ void qp_destroy(QParser *self)
2863
+ {
2864
+ if (self->tokenized_fields != self->all_fields) {
2628
2865
  hs_destroy(self->tokenized_fields);
2629
2866
  }
2630
- if (self->dynbuf) {
2631
- free(self->dynbuf);
2867
+ if (self->def_fields != self->all_fields) {
2868
+ hs_destroy(self->def_fields);
2632
2869
  }
2633
2870
  hs_destroy(self->all_fields);
2634
- hs_destroy(self->fields_buf);
2635
- h_destroy(self->field_cache);
2871
+
2872
+ qp_pop_fields(self);
2873
+ assert(NULL == self->fields_top);
2874
+
2636
2875
  h_destroy(self->ts_cache);
2637
2876
  tk_destroy(self->non_tokenizer);
2638
2877
  a_deref(self->analyzer);
2639
2878
  free(self);
2640
2879
  }
2641
2880
 
2642
- QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
2643
- HashSet *tokenized_fields, Analyzer *analyzer)
2881
+ /**
2882
+ * Creates a new QueryParser setting all boolean parameters to their defaults.
2883
+ * If +def_fields+ is NULL then +all_fields+ is used in place of +def_fields+.
2884
+ * Note also that this method ensures that all fields that exist in
2885
+ * +def_fields+ must also exist in +all_fields+. This should make sense.
2886
+ */
2887
+ QParser *qp_new(Analyzer *analyzer)
2644
2888
  {
2645
- int i;
2646
2889
  QParser *self = ALLOC(QParser);
2647
2890
  self->or_default = true;
2648
2891
  self->wild_lower = true;
@@ -2651,48 +2894,66 @@ QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
2651
2894
  self->handle_parse_errors = false;
2652
2895
  self->allow_any_fields = false;
2653
2896
  self->use_keywords = true;
2897
+ self->use_typed_range_query = false;
2654
2898
  self->def_slop = 0;
2655
- self->fields_buf = hs_new_str(NULL);
2656
- self->all_fields = all_fields;
2657
- self->tokenized_fields = tokenized_fields;
2658
- if (def_fields) {
2659
- self->def_fields = def_fields;
2660
- for (i = 0; i < self->def_fields->size; i++) {
2661
- if (!hs_exists(self->all_fields, self->def_fields->elems[i])) {
2662
- hs_add(self->all_fields, estrdup(self->def_fields->elems[i]));
2663
- }
2664
- }
2665
- self->close_def_fields = true;
2666
- }
2667
- else {
2668
- self->def_fields = all_fields;
2669
- self->close_def_fields = false;
2670
- }
2671
- self->field_cache = h_new_str((free_ft)NULL, &free);
2672
- for (i = 0; i < self->all_fields->size; i++) {
2673
- char *field = estrdup(self->all_fields->elems[i]);
2674
- h_set(self->field_cache, field, field);
2675
- }
2676
- self->fields = self->def_fields;
2899
+
2900
+ self->tokenized_fields = hs_new_ptr(NULL);
2901
+ self->all_fields = hs_new_ptr(NULL);
2902
+ self->def_fields = hs_new_ptr(NULL);
2903
+
2904
+ self->fields_top = NULL;
2905
+ qp_push_fields(self, self->def_fields, false);
2906
+
2677
2907
  /* make sure all_fields contains the default fields */
2678
2908
  self->analyzer = analyzer;
2679
- self->ts_cache = h_new_str(&free, (free_ft)&ts_deref);
2909
+ self->ts_cache = h_new_ptr((free_ft)&ts_deref);
2680
2910
  self->buf_index = 0;
2681
- self->dynbuf = 0;
2911
+ self->dynbuf = NULL;
2682
2912
  self->non_tokenizer = non_tokenizer_new();
2683
2913
  mutex_init(&self->mutex, NULL);
2684
2914
  return self;
2685
2915
  }
2686
2916
 
2917
+ void qp_add_field(QParser *self,
2918
+ Symbol field,
2919
+ bool is_default,
2920
+ bool is_tokenized)
2921
+ {
2922
+ hs_add(self->all_fields, field);
2923
+ if (is_default) {
2924
+ hs_add(self->def_fields, field);
2925
+ }
2926
+ if (is_tokenized) {
2927
+ hs_add(self->tokenized_fields, field);
2928
+ }
2929
+ }
2930
+
2687
2931
  /* these chars have meaning within phrases */
2688
2932
  static const char *PHRASE_CHARS = "<>|\"";
2689
2933
 
2690
- static void str_insert(char *str, int len, char chr)
2934
+ /**
2935
+ * +str_insert_char+ inserts a character at the beginning of a string by
2936
+ * shifting the rest of the string right.
2937
+ */
2938
+ static void str_insert_char(char *str, int len, char chr)
2691
2939
  {
2692
2940
  memmove(str+1, str, len*sizeof(char));
2693
2941
  *str = chr;
2694
2942
  }
2695
2943
 
2944
+ /**
2945
+ * +qp_clean_str+ basically scans the query string and ensures that all open
2946
+ * and close parentheses '()' and quotes '"' are balanced. It does this by
2947
+ * inserting or appending extra parentheses or quotes to the string. This
2948
+ * obviously won't necessarily be exactly what the user wanted but we are
2949
+ * never going to know that anyway. The main job of this method is to help the
2950
+ * query at least parse correctly.
2951
+ *
2952
+ * It also checks that all special characters within phrases (ie between
2953
+ * quotes) are escaped correctly unless they have meaning within a phrase
2954
+ * ( <>,|," ). Note that '<' and '>' will also be escaped unless the appear
2955
+ * together like so; '<>'.
2956
+ */
2696
2957
  char *qp_clean_str(char *str)
2697
2958
  {
2698
2959
  int b, pb = -1;
@@ -2711,8 +2972,8 @@ char *qp_clean_str(char *str)
2711
2972
  *nsp++ = '\\'; /* this was left off the first time through */
2712
2973
  }
2713
2974
  *nsp++ = b;
2714
- /* \\ has escaped itself so has no power. Assign pb random char : */
2715
- pb = ((b == '\\') ? ':' : b);
2975
+ /* \ has escaped itself so has no power. Assign pb random char 'r' */
2976
+ pb = ((b == '\\') ? 'r' : b);
2716
2977
  continue;
2717
2978
  }
2718
2979
  switch (b) {
@@ -2737,7 +2998,7 @@ char *qp_clean_str(char *str)
2737
2998
  case ')':
2738
2999
  if (!quote_open) {
2739
3000
  if (br_cnt == 0) {
2740
- str_insert(new_str, (int)(nsp - new_str), '(');
3001
+ str_insert_char(new_str, (int)(nsp - new_str), '(');
2741
3002
  nsp++;
2742
3003
  }
2743
3004
  else {
@@ -2782,18 +3043,35 @@ char *qp_clean_str(char *str)
2782
3043
  return new_str;
2783
3044
  }
2784
3045
 
2785
- Query *qp_get_bad_query(QParser *qp, char *str)
3046
+ /**
3047
+ * Takes a string and finds whatever tokens it can using the QueryParser's
3048
+ * analyzer. It then turns these tokens (if any) into a boolean query. If it
3049
+ * fails to find any tokens, this method will return NULL.
3050
+ */
3051
+ static Query *qp_get_bad_query(QParser *qp, char *str)
2786
3052
  {
2787
3053
  Query *volatile q = NULL;
2788
3054
  qp->recovering = true;
3055
+ assert(qp->fields_top->next == NULL);
2789
3056
  FLDS(q, get_term_q(qp, field, str));
2790
3057
  return q;
2791
3058
  }
2792
3059
 
3060
+ /**
3061
+ * +qp_parse+ takes a string and turns it into a Query object using Ferret's
3062
+ * query language. It must either raise an error or return a query object. It
3063
+ * must not return NULL. If the yacc parser fails it will use a very basic
3064
+ * boolean query parser which takes whatever tokens it can find in the query
3065
+ * and turns them into a boolean query on the default fields.
3066
+ */
2793
3067
  Query *qp_parse(QParser *self, char *qstr)
2794
3068
  {
2795
3069
  Query *result = NULL;
2796
3070
  mutex_lock(&self->mutex);
3071
+ /* if qp->fields_top->next is not NULL we have a left over field-stack
3072
+ * object that was not popped during the last query parse */
3073
+ assert(NULL == self->fields_top->next);
3074
+
2797
3075
  self->recovering = self->destruct = false;
2798
3076
  if (self->clean_str) {
2799
3077
  self->qstrp = self->qstr = qp_clean_str(qstr);