isomorfeus-ferret 0.12.1 → 0.12.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +612 -612
  3. data/README.md +80 -44
  4. data/ext/isomorfeus_ferret_ext/bm_hash.c +9 -6
  5. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +4 -2
  6. data/ext/isomorfeus_ferret_ext/frb_search.c +14 -2
  7. data/ext/isomorfeus_ferret_ext/frb_store.c +34 -5
  8. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/frt_posh.h +11 -19
  10. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +1844 -1911
  11. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +7 -7
  12. data/ext/isomorfeus_ferret_ext/frt_scanner.c +1 -0
  13. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +1 -0
  14. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +1 -0
  15. data/ext/isomorfeus_ferret_ext/frt_search.h +1 -1
  16. data/ext/isomorfeus_ferret_ext/libstemmer.c +14 -11
  17. data/ext/isomorfeus_ferret_ext/libstemmer.h +4 -9
  18. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +1167 -0
  19. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +6 -0
  20. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +1433 -0
  21. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +6 -0
  22. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +120 -143
  23. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +1 -2
  24. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +217 -237
  25. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +1 -1
  26. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +377 -432
  27. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +1 -1
  28. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +298 -342
  29. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +1 -2
  30. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +530 -524
  31. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +1 -1
  32. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +201 -214
  33. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +1 -1
  34. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1 -1
  35. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +394 -0
  36. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +6 -0
  37. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +457 -0
  38. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +6 -0
  39. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +396 -439
  40. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +1 -1
  41. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +104 -128
  42. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +1 -1
  43. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +242 -273
  44. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +1 -1
  45. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +406 -461
  46. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +1 -2
  47. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +405 -456
  48. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +1 -1
  49. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +108 -126
  50. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +1 -1
  51. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +849 -0
  52. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +6 -0
  53. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +373 -405
  54. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +1 -1
  55. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +288 -305
  56. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +1 -1
  57. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.c +1651 -0
  58. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.h +6 -0
  59. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.c +546 -0
  60. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.h +6 -0
  61. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.c +1171 -0
  62. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.h +6 -0
  63. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.c +1436 -0
  64. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.h +6 -0
  65. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +121 -141
  66. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +1 -1
  67. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +221 -241
  68. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +1 -1
  69. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +381 -431
  70. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +1 -1
  71. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +300 -345
  72. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +1 -1
  73. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +518 -511
  74. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +1 -1
  75. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +201 -209
  76. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +1 -1
  77. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.c +3660 -0
  78. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.h +6 -0
  79. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.c +309 -0
  80. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.h +6 -0
  81. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +306 -671
  82. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +1 -1
  83. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.c +394 -0
  84. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.h +6 -0
  85. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.c +457 -0
  86. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.h +6 -0
  87. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +400 -442
  88. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +1 -1
  89. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.c +824 -0
  90. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.h +6 -0
  91. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.c +408 -0
  92. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.h +6 -0
  93. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +105 -127
  94. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +1 -1
  95. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +245 -276
  96. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +1 -1
  97. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +409 -464
  98. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +1 -1
  99. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +376 -408
  100. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +1 -1
  101. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +272 -287
  102. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +1 -1
  103. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.c +6530 -0
  104. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.h +6 -0
  105. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +407 -458
  106. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +1 -1
  107. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +110 -125
  108. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +1 -1
  109. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.c +1865 -0
  110. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.h +6 -0
  111. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +698 -806
  112. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +1 -1
  113. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.c +1220 -0
  114. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.h +6 -0
  115. data/ext/isomorfeus_ferret_ext/stem_api.c +1 -9
  116. data/ext/isomorfeus_ferret_ext/stem_api.h +1 -3
  117. data/ext/isomorfeus_ferret_ext/stem_header.h +30 -26
  118. data/ext/isomorfeus_ferret_ext/stem_modules.h +113 -26
  119. data/ext/isomorfeus_ferret_ext/stem_modules.txt +18 -5
  120. data/ext/isomorfeus_ferret_ext/stem_utilities.c +167 -132
  121. data/ext/isomorfeus_ferret_ext/test.c +7 -1
  122. data/ext/isomorfeus_ferret_ext/test_search.c +0 -1
  123. data/lib/isomorfeus/ferret/index/index.rb +1 -1
  124. data/lib/isomorfeus/ferret/version.rb +1 -1
  125. metadata +43 -7
  126. data/ext/isomorfeus_ferret_ext/q_parser.y +0 -1366
@@ -1,1366 +0,0 @@
1
- /*****************************************************************************
2
- * QueryParser
3
- * ===========
4
- *
5
- * Brief Overview
6
- * --------------
7
- *
8
- * === Creating a QueryParser
9
- *
10
- * +qp_new+ allocates a new QueryParser and assigns three very important
11
- * HashSets; +qp->def_fields+, +qp->tkz_fields+ and +qp->all_fields+. The
12
- * query language allows you to assign a field or a set of fields to each
13
- * part of the query.
14
- *
15
- * - +qp->def_fields+ is the set of fields that a query is applied to by
16
- * default when no fields are specified.
17
- * - +qp->all_fields+ is the set of fields that gets searched when the user
18
- * requests a search of all fields.
19
- * - +qp->tkz_fields+ is the set of fields that gets tokenized before being
20
- * added to the query parser.
21
- *
22
- * === qp_parse
23
- *
24
- * The main QueryParser method is +qp_parse+. It gets called with the query
25
- * string and returns a Query object which can then be passed to the
26
- * IndexSearcher. The first thing it does is to clean the query string if
27
- * +qp->clean_str+ is set to true. The cleaning is done with the
28
- * +qp_clean_str+.
29
- *
30
- * It then calls the yacc parser which will set +qp->result+ to the parsed
31
- * query. If parsing fails in any way, +qp->result+ should be set to NULL, in
32
- * which case qp_parse does one of two things depending on the value of
33
- * +qp->handle_parse_errors+;
34
- *
35
- * - If it is set to true, qp_parse attempts to do a very basic parsing of
36
- * the query by ignoring all special characters and parsing the query as
37
- * a plain boolean query.
38
- * - If it is set to false, qp_parse will raise a PARSE_ERROR and hopefully
39
- * free all allocated memory.
40
- *
41
- * === The Lexer
42
- *
43
- * +yylex+ is the lexing method called by the QueryParser. It breaks the
44
- * query up into special characters;
45
- *
46
- * ( "&:()[]{}!\"~^|<>=*?+-" )
47
- *
48
- * and tokens;
49
- *
50
- * - QWRD
51
- * - WILD_STR
52
- * - AND['AND', '&&']
53
- * - OR['OR', '||']
54
- * - REQ['REQ', '+']
55
- * - NOT['NOT', '-', '!']
56
- *
57
- * QWRD tokens are query word tokens which are made up of characters other
58
- * than the special characters. They can also contain special characters when
59
- * escaped with a backslash '\'. WILD_STR is the same as QWRD except that it
60
- * may also contain '?' and '*' characters.
61
- *
62
- * === The Parser
63
- *
64
- * For a better understanding of how the query parser works, it is a good
65
- * idea to study the Ferret Query Language (FQL) described below. Once you
66
- * understand FQL the one tricky part that needs to be mentioned is how
67
- * fields are handled. This is where +qp->def_fields+ and +qp->all_fields+
68
- * come into play. When no fields are specified then the default fields are
69
- * used. The '*:' field specifier will search all fields contained in the
70
- * all_fields set. Otherwise all fields specified in the field descriptor
71
- * separated by '|' will be searched. For example 'title|content:' will
72
- * search the title and content fields. When fields are specified like this,
73
- * the parser will push the fields onto a stack and all queries modified by
74
- * the field specifier will be applied to the fields on top of the stack.
75
- * The parser uses the FLDS macro to handle the current fields. It takes the
76
- * current query building function in the parser and calls it for all the
77
- * current search fields (on top of the stack).
78
- *
79
- * Ferret Query Language (FQL)
80
- * ---------------------------
81
- *
82
- * FIXME to be continued...
83
- *****************************************************************************/
84
- %{
85
- #include <string.h>
86
- #include <ctype.h>
87
- #include <wctype.h>
88
- #include <assert.h>
89
- #include "frt_global.h"
90
- #include "frt_except.h"
91
- #include "frt_search.h"
92
- #include "frt_array.h"
93
- #include "frt_internal.h"
94
-
95
- typedef struct Phrase {
96
- int size;
97
- int capa;
98
- int pos_inc;
99
- FrtPhrasePosition *positions;
100
- } Phrase;
101
-
102
- #define BCA_INIT_CAPA 4
103
- typedef struct BCArray {
104
- int size;
105
- int capa;
106
- BooleanClause **clauses;
107
- } BCArray;
108
-
109
- float qp_default_fuzzy_min_sim = 0.5;
110
- int qp_default_fuzzy_pre_len = 0;
111
-
112
- %}
113
- %union {
114
- FrtQuery *query;
115
- BooleanClause *bcls;
116
- BCArray *bclss;
117
- FrtHashSet *hashset;
118
- Phrase *phrase;
119
- char *str;
120
- }
121
- %{
122
- static int yylex(YYSTYPE *lvalp, FrtQParser *qp);
123
- static int yyerror(QParser *qp, char const *msg);
124
-
125
- #define PHRASE_INIT_CAPA 4
126
- static FrtQuery *get_bool_q(BCArray *bca);
127
-
128
- static BCArray *first_cls(BooleanClause *boolean_clause);
129
- static BCArray *add_and_cls(BCArray *bca, BooleanClause *clause);
130
- static BCArray *add_or_cls(BCArray *bca, BooleanClause *clause);
131
- static BCArray *add_default_cls(QParser *qp, BCArray *bca,
132
- BooleanClause *clause);
133
- static void bca_destroy(BCArray *bca);
134
-
135
- static BooleanClause *get_bool_cls(FrtQuery *q, BCType occur);
136
-
137
- static FrtQuery *get_term_q(QParser *qp, FrtSymbol field, char *word);
138
- static FrtQuery *get_fuzzy_q(QParser *qp, FrtSymbol field, char *word,
139
- char *slop);
140
- static FrtQuery *get_wild_q(QParser *qp, FrtSymbol field, char *pattern);
141
-
142
- static FrtHashSet *first_field(QParser *qp, const char *field);
143
- static FrtHashSet *add_field(QParser *qp, const char *field);
144
-
145
- static FrtQuery *get_phrase_q(QParser *qp, Phrase *phrase, char *slop);
146
-
147
- static Phrase *ph_first_word(char *word);
148
- static Phrase *ph_add_word(Phrase *self, char *word);
149
- static Phrase *ph_add_multi_word(Phrase *self, char *word);
150
- static void ph_destroy(Phrase *self);
151
-
152
- static FrtQuery *get_r_q(QParser *qp, FrtSymbol field, char *from, char *to,
153
- bool inc_lower, bool inc_upper);
154
-
155
- static void qp_push_fields(QParser *self, FrtHashSet *fields, bool destroy);
156
- static void qp_pop_fields(QParser *self);
157
-
158
- /**
159
- * +FLDS+ calls +func+ for all fields on top of the field stack. +func+
160
- * must return a query. If there is more than one field on top of FieldStack
161
- * then +FLDS+ will combine all the queries returned by +func+ into a single
162
- * BooleanQuery which it then assigns to +q+. If there is only one field, the
163
- * return value of +func+ is assigned to +q+ directly.
164
- */
165
- #define FLDS(q, func) do {\
166
- FRT_TRY {\
167
- FrtSymbol field;\
168
- if (qp->fields->size == 0) {\
169
- q = NULL;\
170
- } else if (qp->fields->size == 1) {\
171
- field = (Symbol)qp->fields->first->elem;\
172
- q = func;\
173
- } else {\
174
- FrtQuery *volatile sq; FrtHashSetEntry *volatile hse;\
175
- q = bq_new_max(false, qp->max_clauses);\
176
- for (hse = qp->fields->first; hse; hse = hse->next) {\
177
- field = (Symbol)hse->elem;\
178
- sq = func;\
179
- FRT_TRY\
180
- if (sq) frt_bq_add_query_nr(q, sq, FRT_BC_SHOULD);\
181
- FRT_XCATCHALL\
182
- if (sq) frt_q_deref(sq);\
183
- FRT_XENDTRY\
184
- }\
185
- if (((FrtBooleanQuery *)q)->clause_cnt == 0) {\
186
- frt_q_deref(q);\
187
- q = NULL;\
188
- }\
189
- }\
190
- } FRT_XCATCHALL\
191
- qp->destruct = true;\
192
- FRT_HANDLED();\
193
- FRT_XENDTRY\
194
- if (qp->destruct && !qp->recovering && q) {q_deref(q); q = NULL;}\
195
- } while (0)
196
-
197
- #define Y if (qp->destruct) goto yyerrorlab;
198
- #define T FRT_TRY
199
- #define E\
200
- FRT_XCATCHALL\
201
- qp->destruct = true;\
202
- FRT_HANDLED();\
203
- FRT_XENDTRY\
204
- if (qp->destruct) Y;
205
- %}
206
- %expect 1
207
- %pure-parser
208
- %parse-param { FrtQParser *qp }
209
- %lex-param { FrtQParser *qp }
210
- %token <str> QWRD WILD_STR
211
- %type <query> q bool_q boosted_q term_q wild_q field_q phrase_q range_q
212
- %type <bcls> bool_cls
213
- %type <bclss> bool_clss
214
- %type <hashset> field
215
- %type <phrase> ph_words
216
- %nonassoc LOW
217
- %left AND OR
218
- %nonassoc REQ NOT
219
- %left ':'
220
- %nonassoc HIGH
221
- %destructor { if ($$ && qp->destruct) frt_q_deref($$); } q bool_q boosted_q term_q wild_q field_q phrase_q range_q
222
- %destructor { if ($$ && qp->destruct) bc_deref($$); } bool_cls
223
- %destructor { if ($$ && qp->destruct) bca_destroy($$); } bool_clss
224
- %destructor { if ($$ && qp->destruct) ph_destroy($$); } ph_words
225
- %%
226
- bool_q : /* Nothing */ { qp->result = $$ = NULL; }
227
- | bool_clss { T qp->result = $$ = get_bool_q($1); E }
228
- ;
229
- bool_clss : bool_cls { T $$ = first_cls($1); E }
230
- | bool_clss AND bool_cls { T $$ = add_and_cls($1, $3); E }
231
- | bool_clss OR bool_cls { T $$ = add_or_cls($1, $3); E }
232
- | bool_clss bool_cls { T $$ = add_default_cls(qp, $1, $2); E }
233
- ;
234
- bool_cls : REQ boosted_q { T $$ = get_bool_cls($2, FRT_BC_MUST); E }
235
- | NOT boosted_q { T $$ = get_bool_cls($2, FRT_BC_MUST_NOT); E }
236
- | boosted_q { T $$ = get_bool_cls($1, FRT_BC_SHOULD); E }
237
- ;
238
- boosted_q : q
239
- | q '^' QWRD { T if ($1) sscanf($3,"%f",&($1->boost)); $$=$1; E }
240
- ;
241
- q : term_q
242
- | '(' ')' { T $$ = bq_new_max(true, qp->max_clauses); E }
243
- | '(' bool_clss ')' { T $$ = get_bool_q($2); E }
244
- | field_q
245
- | phrase_q
246
- | range_q
247
- | wild_q
248
- ;
249
- term_q : QWRD { FLDS($$, get_term_q(qp, field, $1)); Y}
250
- | QWRD '~' QWRD %prec HIGH { FLDS($$, get_fuzzy_q(qp, field, $1, $3)); Y}
251
- | QWRD '~' %prec LOW { FLDS($$, get_fuzzy_q(qp, field, $1, NULL)); Y}
252
- ;
253
- wild_q : WILD_STR { FLDS($$, get_wild_q(qp, field, $1)); Y}
254
- ;
255
- field_q : field ':' q { qp_pop_fields(qp); }
256
- { $$ = $3; }
257
- | '*' { qp_push_fields(qp, qp->all_fields, false); } ':' q { qp_pop_fields(qp); }
258
- { $$ = $4; }
259
- ;
260
- field : QWRD { $$ = first_field(qp, $1); }
261
- | field '|' QWRD { $$ = add_field(qp, $3);}
262
- ;
263
- phrase_q : '"' ph_words '"' { $$ = get_phrase_q(qp, $2, NULL); }
264
- | '"' ph_words '"' '~' QWRD { $$ = get_phrase_q(qp, $2, $5); }
265
- | '"' '"' { $$ = NULL; }
266
- | '"' '"' '~' QWRD { $$ = NULL; (void)$4;}
267
- ;
268
- ph_words : QWRD { $$ = ph_first_word($1); }
269
- | '<' '>' { $$ = ph_first_word(NULL); }
270
- | ph_words QWRD { $$ = ph_add_word($1, $2); }
271
- | ph_words '<' '>' { $$ = ph_add_word($1, NULL); }
272
- | ph_words '|' QWRD { $$ = ph_add_multi_word($1, $3); }
273
- ;
274
- range_q : '[' QWRD QWRD ']' { FLDS($$, get_r_q(qp, field, $2, $3, true, true)); Y}
275
- | '[' QWRD QWRD '}' { FLDS($$, get_r_q(qp, field, $2, $3, true, false)); Y}
276
- | '{' QWRD QWRD ']' { FLDS($$, get_r_q(qp, field, $2, $3, false, true)); Y}
277
- | '{' QWRD QWRD '}' { FLDS($$, get_r_q(qp, field, $2, $3, false, false)); Y}
278
- | '<' QWRD '}' { FLDS($$, get_r_q(qp, field, NULL,$2, false, false)); Y}
279
- | '<' QWRD ']' { FLDS($$, get_r_q(qp, field, NULL,$2, false, true)); Y}
280
- | '[' QWRD '>' { FLDS($$, get_r_q(qp, field, $2, NULL,true, false)); Y}
281
- | '{' QWRD '>' { FLDS($$, get_r_q(qp, field, $2, NULL,false, false)); Y}
282
- | '<' QWRD { FLDS($$, get_r_q(qp, field, NULL,$2, false, false)); Y}
283
- | '<' '=' QWRD { FLDS($$, get_r_q(qp, field, NULL,$3, false, true)); Y}
284
- | '>' '=' QWRD { FLDS($$, get_r_q(qp, field, $3, NULL,true, false)); Y}
285
- | '>' QWRD { FLDS($$, get_r_q(qp, field, $2, NULL,false, false)); Y}
286
- ;
287
- %%
288
-
289
- static const char *special_char = "&:()[]{}!\"~^|<>=*?+-";
290
- static const char *not_word = " \t()[]{}!\"~^|<>=";
291
-
292
- /**
293
- * +get_word+ gets the next query-word from the query string. A query-word is
294
- * basically a string of non-special or escaped special characters. It is
295
- * FrtAnalyzer agnostic. It is up to the get_*_q methods to tokenize the word and
296
- * turn it into a +Query+. See the documentation for each get_*_q method to
297
- * see how it handles tokenization.
298
- *
299
- * Note that +get_word+ is also responsible for returning field names and
300
- * matching the special tokens 'AND', 'NOT', 'REQ' and 'OR'.
301
- */
302
- static int get_word(YYSTYPE *lvalp, FrtQParser *qp)
303
- {
304
- bool is_wild = false;
305
- int len;
306
- char c;
307
- char *buf = qp->buf[qp->buf_index];
308
- char *bufp = buf;
309
- qp->buf_index = (qp->buf_index + 1) % QP_CONC_WORDS;
310
-
311
- if (qp->dynbuf) {
312
- free(qp->dynbuf);
313
- qp->dynbuf = NULL;
314
- }
315
-
316
- qp->qstrp--; /* need to back up one character */
317
-
318
- while (!strchr(not_word, (c = *qp->qstrp++))) {
319
- switch (c) {
320
- case '\\':
321
- if ((c = *qp->qstrp) == '\0') {
322
- *bufp++ = '\\';
323
- }
324
- else {
325
- *bufp++ = c;
326
- qp->qstrp++;
327
- }
328
- break;
329
- case ':':
330
- if ((*qp->qstrp) == ':') {
331
- qp->qstrp++;
332
- *bufp++ = ':';
333
- *bufp++ = ':';
334
- }
335
- else {
336
- goto get_word_done;
337
- }
338
- break;
339
- case '*': case '?':
340
- is_wild = true;
341
- /* fall through */
342
- default:
343
- *bufp++ = c;
344
- }
345
- /* we've exceeded the static buffer. switch to the dynamic one. The
346
- * dynamic buffer is allocated enough space to hold the whole query
347
- * string so it's capacity doesn't need to be checked again once
348
- * allocated. */
349
- if (!qp->dynbuf && ((bufp - buf) == MAX_WORD_SIZE)) {
350
- qp->dynbuf = FRT_ALLOC_AND_ZERO_N(char, strlen(qp->qstr) + 1);
351
- strncpy(qp->dynbuf, buf, MAX_WORD_SIZE);
352
- buf = qp->dynbuf;
353
- bufp = buf + MAX_WORD_SIZE;
354
- }
355
- }
356
- get_word_done:
357
- qp->qstrp--;
358
- /* check for keywords. There are only four so we have a bit of a hack
359
- * which just checks for all of them. */
360
- *bufp = '\0';
361
- len = (int)(bufp - buf);
362
- if (qp->use_keywords) {
363
- if (len == 3) {
364
- if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
365
- if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
366
- if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
367
- }
368
- if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
369
- }
370
-
371
- /* found a word so return it. */
372
- lvalp->str = buf;
373
- if (is_wild) {
374
- return WILD_STR;
375
- }
376
- return QWRD;
377
- }
378
-
379
- /**
380
- * +yylex+ is the lexing method called by the QueryParser. It breaks the
381
- * query up into special characters;
382
- *
383
- * ( "&:()[]{}!\"~^|<>=*?+-" )
384
- *
385
- * and tokens;
386
- *
387
- * - QWRD
388
- * - WILD_STR
389
- * - AND['AND', '&&']
390
- * - OR['OR', '||']
391
- * - REQ['REQ', '+']
392
- * - NOT['NOT', '-', '~']
393
- *
394
- * QWRD tokens are query word tokens which are made up of characters other
395
- * than the special characters. They can also contain special characters when
396
- * escaped with a backslash '\'. WILD_STR is the same as QWRD except that it
397
- * may also contain '?' and '*' characters.
398
- *
399
- * If any of the special chars are seen they will usually be returned straight
400
- * away. The exceptions are the wild chars '*' and '?', and '&' which will be
401
- * treated as a plain old word character unless followed by another '&'.
402
- *
403
- * If no special characters or tokens are found then yylex delegates to
404
- * +get_word+ which will fetch the next query-word.
405
- */
406
- static int yylex(YYSTYPE *lvalp, FrtQParser *qp)
407
- {
408
- char c, nc;
409
-
410
- while ((c=*qp->qstrp++) == ' ' || c == '\t') {
411
- }
412
-
413
- if (c == '\0') return 0;
414
-
415
- if (strchr(special_char, c)) { /* comment */
416
- nc = *qp->qstrp;
417
- switch (c) {
418
- case '-': case '!': return NOT;
419
- case '+': return REQ;
420
- case '*':
421
- if (nc == ':') return c;
422
- break;
423
- case '?':
424
- break;
425
- case '&':
426
- if (nc == '&') {
427
- qp->qstrp++;
428
- return AND;
429
- }
430
- break; /* Don't return single & character. Use in word. */
431
- case '|':
432
- if (nc == '|') {
433
- qp->qstrp++;
434
- return OR;
435
- }
436
- default:
437
- return c;
438
- }
439
- }
440
-
441
- return get_word(lvalp, qp);
442
- }
443
-
444
- /**
445
- * yyerror gets called if there is an parse error with the yacc parser.
446
- * It is responsible for clearing any memory that was allocated during the
447
- * parsing process.
448
- */
449
- static int yyerror(QParser *qp, char const *msg)
450
- {
451
- qp->destruct = true;
452
- if (!qp->handle_parse_errors) {
453
- char buf[1024];
454
- buf[1023] = '\0';
455
- strncpy(buf, qp->qstr, 1023);
456
- if (qp->clean_str) {
457
- free(qp->qstr);
458
- }
459
- frt_mutex_unlock(&qp->mutex);
460
- snprintf(xmsg_buffer, XMSG_BUFFER_SIZE,
461
- "couldn't parse query ``%s''. Error message "
462
- " was %s", buf, (char *)msg);
463
- }
464
- while (qp->fields_top->next != NULL) {
465
- qp_pop_fields(qp);
466
- }
467
- return 0;
468
- }
469
-
470
- #define BQ(query) ((FrtBooleanQuery *)(query))
471
-
472
- /**
473
- * The QueryParser caches a tokenizer for each field so that it doesn't need
474
- * to allocate a new tokenizer for each term in the query. This would be quite
475
- * expensive as tokenizers use quite a large hunk of memory.
476
- *
477
- * This method returns the query parser for a particular field and sets it up
478
- * with the text to be tokenized.
479
- */
480
- static FrtTokenStream *get_cached_ts(QParser *qp, FrtSymbol field, char *text)
481
- {
482
- FrtTokenStream *ts;
483
- if (frt_hs_exists(qp->tokenized_fields, field)) {
484
- ts = (FrtTokenStream *)frt_h_get(qp->ts_cache, field);
485
- if (!ts) {
486
- ts = frt_a_get_ts(qp->analyzer, field, text);
487
- frt_h_set(qp->ts_cache, field, ts);
488
- }
489
- else {
490
- ts->reset(ts, text);
491
- }
492
- }
493
- else {
494
- ts = qp->non_tokenizer;
495
- ts->reset(ts, text);
496
- }
497
- return ts;
498
- }
499
-
500
- /**
501
- * Turns a BooleanClause array into a BooleanQuery. It will optimize the query
502
- * if 0 or 1 clauses are present to NULL or the actual query in the clause
503
- * respectively.
504
- */
505
- static FrtQuery *get_bool_q(BCArray *bca)
506
- {
507
- FrtQuery *q;
508
- const int clause_count = bca->size;
509
-
510
- if (clause_count == 0) {
511
- q = NULL;
512
- free(bca->clauses);
513
- }
514
- else if (clause_count == 1) {
515
- BooleanClause *bc = bca->clauses[0];
516
- if (bc->is_prohibited) {
517
- q = frt_bq_new(false);
518
- frt_bq_add_query_nr(q, bc->query, FRT_BC_MUST_NOT);
519
- frt_bq_add_query_nr(q, frt_maq_new(), FRT_BC_MUST);
520
- }
521
- else {
522
- q = bc->query;
523
- }
524
- free(bc);
525
- free(bca->clauses);
526
- }
527
- else {
528
- q = frt_bq_new(false);
529
- /* copy clauses into query */
530
-
531
- BQ(q)->clause_cnt = clause_count;
532
- BQ(q)->clause_capa = bca->capa;
533
- free(BQ(q)->clauses);
534
- BQ(q)->clauses = bca->clauses;
535
- }
536
- free(bca);
537
- return q;
538
- }
539
-
540
- /**
541
- * Base method for appending BooleanClauses to a BooleanClause array. This
542
- * method doesn't care about the type of clause (MUST, SHOULD, MUST_NOT).
543
- */
544
- static void bca_add_clause(BCArray *bca, BooleanClause *clause)
545
- {
546
- if (bca->size >= bca->capa) {
547
- bca->capa <<= 1;
548
- FRT_REALLOC_N(bca->clauses, BooleanClause *, bca->capa);
549
- }
550
- bca->clauses[bca->size] = clause;
551
- bca->size++;
552
- }
553
-
554
- /**
555
- * Add the first clause to a BooleanClause array. This method is also
556
- * responsible for allocating a new BooleanClause array.
557
- */
558
- static BCArray *first_cls(BooleanClause *clause)
559
- {
560
- BCArray *bca = FRT_ALLOC_AND_ZERO(BCArray);
561
- bca->capa = BCA_INIT_CAPA;
562
- bca->clauses = FRT_ALLOC_N(BooleanClause *, BCA_INIT_CAPA);
563
- if (clause) {
564
- bca_add_clause(bca, clause);
565
- }
566
- return bca;
567
- }
568
-
569
- /**
570
- * Add AND clause to the BooleanClause array. The means that it will set the
571
- * clause being added and the previously added clause from SHOULD clauses to
572
- * MUST clauses. (If they are currently MUST_NOT clauses they stay as they
573
- * are.)
574
- */
575
- static BCArray *add_and_cls(BCArray *bca, BooleanClause *clause)
576
- {
577
- if (clause) {
578
- if (bca->size == 1) {
579
- if (!bca->clauses[0]->is_prohibited) {
580
- bc_set_occur(bca->clauses[0], FRT_BC_MUST);
581
- }
582
- }
583
- if (!clause->is_prohibited) {
584
- bc_set_occur(clause, FRT_BC_MUST);
585
- }
586
- bca_add_clause(bca, clause);
587
- }
588
- return bca;
589
- }
590
-
591
- /**
592
- * Add SHOULD clause to the BooleanClause array.
593
- */
594
- static BCArray *add_or_cls(BCArray *bca, BooleanClause *clause)
595
- {
596
- if (clause) {
597
- bca_add_clause(bca, clause);
598
- }
599
- return bca;
600
- }
601
-
602
- /**
603
- * Add AND or OR clause to the BooleanClause array, depending on the default
604
- * clause type.
605
- */
606
- static BCArray *add_default_cls(QParser *qp, BCArray *bca,
607
- BooleanClause *clause)
608
- {
609
- if (qp->or_default) {
610
- add_or_cls(bca, clause);
611
- }
612
- else {
613
- add_and_cls(bca, clause);
614
- }
615
- return bca;
616
- }
617
-
618
- /**
619
- * destroy array of BooleanClauses
620
- */
621
- static void bca_destroy(BCArray *bca)
622
- {
623
- int i;
624
- for (i = 0; i < bca->size; i++) {
625
- bc_deref(bca->clauses[i]);
626
- }
627
- free(bca->clauses);
628
- free(bca);
629
- }
630
-
631
- /**
632
- * Turn a query into a BooleanClause for addition to a BooleanQuery.
633
- */
634
- static BooleanClause *get_bool_cls(FrtQuery *q, BCType occur)
635
- {
636
- if (q) {
637
- return bc_new(q, occur);
638
- }
639
- else {
640
- return NULL;
641
- }
642
- }
643
-
644
- /**
645
- * Create a TermQuery. The word will be tokenized and if the tokenization
646
- * produces more than one token, a PhraseQuery will be returned. For example,
647
- * if the word is dbalmain@gmail.com and a LetterTokenizer is used then a
648
- * PhraseQuery "dbalmain gmail com" will be returned which is actually exactly
649
- * what we want as it will match any documents containing the same email
650
- * address and tokenized with the same tokenizer.
651
- */
652
- static FrtQuery *get_term_q(QParser *qp, FrtSymbol field, char *word)
653
- {
654
- FrtQuery *q;
655
- FrtToken *token;
656
- FrtTokenStream *stream = get_cached_ts(qp, field, word);
657
-
658
- if ((token = frt_ts_next(stream)) == NULL) {
659
- q = NULL;
660
- }
661
- else {
662
- q = frt_tq_new(field, token->text);
663
- if ((token = frt_ts_next(stream)) != NULL) {
664
- /* Less likely case, destroy the term query and create a
665
- * phrase query instead */
666
- FrtQuery *phq = frt_phq_new(field);
667
- frt_phq_add_term(phq, ((TermQuery *)q)->term, 0);
668
- q->destroy_i(q);
669
- q = phq;
670
- do {
671
- if (token->pos_inc) {
672
- frt_phq_add_term(q, token->text, token->pos_inc);
673
- /* add some slop since single term was expected */
674
- ((FrtPhraseQuery *)q)->slop++;
675
- }
676
- else {
677
- frt_phq_append_multi_term(q, token->text);
678
- }
679
- } while ((token = frt_ts_next(stream)) != NULL);
680
- }
681
- }
682
- return q;
683
- }
684
-
685
- /**
686
- * Create a FuzzyQuery. The word will be tokenized and only the first token
687
- * will be used. If there are any more tokens after tokenization, they will be
688
- * ignored.
689
- */
690
- static FrtQuery *get_fuzzy_q(QParser *qp, FrtSymbol field, char *word,
691
- char *slop_str)
692
- {
693
- FrtQuery *q;
694
- FrtToken *token;
695
- FrtTokenStream *stream = get_cached_ts(qp, field, word);
696
-
697
- if ((token = frt_ts_next(stream)) == NULL) {
698
- q = NULL;
699
- }
700
- else {
701
- /* it only makes sense to find one term in a fuzzy query */
702
- float slop = qp_default_fuzzy_min_sim;
703
- if (slop_str) {
704
- sscanf(slop_str, "%f", &slop);
705
- }
706
- q = frt_fuzq_new_conf(field, token->text, slop, qp_default_fuzzy_pre_len,
707
- qp->max_clauses);
708
- }
709
- return q;
710
- }
711
-
712
- /**
713
- * Downcase a string taking locale into account and works for multibyte
714
- * character sets.
715
- */
716
- static char *lower_str(char *str)
717
- {
718
- const int max_len = (int)strlen(str) + 1;
719
- int cnt;
720
- wchar_t *wstr = FRT_ALLOC_N(wchar_t, max_len);
721
- if ((cnt = mbstowcs(wstr, str, max_len)) > 0) {
722
- wchar_t *w = wstr;
723
- while (*w) {
724
- *w = towlower(*w);
725
- w++;
726
- }
727
- wcstombs(str, wstr, max_len);
728
- }
729
- else {
730
- char *s = str;
731
- while (*s) {
732
- *s = tolower(*s);
733
- s++;
734
- }
735
- }
736
- free(wstr);
737
- str[max_len] = '\0';
738
- return str;
739
- }
740
-
741
- /**
742
- * Create a WildCardQuery. No tokenization will be performed on the pattern
743
- * but the pattern will be downcased if +qp->wild_lower+ is set to true and
744
- * the field in question is a tokenized field.
745
- *
746
- * Note: this method will not always return a WildCardQuery. It could be
747
- * optimized to a MatchAllQuery if the pattern is '*' or a PrefixQuery if the
748
- * only wild char (*, ?) in the pattern is a '*' at the end of the pattern.
749
- */
750
- static FrtQuery *get_wild_q(QParser *qp, FrtSymbol field, char *pattern)
751
- {
752
- FrtQuery *q;
753
- bool is_prefix = false;
754
- char *p;
755
- int len = (int)strlen(pattern);
756
-
757
- if (qp->wild_lower
758
- && (!qp->tokenized_fields || frt_hs_exists(qp->tokenized_fields, field))) {
759
- lower_str(pattern);
760
- }
761
-
762
- /* simplify the wildcard query to a prefix query if possible. Basically a
763
- * prefix query is any wildcard query that has a '*' as the last character
764
- * and no other wildcard characters before it. "*" by itself will expand
765
- * to a MatchAllQuery */
766
- if (strcmp(pattern, "*") == 0) {
767
- return frt_maq_new();
768
- }
769
- if (pattern[len - 1] == '*') {
770
- is_prefix = true;
771
- for (p = &pattern[len - 2]; p >= pattern; p--) {
772
- if (*p == '*' || *p == '?') {
773
- is_prefix = false;
774
- break;
775
- }
776
- }
777
- }
778
- if (is_prefix) {
779
- /* chop off the '*' temporarily to create the query */
780
- pattern[len - 1] = 0;
781
- q = frt_prefixq_new(field, pattern);
782
- pattern[len - 1] = '*';
783
- }
784
- else {
785
- q = frt_wcq_new(field, pattern);
786
- }
787
- MTQMaxTerms(q) = qp->max_clauses;
788
- return q;
789
- }
790
-
791
- /**
792
- * Adds another field to the top of the FieldStack.
793
- */
794
- static FrtHashSet *add_field(QParser *qp, const char *field_name)
795
- {
796
- FrtSymbol field = field_name;
797
- if (qp->allow_any_fields || frt_hs_exists(qp->all_fields, field)) {
798
- frt_hs_add(qp->fields, field);
799
- }
800
- return qp->fields;
801
- }
802
-
803
- /**
804
- * The method gets called when a field modifier ("field1|field2:") is seen. It
805
- * will push a new FieldStack object onto the stack and add +field+ to its
806
- * fields set.
807
- */
808
- static FrtHashSet *first_field(QParser *qp, const char *field)
809
- {
810
- qp_push_fields(qp, frt_hs_new_ptr(NULL), true);
811
- return add_field(qp, field);
812
- }
813
-
814
- /**
815
- * Destroy a phrase object freeing all allocated memory.
816
- */
817
- static void ph_destroy(Phrase *self)
818
- {
819
- int i;
820
- for (i = 0; i < self->size; i++) {
821
- frt_ary_destroy(self->positions[i].terms, &free);
822
- }
823
- free(self->positions);
824
- free(self);
825
- }
826
-
827
-
828
- /**
829
- * Allocate a new Phrase object
830
- */
831
- static Phrase *ph_new()
832
- {
833
- Phrase *self = FRT_ALLOC_AND_ZERO(Phrase);
834
- self->capa = PHRASE_INIT_CAPA;
835
- self->positions = FRT_ALLOC_AND_ZERO_N(PhrasePosition, PHRASE_INIT_CAPA);
836
- return self;
837
- }
838
-
839
- /**
840
- * Add the first word to the phrase. This method is also in charge of
841
- * allocating a new Phrase object.
842
- */
843
- static Phrase *ph_first_word(char *word)
844
- {
845
- Phrase *self = ph_new();
846
- if (word) { /* no point in adding NULL in start */
847
- self->positions[0].terms = frt_ary_new_type_capa(char *, 1);
848
- frt_ary_push(self->positions[0].terms, frt_estrdup(word));
849
- self->size = 1;
850
- }
851
- return self;
852
- }
853
-
854
- /**
855
- * Add a new word to the Phrase
856
- */
857
- static Phrase *ph_add_word(Phrase *self, char *word)
858
- {
859
- if (word) {
860
- const int index = self->size;
861
- FrtPhrasePosition *pp = self->positions;
862
- if (index >= self->capa) {
863
- self->capa <<= 1;
864
- FRT_REALLOC_N(pp, PhrasePosition, self->capa);
865
- self->positions = pp;
866
- }
867
- pp[index].pos = self->pos_inc;
868
- pp[index].terms = frt_ary_new_type_capa(char *, 1);
869
- frt_ary_push(pp[index].terms, frt_estrdup(word));
870
- self->size++;
871
- self->pos_inc = 0;
872
- }
873
- else {
874
- self->pos_inc++;
875
- }
876
- return self;
877
- }
878
-
879
- /**
880
- * Adds a word to the Phrase object in the same position as the previous word
881
- * added to the Phrase. This will later be turned into a multi-PhraseQuery.
882
- */
883
- static Phrase *ph_add_multi_word(Phrase *self, char *word)
884
- {
885
- const int index = self->size - 1;
886
- FrtPhrasePosition *pp = self->positions;
887
-
888
- if (word) {
889
- frt_ary_push(pp[index].terms, frt_estrdup(word));
890
- }
891
- return self;
892
- }
893
-
894
- /**
895
- * Build a phrase query for a single field. It might seem like a better idea
896
- * to build the PhraseQuery once and duplicate it for each field but this
897
- * would be buggy in the case of PerFieldAnalyzers in which case a different
898
- * tokenizer could be used for each field.
899
- *
900
- * Note that the query object returned by this method is not always a
901
- * PhraseQuery. If there is only one term in the query then the query is
902
- * simplified to a TermQuery. If there are multiple terms but only a single
903
- * position, then a MultiTermQuery is retured.
904
- *
905
- * Note that each word in the query gets tokenized. Unlike get_term_q, if the
906
- * word gets tokenized into more than one token, the rest of the tokens are
907
- * ignored. For example, if you have the phrase;
908
- *
909
- * "email: dbalmain@gmail.com"
910
- *
911
- * the Phrase object will contain to positions with the words 'email:' and
912
- * 'dbalmain@gmail.com'. Now, if you are using a LetterTokenizer then the
913
- * second word will be tokenized into the tokens ['dbalmain', 'gmail', 'com']
914
- * and only the first token will be used, so the resulting phrase query will
915
- * actually look like this;
916
- *
917
- * "email dbalmain"
918
- *
919
- * This problem can easily be solved by using the StandardTokenizer or any
920
- * custom tokenizer which will leave dbalmain@gmail.com as a single token.
921
- */
922
- static FrtQuery *get_phrase_query(QParser *qp, FrtSymbol field,
923
- Phrase *phrase, char *slop_str)
924
- {
925
- const int pos_cnt = phrase->size;
926
- FrtQuery *q = NULL;
927
-
928
- if (pos_cnt == 1) {
929
- char **words = phrase->positions[0].terms;
930
- const int word_count = frt_ary_size(words);
931
- if (word_count == 1) {
932
- q = get_term_q(qp, field, words[0]);
933
- }
934
- else {
935
- int i;
936
- int term_cnt = 0;
937
- FrtToken *token;
938
- char *last_word = NULL;
939
-
940
- for (i = 0; i < word_count; i++) {
941
- token = frt_ts_next(get_cached_ts(qp, field, words[i]));
942
- if (token) {
943
- free(words[i]);
944
- last_word = words[i] = frt_estrdup(token->text);
945
- ++term_cnt;
946
- }
947
- else {
948
- /* empty words will later be ignored */
949
- words[i][0] = '\0';
950
- }
951
- }
952
-
953
- switch (term_cnt) {
954
- case 0:
955
- q = frt_bq_new(false);
956
- break;
957
- case 1:
958
- q = frt_tq_new(field, last_word);
959
- break;
960
- default:
961
- q = frt_multi_tq_new_conf(field, term_cnt, 0.0);
962
- for (i = 0; i < word_count; i++) {
963
- /* ignore empty words */
964
- if (words[i][0]) {
965
- frt_multi_tq_add_term(q, words[i]);
966
- }
967
- }
968
- break;
969
- }
970
- }
971
- }
972
- else if (pos_cnt > 1) {
973
- FrtToken *token;
974
- FrtTokenStream *stream;
975
- int i, j;
976
- int pos_inc = 0;
977
- q = frt_phq_new(field);
978
- if (slop_str) {
979
- int slop;
980
- sscanf(slop_str,"%d",&slop);
981
- ((FrtPhraseQuery *)q)->slop = slop;
982
- }
983
-
984
- for (i = 0; i < pos_cnt; i++) {
985
- char **words = phrase->positions[i].terms;
986
- const int word_count = frt_ary_size(words);
987
- if (pos_inc) {
988
- ((FrtPhraseQuery *)q)->slop++;
989
- }
990
- pos_inc += phrase->positions[i].pos + 1; /* Actually holds pos_inc*/
991
-
992
- if (word_count == 1) {
993
- stream = get_cached_ts(qp, field, words[0]);
994
- while ((token = frt_ts_next(stream))) {
995
- if (token->pos_inc) {
996
- frt_phq_add_term(q, token->text,
997
- pos_inc ? pos_inc : token->pos_inc);
998
- }
999
- else {
1000
- frt_phq_append_multi_term(q, token->text);
1001
- ((FrtPhraseQuery *)q)->slop++;
1002
- }
1003
- pos_inc = 0;
1004
- }
1005
- }
1006
- else {
1007
- bool added_position = false;
1008
-
1009
- for (j = 0; j < word_count; j++) {
1010
- stream = get_cached_ts(qp, field, words[j]);
1011
- if ((token = frt_ts_next(stream))) {
1012
- if (!added_position) {
1013
- frt_phq_add_term(q, token->text,
1014
- pos_inc ? pos_inc : token->pos_inc);
1015
- added_position = true;
1016
- pos_inc = 0;
1017
- }
1018
- else {
1019
- frt_phq_append_multi_term(q, token->text);
1020
- }
1021
- }
1022
- }
1023
- }
1024
- }
1025
- }
1026
- return q;
1027
- }
1028
-
1029
- /**
1030
- * Get a phrase query from the Phrase object. The Phrase object is built up by
1031
- * the query parser as the all PhraseQuery didn't work well for this. Once the
1032
- * PhraseQuery has been built the Phrase object needs to be destroyed.
1033
- */
1034
- static FrtQuery *get_phrase_q(QParser *qp, Phrase *phrase, char *slop_str)
1035
- {
1036
- FrtQuery *volatile q = NULL;
1037
- FLDS(q, get_phrase_query(qp, field, phrase, slop_str));
1038
- ph_destroy(phrase);
1039
- return q;
1040
- }
1041
-
1042
- /**
1043
- * Gets a RangeQuery object.
1044
- *
1045
- * Just like with WildCardQuery, RangeQuery needs to downcase its terms if the
1046
- * tokenizer also downcased its terms.
1047
- */
1048
- static FrtQuery *get_r_q(QParser *qp, FrtSymbol field, char *from, char *to,
1049
- bool inc_lower, bool inc_upper)
1050
- {
1051
- FrtQuery *rq;
1052
- if (qp->wild_lower
1053
- && (!qp->tokenized_fields || frt_hs_exists(qp->tokenized_fields, field))) {
1054
- if (from) {
1055
- lower_str(from);
1056
- }
1057
- if (to) {
1058
- lower_str(to);
1059
- }
1060
- }
1061
- /*
1062
- * terms don't get tokenized as it doesn't really make sense to do so for
1063
- * range queries.
1064
-
1065
- if (from) {
1066
- FrtTokenStream *stream = get_cached_ts(qp, field, from);
1067
- FrtToken *token = frt_ts_next(stream);
1068
- from = token ? frt_estrdup(token->text) : NULL;
1069
- }
1070
- if (to) {
1071
- FrtTokenStream *stream = get_cached_ts(qp, field, to);
1072
- FrtToken *token = frt_ts_next(stream);
1073
- to = token ? frt_estrdup(token->text) : NULL;
1074
- }
1075
- */
1076
-
1077
- rq = qp->use_typed_range_query ?
1078
- frt_trq_new(field, from, to, inc_lower, inc_upper) :
1079
- frt_rq_new(field, from, to, inc_lower, inc_upper);
1080
- return rq;
1081
- }
1082
-
1083
- /**
1084
- * Every time the query parser sees a new field modifier ("field1|field2:")
1085
- * it pushes a new FieldStack object onto the stack and sets its fields to the
1086
- * fields specified in the fields modifier. If the field modifier is '*',
1087
- * fs->fields is set to all_fields. fs->fields is set to +qp->def_field+ at
1088
- * the bottom of the stack (ie the very first set of fields pushed onto the
1089
- * stack).
1090
- */
1091
- static void qp_push_fields(QParser *self, FrtHashSet *fields, bool destroy)
1092
- {
1093
- FieldStack *fs = FRT_ALLOC(FieldStack);
1094
-
1095
- fs->next = self->fields_top;
1096
- fs->fields = fields;
1097
- fs->destroy = destroy;
1098
-
1099
- self->fields_top = fs;
1100
- self->fields = fields;
1101
- }
1102
-
1103
- /**
1104
- * Pops the top of the fields stack and frees any memory used by it. This will
1105
- * get called when query modified by a field modifier ("field1|field2:") has
1106
- * been fully parsed and the field specifier no longer applies.
1107
- */
1108
- static void qp_pop_fields(QParser *self)
1109
- {
1110
- FieldStack *fs = self->fields_top;
1111
-
1112
- if (fs->destroy) {
1113
- frt_hs_destroy(fs->fields);
1114
- }
1115
- self->fields_top = fs->next;
1116
- if (self->fields_top) {
1117
- self->fields = self->fields_top->fields;
1118
- }
1119
- free(fs);
1120
- }
1121
-
1122
- /**
1123
- * Free all memory allocated by the QueryParser.
1124
- */
1125
- void frt_qp_destroy(QParser *self)
1126
- {
1127
- if (self->tokenized_fields != self->all_fields) {
1128
- frt_hs_destroy(self->tokenized_fields);
1129
- }
1130
- if (self->def_fields != self->all_fields) {
1131
- frt_hs_destroy(self->def_fields);
1132
- }
1133
- frt_hs_destroy(self->all_fields);
1134
-
1135
- qp_pop_fields(self);
1136
- assert(NULL == self->fields_top);
1137
-
1138
- frt_h_destroy(self->ts_cache);
1139
- frt_tk_destroy(self->non_tokenizer);
1140
- frt_a_deref(self->analyzer);
1141
- free(self);
1142
- }
1143
-
1144
- /**
1145
- * Creates a new QueryParser setting all boolean parameters to their defaults.
1146
- * If +def_fields+ is NULL then +all_fields+ is used in place of +def_fields+.
1147
- * Not also that this method ensures that all fields that exist in
1148
- * +def_fields+ must also exist in +all_fields+. This should make sense.
1149
- */
1150
- QParser *qp_new(FrtAnalyzer *analyzer)
1151
- {
1152
- FrtQParser *self = FRT_ALLOC(QParser);
1153
- self->or_default = true;
1154
- self->wild_lower = true;
1155
- self->clean_str = false;
1156
- self->max_clauses = QP_MAX_CLAUSES;
1157
- self->handle_parse_errors = false;
1158
- self->allow_any_fields = false;
1159
- self->use_keywords = true;
1160
- self->use_typed_range_query = false;
1161
- self->def_slop = 0;
1162
-
1163
- self->tokenized_fields = frt_hs_new_ptr(NULL);
1164
- self->all_fields = frt_hs_new_ptr(NULL);
1165
- self->def_fields = frt_hs_new_ptr(NULL);
1166
-
1167
- self->fields_top = NULL;
1168
- qp_push_fields(self, self->def_fields, false);
1169
-
1170
- /* make sure all_fields contains the default fields */
1171
- self->analyzer = analyzer;
1172
- self->ts_cache = frt_h_new_ptr((frt_free_ft)&ts_deref);
1173
- self->buf_index = 0;
1174
- self->dynbuf = NULL;
1175
- self->non_tokenizer = non_tokenizer_new();
1176
- frt_mutex_init(&self->mutex, NULL);
1177
- return self;
1178
- }
1179
-
1180
- void frt_qp_add_field(QParser *self,
1181
- FrtSymbol field,
1182
- bool is_default,
1183
- bool is_tokenized)
1184
- {
1185
- frt_hs_add(self->all_fields, field);
1186
- if (is_default) {
1187
- frt_hs_add(self->def_fields, field);
1188
- }
1189
- if (is_tokenized) {
1190
- frt_hs_add(self->tokenized_fields, field);
1191
- }
1192
- }
1193
-
1194
/* characters that keep a special meaning inside a quoted phrase */
static const char *PHRASE_CHARS = "<>|\"";
1196
-
1197
/**
 * +str_insert_char+ puts +chr+ in front of the first +len+ bytes of +str+
 * by moving those bytes one place to the right. The buffer must have room
 * for +len+ + 1 bytes; no terminator is written.
 */
static void str_insert_char(char *str, int len, char chr)
{
    char *shifted = str + 1;

    /* source and destination overlap, so memmove rather than memcpy */
    memmove(shifted, str, (size_t)len);
    str[0] = chr;
}
1206
-
1207
- /**
1208
- * +qp_clean_str+ basically scans the query string and ensures that all open
1209
- * and close parentheses '()' and quotes '"' are balanced. It does this by
1210
- * inserting or appending extra parentheses or quotes to the string. This
1211
- * obviously won't necessarily be exactly what the user wanted but we are
1212
- * never going to know that anyway. The main job of this method is to help the
1213
- * query at least parse correctly.
1214
- *
1215
- * It also checks that all special characters within phrases (ie between
1216
- * quotes) are escaped correctly unless they have meaning within a phrase
1217
- * ( <>,|," ). Note that '<' and '>' will also be escaped unless the appear
1218
- * together like so; '<>'.
1219
- */
1220
- char *qp_clean_str(char *str)
1221
- {
1222
- int b, pb = -1;
1223
- int br_cnt = 0;
1224
- bool quote_open = false;
1225
- char *sp, *nsp;
1226
-
1227
- /* leave a little extra */
1228
- char *new_str = FRT_ALLOC_N(char, strlen(str)*2 + 1);
1229
-
1230
- for (sp = str, nsp = new_str; *sp; sp++) {
1231
- b = *sp;
1232
- /* ignore escaped characters */
1233
- if (pb == '\\') {
1234
- if (quote_open && strrchr(PHRASE_CHARS, b)) {
1235
- *nsp++ = '\\'; /* this was left off the first time through */
1236
- }
1237
- *nsp++ = b;
1238
- /* \ has escaped itself so has no power. Assign pb random char 'r' */
1239
- pb = ((b == '\\') ? 'r' : b);
1240
- continue;
1241
- }
1242
- switch (b) {
1243
- case '\\':
1244
- if (!quote_open) { /* We do our own escaping below */
1245
- *nsp++ = b;
1246
- }
1247
- break;
1248
- case '"':
1249
- quote_open = !quote_open;
1250
- *nsp++ = b;
1251
- break;
1252
- case '(':
1253
- if (!quote_open) {
1254
- br_cnt++;
1255
- }
1256
- else {
1257
- *nsp++ = '\\';
1258
- }
1259
- *nsp++ = b;
1260
- break;
1261
- case ')':
1262
- if (!quote_open) {
1263
- if (br_cnt == 0) {
1264
- str_insert_char(new_str, (int)(nsp - new_str), '(');
1265
- nsp++;
1266
- }
1267
- else {
1268
- br_cnt--;
1269
- }
1270
- }
1271
- else {
1272
- *nsp++ = '\\';
1273
- }
1274
- *nsp++ = b;
1275
- break;
1276
- case '>':
1277
- if (quote_open) {
1278
- if (pb == '<') {
1279
- /* remove the escape character */
1280
- nsp--;
1281
- nsp[-1] = '<';
1282
- }
1283
- else {
1284
- *nsp++ = '\\';
1285
- }
1286
- }
1287
- *nsp++ = b;
1288
- break;
1289
- default:
1290
- if (quote_open) {
1291
- if (strrchr(special_char, b) && b != '|') {
1292
- *nsp++ = '\\';
1293
- }
1294
- }
1295
- *nsp++ = b;
1296
- }
1297
- pb = b;
1298
- }
1299
- if (quote_open) {
1300
- *nsp++ = '"';
1301
- }
1302
- for (;br_cnt > 0; br_cnt--) {
1303
- *nsp++ = ')';
1304
- }
1305
- *nsp = '\0';
1306
- return new_str;
1307
- }
1308
-
1309
- /**
1310
- * Takes a string and finds whatever tokens it can using the QueryParser's
1311
- * analyzer. It then turns these tokens (if any) into a boolean query. If it
1312
- * fails to find any tokens, this method will return NULL.
1313
- */
1314
- static FrtQuery *qp_get_bad_query(QParser *qp, char *str)
1315
- {
1316
- FrtQuery *volatile q = NULL;
1317
- qp->recovering = true;
1318
- assert(qp->fields_top->next == NULL);
1319
- FLDS(q, get_term_q(qp, field, str));
1320
- return q;
1321
- }
1322
-
1323
- /**
1324
- * +qp_parse+ takes a string and turns it into a Query object using Ferret's
1325
- * query language. It must either raise an error or return a query object. It
1326
- * must not return NULL. If the yacc parser fails it will use a very basic
1327
- * boolean query parser which takes whatever tokens it can find in the query
1328
- * and terns them into a boolean query on the default fields.
1329
- */
1330
- extern VALUE cQueryParseException;
1331
- Query *qp_parse(QParser *self, char *qstr)
1332
- {
1333
- FrtQuery *result = NULL;
1334
- frt_mutex_lock(&self->mutex);
1335
- /* if qp->fields_top->next is not NULL we have a left over field-stack
1336
- * object that was not popped during the last query parse */
1337
- assert(NULL == self->fields_top->next);
1338
-
1339
- self->recovering = self->destruct = false;
1340
- if (self->clean_str) {
1341
- self->qstrp = self->qstr = frt_qp_clean_str(qstr);
1342
- }
1343
- else {
1344
- self->qstrp = self->qstr = qstr;
1345
- }
1346
- self->fields = self->def_fields;
1347
- self->result = NULL;
1348
-
1349
- if (0 == yyparse(self)) result = self->result;
1350
- if (!result && self->handle_parse_errors) {
1351
- self->destruct = false;
1352
- result = qp_get_bad_query(self, self->qstr);
1353
- }
1354
- if (self->destruct && !self->handle_parse_errors) {
1355
- rb_raise(cQueryParseException, xmsg_buffer);
1356
- }
1357
- if (!result) {
1358
- result = frt_bq_new(false);
1359
- }
1360
- if (self->clean_str) {
1361
- free(self->qstr);
1362
- }
1363
-
1364
- frt_mutex_unlock(&self->mutex);
1365
- return result;
1366
- }