ferret 0.9.6 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_qparser.c
CHANGED
@@ -4,14 +4,16 @@
|
|
4
4
|
static VALUE cQueryParser;
|
5
5
|
VALUE cQueryParseException;
|
6
6
|
|
7
|
-
VALUE
|
8
|
-
VALUE
|
9
|
-
VALUE
|
10
|
-
VALUE
|
11
|
-
VALUE
|
12
|
-
VALUE
|
13
|
-
VALUE
|
14
|
-
|
7
|
+
extern VALUE sym_analyzer;
|
8
|
+
static VALUE sym_wild_card_downcase;
|
9
|
+
static VALUE sym_all_fields;
|
10
|
+
static VALUE sym_default_field;
|
11
|
+
static VALUE sym_validate_fields;
|
12
|
+
static VALUE sym_or_default;
|
13
|
+
static VALUE sym_default_slop;
|
14
|
+
static VALUE sym_handle_parse_errors;
|
15
|
+
static VALUE sym_clean_string;
|
16
|
+
static VALUE sym_max_clauses;
|
15
17
|
|
16
18
|
extern VALUE frt_get_analyzer(Analyzer *a);
|
17
19
|
extern VALUE frt_get_q(Query *q);
|
@@ -26,163 +28,231 @@ extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
|
|
26
28
|
static void
|
27
29
|
frt_qp_free(void *p)
|
28
30
|
{
|
29
|
-
|
30
|
-
|
31
|
-
qp_destroy(qp);
|
31
|
+
object_del(p);
|
32
|
+
qp_destroy((QParser *)p);
|
32
33
|
}
|
33
34
|
|
34
35
|
static void
|
35
36
|
frt_qp_mark(void *p)
|
36
37
|
{
|
37
|
-
|
38
|
-
frt_gc_mark(qp->analyzer);
|
38
|
+
frt_gc_mark(((QParser *)p)->analyzer);
|
39
39
|
}
|
40
40
|
|
41
|
-
HashSet *
|
41
|
+
static HashSet *
|
42
42
|
frt_get_fields(VALUE rfields)
|
43
43
|
{
|
44
|
-
|
45
|
-
|
46
|
-
|
44
|
+
VALUE rval;
|
45
|
+
HashSet *fields = hs_new_str(&free);
|
46
|
+
char *s, *p, *str;
|
47
47
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
} else {
|
55
|
-
rval = rb_obj_as_string(rfields);
|
56
|
-
if (strcmp("*", RSTRING(rval)->ptr) == 0) {
|
57
|
-
hs_destroy(fields);
|
58
|
-
fields = NULL;
|
48
|
+
if (TYPE(rfields) == T_ARRAY) {
|
49
|
+
int i;
|
50
|
+
for (i = 0; i < RARRAY(rfields)->len; i++) {
|
51
|
+
rval = rb_obj_as_string(RARRAY(rfields)->ptr[i]);
|
52
|
+
hs_add(fields, estrdup(RSTRING(rval)->ptr));
|
53
|
+
}
|
59
54
|
} else {
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
55
|
+
rval = rb_obj_as_string(rfields);
|
56
|
+
if (strcmp("*", RSTRING(rval)->ptr) == 0) {
|
57
|
+
hs_destroy(fields);
|
58
|
+
fields = NULL;
|
59
|
+
} else {
|
60
|
+
s = str = estrdup(RSTRING(rval)->ptr);
|
61
|
+
while ((p = strchr(s, '|')) != '\0') {
|
62
|
+
*p = '\0';
|
63
|
+
hs_add(fields, estrdup(s));
|
64
|
+
s = p + 1;
|
65
|
+
}
|
66
|
+
hs_add(fields, estrdup(s));
|
67
|
+
free(str);
|
68
|
+
}
|
68
69
|
}
|
69
|
-
|
70
|
-
return fields;
|
70
|
+
return fields;
|
71
71
|
}
|
72
72
|
|
73
|
+
/*
|
74
|
+
* call-seq:
|
75
|
+
* QueryParser.new(options = {}) -> QueryParser
|
76
|
+
*
|
77
|
+
* Create a new QueryParser. The QueryParser is used to convert string
|
78
|
+
* queries into Query objects. The options are;
|
79
|
+
*
|
80
|
+
* === Options
|
81
|
+
*
|
82
|
+
* :default_field:: Default: "*" (all fields). The default field to
|
83
|
+
* search when no field is specified in the search
|
84
|
+
* string. It can also be an array of fields.
|
85
|
+
* :analyzer:: Default: StandardAnalyzer. Analyzer used by the
|
86
|
+
* query parser to parse query terms
|
87
|
+
* :wild_card_downcase:: Default: true. Specifies whether wild-card queries
|
88
|
+
* should be downcased or not since they are not
|
89
|
+
* passed through the parser
|
90
|
+
* :fields:: Default: []. Lets the query parser know what
|
91
|
+
* fields are available for searching, particularly
|
92
|
+
* when the "*" is specified as the search field
|
93
|
+
* :validate_fields:: Default: false. Set to true if you want an
|
94
|
+
* exception to be raised if there is an attempt to
|
95
|
+
* search a non-existent field
|
96
|
+
* :or_default:: Default: true. Use "OR" as the default boolean
|
97
|
+
* operator
|
98
|
+
* :default_slop:: Default: 0. Default slop to use in PhraseQuery
|
99
|
+
* :handle_parse_errors:: Default: true. QueryParser will quietly handle all
|
100
|
+
* parsing errors internally. If you'd like to handle
|
101
|
+
* them yourself, set this parameter to false.
|
102
|
+
* :clean_string:: Default: true. QueryParser will do a quick
|
103
|
+
* once-over the query string to make sure that quotes
|
104
|
+
* and brackets match up and special characters are
|
105
|
+
* escaped
|
106
|
+
* :max_clauses:: Default: 512. the maximum number of clauses
|
107
|
+
* allowed in boolean queries and the maximum number
|
108
|
+
* of terms allowed in multi, prefix, wild-card or
|
109
|
+
* fuzzy queries when those queries are generated by
|
110
|
+
* rewriting other queries
|
111
|
+
*/
|
73
112
|
static VALUE
|
74
113
|
frt_qp_init(int argc, VALUE *argv, VALUE self)
|
75
114
|
{
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
}
|
101
|
-
|
102
|
-
if (!analyzer) {
|
103
|
-
analyzer = mb_standard_analyzer_create(true);
|
104
|
-
}
|
105
|
-
|
106
|
-
qp = qp_create(all_fields, def_fields, analyzer);
|
107
|
-
qp->allow_any_fields = true;
|
108
|
-
qp->clean_str = true;
|
109
|
-
/* handle options */
|
110
|
-
if (argc == 2) {
|
111
|
-
if (Qnil != (rval = rb_hash_aref(roptions, rhandle_parse_errors_key))) {
|
112
|
-
qp->handle_parse_errors = RTEST(rval);
|
113
|
-
}
|
114
|
-
if (Qnil != (rval = rb_hash_aref(roptions, rallow_any_fields_key))) {
|
115
|
-
qp->allow_any_fields = RTEST(rval);
|
116
|
-
}
|
117
|
-
if (Qnil != (rval = rb_hash_aref(roptions, rwild_lower_key))) {
|
118
|
-
qp->wild_lower = RTEST(rval);
|
115
|
+
VALUE roptions;
|
116
|
+
VALUE rval;
|
117
|
+
Analyzer *analyzer = NULL;
|
118
|
+
bool has_options = false;
|
119
|
+
|
120
|
+
HashSet *all_fields = NULL;
|
121
|
+
HashSet *def_fields = NULL;
|
122
|
+
QParser *qp;
|
123
|
+
|
124
|
+
if (rb_scan_args(argc, argv, "01", &roptions) > 0) {
|
125
|
+
if (TYPE(roptions) == T_HASH) {
|
126
|
+
has_options = true;
|
127
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_default_field))) {
|
128
|
+
def_fields = frt_get_fields(rval);
|
129
|
+
}
|
130
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_analyzer))) {
|
131
|
+
analyzer = frt_get_cwrapped_analyzer(rval);
|
132
|
+
}
|
133
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_all_fields))) {
|
134
|
+
all_fields = frt_get_fields(rval);
|
135
|
+
}
|
136
|
+
} else {
|
137
|
+
def_fields = frt_get_fields(roptions);
|
138
|
+
}
|
119
139
|
}
|
120
|
-
if (
|
121
|
-
|
140
|
+
if (all_fields == NULL) {
|
141
|
+
all_fields = hs_new_str(&free);
|
122
142
|
}
|
123
|
-
|
124
|
-
|
143
|
+
|
144
|
+
if (!analyzer) {
|
145
|
+
analyzer = mb_standard_analyzer_new(true);
|
125
146
|
}
|
126
|
-
|
127
|
-
|
147
|
+
|
148
|
+
qp = qp_new(all_fields, def_fields, analyzer);
|
149
|
+
qp->allow_any_fields = true;
|
150
|
+
qp->clean_str = true;
|
151
|
+
/* handle options */
|
152
|
+
if (argc > 0) {
|
153
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
|
154
|
+
qp->handle_parse_errors = RTEST(rval);
|
155
|
+
}
|
156
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_validate_fields))) {
|
157
|
+
qp->allow_any_fields = !RTEST(rval);
|
158
|
+
}
|
159
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_wild_card_downcase))) {
|
160
|
+
qp->wild_lower = RTEST(rval);
|
161
|
+
}
|
162
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_or_default))) {
|
163
|
+
qp->or_default = (FIX2INT(rval) == BC_MUST) ? false : true;
|
164
|
+
}
|
165
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_default_slop))) {
|
166
|
+
qp->def_slop = FIX2INT(rval);
|
167
|
+
}
|
168
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_clean_string))) {
|
169
|
+
qp->clean_str = RTEST(rval);
|
170
|
+
}
|
171
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
|
172
|
+
qp->max_clauses = FIX2INT(rval);
|
173
|
+
}
|
128
174
|
}
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
return self;
|
175
|
+
Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
|
176
|
+
object_add(qp, self);
|
177
|
+
return self;
|
133
178
|
}
|
134
179
|
|
135
180
|
#define GET_QP QParser *qp = (QParser *)DATA_PTR(self)
|
181
|
+
/*
|
182
|
+
* call-seq:
|
183
|
+
* query_parser.parse(query_string) -> Query
|
184
|
+
*
|
185
|
+
* Parse a query string returning a Query object if parsing was successful.
|
186
|
+
* Will raise a QueryParseException if unsuccessful.
|
187
|
+
*/
|
136
188
|
static VALUE
|
137
189
|
frt_qp_parse(VALUE self, VALUE rstr)
|
138
190
|
{
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
191
|
+
const char *msg = NULL;
|
192
|
+
volatile VALUE rq;
|
193
|
+
GET_QP;
|
194
|
+
rstr = rb_obj_as_string(rstr);
|
195
|
+
TRY
|
196
|
+
rq = frt_get_q(qp_parse(qp, RSTRING(rstr)->ptr));
|
197
|
+
break;
|
198
|
+
default:
|
199
|
+
msg = xcontext.msg;
|
200
|
+
HANDLED();
|
201
|
+
XENDTRY
|
202
|
+
|
203
|
+
if (msg) {
|
204
|
+
rb_raise(cQueryParseException, msg);
|
205
|
+
}
|
206
|
+
|
207
|
+
return rq;
|
156
208
|
}
|
157
209
|
|
210
|
+
/*
|
211
|
+
* call-seq:
|
212
|
+
* query_parser.fields -> Array of Symbols
|
213
|
+
*
|
214
|
+
* Returns the list of all fields that the QueryParser knows about.
|
215
|
+
*/
|
158
216
|
static VALUE
|
159
217
|
frt_qp_get_fields(VALUE self)
|
160
218
|
{
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
219
|
+
GET_QP;
|
220
|
+
int i;
|
221
|
+
HashSet *fields = qp->all_fields;
|
222
|
+
VALUE rfields = rb_ary_new();
|
165
223
|
|
166
|
-
|
167
|
-
|
168
|
-
|
224
|
+
for (i = 0; i < fields->size; i++) {
|
225
|
+
rb_ary_push(rfields, ID2SYM(rb_intern((char *)fields->elems[i])));
|
226
|
+
}
|
169
227
|
|
170
|
-
|
228
|
+
return rfields;
|
171
229
|
}
|
172
230
|
|
231
|
+
/*
|
232
|
+
* call-seq:
|
233
|
+
* query_parser.fields = fields -> self
|
234
|
+
*
|
235
|
+
* Set the list of fields. These fields are expanded for searches on "*".
|
236
|
+
*/
|
173
237
|
static VALUE
|
174
238
|
frt_qp_set_fields(VALUE self, VALUE rfields)
|
175
239
|
{
|
176
|
-
|
177
|
-
|
240
|
+
GET_QP;
|
241
|
+
HashSet *fields = frt_get_fields(rfields);
|
178
242
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
243
|
+
if (qp->def_fields == qp->all_fields) {
|
244
|
+
qp->def_fields = NULL;
|
245
|
+
}
|
246
|
+
if (fields == NULL) {
|
247
|
+
fields = hs_new_str(&free);
|
248
|
+
}
|
249
|
+
hs_destroy(qp->all_fields);
|
250
|
+
qp->all_fields = fields;
|
251
|
+
if (qp->def_fields == NULL) {
|
252
|
+
qp->def_fields = fields;
|
253
|
+
}
|
184
254
|
|
185
|
-
|
255
|
+
return self;
|
186
256
|
}
|
187
257
|
|
188
258
|
/****************************************************************************
|
@@ -191,28 +261,245 @@ frt_qp_set_fields(VALUE self, VALUE rfields)
|
|
191
261
|
*
|
192
262
|
****************************************************************************/
|
193
263
|
|
264
|
+
/* rdoc hack
|
265
|
+
extern VALUE mFerret = rb_define_module("Ferret");
|
266
|
+
extern VALUE cQueryParser = rb_define_module_under(mFerret, "QueryParser");
|
267
|
+
*/
|
268
|
+
|
269
|
+
/*
|
270
|
+
* Document-class: Ferret::QueryParser::QueryParseException
|
271
|
+
*
|
272
|
+
* == Summary
|
273
|
+
*
|
274
|
+
* Exception raised when there is an error parsing the query string passed to
|
275
|
+
* QueryParser.
|
276
|
+
*/
|
277
|
+
void
|
278
|
+
Init_QueryParseException(void)
|
279
|
+
{
|
280
|
+
cQueryParseException = rb_define_class_under(cQueryParser,
|
281
|
+
"QueryParseException",
|
282
|
+
rb_eStandardError);
|
283
|
+
}
|
284
|
+
|
285
|
+
/*
|
286
|
+
* Document-class: Ferret::QueryParser
|
287
|
+
*
|
288
|
+
* == Summary
|
289
|
+
*
|
290
|
+
* The QueryParser is used to transform user submitted query strings into
|
291
|
+
* QueryObjects. Ferret uses its own Query Language, known from now on as
|
292
|
+
* Ferret Query Language or FQL.
|
293
|
+
*
|
294
|
+
* == Ferret Query Language
|
295
|
+
*
|
296
|
+
* === Preamble
|
297
|
+
*
|
298
|
+
* The following characters are special characters in FQL;
|
299
|
+
*
|
300
|
+
* :, (, ), [, ], {, }, !, +, ", ~, ^, -, |, <, >, =, *, ?, \
|
301
|
+
*
|
302
|
+
* If you want to use one of these characters in one of your terms you need
|
303
|
+
* to escape it with a \ character. \ escapes itself. The exception to this
|
304
|
+
* rule is within Phrases which are strings surrounded by double quotes (and
|
305
|
+
* will be explained further below in the section on PhraseQueries). In
|
306
|
+
* Phrases, only ", | and <> have special meaning and need to be escaped if
|
307
|
+
* you want the literal value. <> is escaped \<\>.
|
308
|
+
*
|
309
|
+
* In the following examples I have only written the query string. This would
|
310
|
+
* be parsed like;
|
311
|
+
*
|
312
|
+
* query = query_parser.parse("pet:(dog AND cat)")
|
313
|
+
* puts query # => "+pet:dog +pet:cat"
|
314
|
+
*
|
315
|
+
* === TermQuery
|
316
|
+
*
|
317
|
+
* A term query is the most basic query of all and is what most of the other
|
318
|
+
* queries are built upon. The term consists of a single word. eg;
|
319
|
+
*
|
320
|
+
* 'term'
|
321
|
+
*
|
322
|
+
* Note that the analyzer will be run on the term and if it splits the term
|
323
|
+
* in two then it will be turned into a phrase query. For example, with the
|
324
|
+
* plain Ferret::Analysis::Analyzer, the following;
|
325
|
+
*
|
326
|
+
* 'dave12balmain'
|
327
|
+
*
|
328
|
+
* is equivalent to;
|
329
|
+
*
|
330
|
+
* '"dave balmain"'
|
331
|
+
*
|
332
|
+
* Which we will explain now...
|
333
|
+
*
|
334
|
+
* === PhraseQuery
|
335
|
+
*
|
336
|
+
* A phrase query is a string of terms surrounded by double quotes. For
|
337
|
+
* example you could write;
|
338
|
+
*
|
339
|
+
* '"quick brown fox"'
|
340
|
+
*
|
341
|
+
* But if a "fast" fox is just as good as a quick one you could use the |
|
342
|
+
* character to specify alternate terms.
|
343
|
+
*
|
344
|
+
* '"quick|speedy|fast brown fox"'
|
345
|
+
*
|
346
|
+
* What if we don't care what colour the fox is. We can use the <> to specify
|
347
|
+
* a place setter. eg;
|
348
|
+
*
|
349
|
+
* '"quick|speedy|fast <> fox"'
|
350
|
+
*
|
351
|
+
* This will match any word in between quick and fox. Alternatively we could
|
352
|
+
* set the "slop" for the phrase which allows a certain variation in the
|
353
|
+
* match of the phrase. The slop for a phrase is an integer indicating how
|
354
|
+
* many positions you are allowed to move the terms to get a match. Read more
|
355
|
+
* about the slop factor in Ferret::Search::PhraseQuery. To set the slop
|
356
|
+
* factor for a phrase you can type;
|
357
|
+
*
|
358
|
+
* '"big house"~2'
|
359
|
+
*
|
360
|
+
* This would match "big house", "big red house", "big red brick house" and
|
361
|
+
* even "house big". That's right, you don't need to have the terms in order
|
362
|
+
* if you allow some slop in your phrases. (See Ferret::Search::Spans if you
|
363
|
+
* need a phrase type query with ordered terms.)
|
364
|
+
*
|
365
|
+
* These basic queries will be run on the default field which is set when you
|
366
|
+
* create the query_parser. But what if you want to search a different field.
|
367
|
+
* You'll be needing a ...
|
368
|
+
*
|
369
|
+
* === FieldQuery
|
370
|
+
*
|
371
|
+
* A field query is any field prefixed by <fieldname>:. For example, to
|
372
|
+
* search for all instances of the term "ski" in field "sport", you'd write;
|
373
|
+
*
|
374
|
+
* 'sport:ski'
|
375
|
+
* Or we can apply a field to a phrase;
|
376
|
+
*
|
377
|
+
* 'sport:"skiing is fun"'
|
378
|
+
*
|
379
|
+
* Now we have a few types of queries, we'll be needing to glue them together
|
380
|
+
* with a ...
|
381
|
+
*
|
382
|
+
* === BooleanQuery
|
383
|
+
*
|
384
|
+
* There are a couple of ways of writing boolean queries. Firstly you can
|
385
|
+
* specify which terms are required, optional or required not to exist (not).
|
386
|
+
*
|
387
|
+
* * '+' or "REQ" can be used to indicate a required query. "REQ" must be
|
388
|
+
* surrounded by white space.
|
389
|
+
* * '-', '!' or "NOT" are used to indicate a query that is required to be
|
390
|
+
* false. "NOT" must be surrounded by white space.
|
391
|
+
* * all other queries are optional if the above symbols are used.
|
392
|
+
*
|
393
|
+
* Some examples;
|
394
|
+
*
|
395
|
+
* '+sport:ski -sport:snowboard sport:toboggan'
|
396
|
+
* '+ingredient:chocolate +ingredient:strawberries -ingredient:wheat'
|
397
|
+
*
|
398
|
+
* You may also use the boolean operators "AND", "&&", "OR" and "||". eg;
|
399
|
+
*
|
400
|
+
* 'sport:ski AND NOT sport:snowboard OR sport:toboggan'
|
401
|
+
* 'ingredient:chocolate AND ingredient:strawberries AND NOT ingredient:wheat'
|
402
|
+
*
|
403
|
+
* You can set the default operator when you create the query parser.
|
404
|
+
*
|
405
|
+
* === RangeQuery
|
406
|
+
*
|
407
|
+
* A range query finds all documents with terms between the two query terms.
|
408
|
+
* This can be very useful in particular for dates. eg;
|
409
|
+
*
|
410
|
+
* 'date:[20050725 20050905]' # all dates >= 20050725 and <= 20050905
|
411
|
+
* 'date:[20050725 20050905}' # all dates >= 20050725 and < 20050905
|
412
|
+
* 'date:{20050725 20050905]' # all dates > 20050725 and <= 20050905
|
413
|
+
* 'date:{20050725 20050905}' # all dates > 20050725 and < 20050905
|
414
|
+
*
|
415
|
+
* You can also do open ended queries like this;
|
416
|
+
*
|
417
|
+
* 'date:[20050725>' # all dates >= 20050725
|
418
|
+
* 'date:{20050725>' # all dates > 20050725
|
419
|
+
* 'date:<20050905]' # all dates <= 20050905
|
420
|
+
* 'date:<20050905}' # all dates < 20050905
|
421
|
+
*
|
422
|
+
* Or like this;
|
423
|
+
*
|
424
|
+
* 'date: >= 20050725'
|
425
|
+
* 'date: > 20050725'
|
426
|
+
* 'date: <= 20050905'
|
427
|
+
* 'date: < 20050905'
|
428
|
+
*
|
429
|
+
* If you prefer the above style you could use a boolean query but like this;
|
430
|
+
*
|
431
|
+
* 'date:( >= 20050725 AND <= 20050905)'
|
432
|
+
*
|
433
|
+
* But the RangeQuery-only solution shown first will be faster.
|
434
|
+
*
|
435
|
+
* === WildQuery
|
436
|
+
*
|
437
|
+
* A wild query is a query using the pattern matching characters * and ?. *
|
438
|
+
* matches 0 or more characters while ? matches a single character. This type
|
439
|
+
* of query can be really useful for matching hierarchical categories for
|
440
|
+
* example. Let's say we had this structure;
|
441
|
+
*
|
442
|
+
* /sport/skiing
|
443
|
+
* /sport/cycling
|
444
|
+
* /coding1/ruby
|
445
|
+
* /coding1/c
|
446
|
+
* /coding2/python
|
447
|
+
* /coding2/perl
|
448
|
+
*
|
449
|
+
* If you wanted all categories with programming languages you could use the
|
450
|
+
* query;
|
451
|
+
*
|
452
|
+
* 'category:/coding?/?*'
|
453
|
+
*
|
454
|
+
* Note that this query can be quite expensive if not used carefully. In the
|
455
|
+
* example above there would be no problem but you should be careful not to use
|
456
|
+
* the wild characters at the beginning of the query as it'll have to iterate
|
457
|
+
* through every term in that field. Having said that, some fields like the
|
458
|
+
* category field above will only have a small number of distinct fields so
|
459
|
+
* this could be ok.
|
460
|
+
*
|
461
|
+
* === FuzzyQuery
|
462
|
+
*
|
463
|
+
* This is like the sloppy phrase query above, except you are now adding slop
|
464
|
+
* to a term. Basically it measures the Levenshtein distance between two
|
465
|
+
* terms and if the value is below the slop threshold the term is a match.
|
466
|
+
* This time though the slop must be a float between 0 and 1.0, 1.0 being a
|
467
|
+
* perfect match and 0 being far from a match. The default is set to 0.5 so
|
468
|
+
* you don't need to give a slop value if you don't want to. You can set the
|
469
|
+
* default in the Ferret::Search::FuzzyQuery class. Here are a couple of
|
470
|
+
* examples;
|
471
|
+
*
|
472
|
+
* 'content:ferret~'
|
473
|
+
* 'content:Ostralya~0.4'
|
474
|
+
*
|
475
|
+
* Note that this query can be quite expensive. If you'd like to use this
|
476
|
+
* query, you may want to set a minimum prefix length in the FuzzyQuery
|
477
|
+
* class. This can substantially reduce the number of terms that the query
|
478
|
+
* will iterate over.
|
479
|
+
*
|
480
|
+
*/
|
194
481
|
void
|
195
|
-
|
482
|
+
Init_QueryParser(void)
|
196
483
|
{
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
484
|
+
/* hash keys */
|
485
|
+
sym_wild_card_downcase = ID2SYM(rb_intern("wild_card_downcase"));
|
486
|
+
sym_all_fields = ID2SYM(rb_intern("fields"));
|
487
|
+
sym_default_field = ID2SYM(rb_intern("default_field"));
|
488
|
+
sym_validate_fields = ID2SYM(rb_intern("validate_fields"));
|
489
|
+
sym_or_default = ID2SYM(rb_intern("or_default"));
|
490
|
+
sym_default_slop = ID2SYM(rb_intern("default_slop"));
|
491
|
+
sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
|
492
|
+
sym_clean_string = ID2SYM(rb_intern("clean_string"));
|
493
|
+
sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
|
494
|
+
|
495
|
+
/* QueryParser */
|
496
|
+
cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
|
497
|
+
rb_define_alloc_func(cQueryParser, frt_data_alloc);
|
498
|
+
|
499
|
+
rb_define_method(cQueryParser, "initialize", frt_qp_init, -1);
|
500
|
+
rb_define_method(cQueryParser, "parse", frt_qp_parse, 1);
|
501
|
+
rb_define_method(cQueryParser, "fields", frt_qp_get_fields, 0);
|
502
|
+
rb_define_method(cQueryParser, "fields=", frt_qp_set_fields, 1);
|
503
|
+
|
504
|
+
Init_QueryParseException();
|
218
505
|
}
|