ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/search.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
require 'ferret/search/similarity.rb'
|
2
|
-
require 'ferret/search/boolean_clause.rb'
|
3
|
-
require 'ferret/search/scorer.rb'
|
4
|
-
require 'ferret/search/score_doc.rb'
|
5
|
-
require 'ferret/search/score_doc_comparator.rb'
|
6
|
-
require 'ferret/search/weight.rb'
|
7
|
-
require 'ferret/search/query.rb'
|
8
|
-
require 'ferret/search/term_query.rb'
|
9
|
-
require 'ferret/search/term_scorer.rb'
|
10
|
-
require 'ferret/search/top_docs.rb'
|
11
|
-
require 'ferret/search/boolean_query.rb'
|
12
|
-
require 'ferret/search/conjunction_scorer.rb'
|
13
|
-
require 'ferret/search/disjunction_sum_scorer.rb'
|
14
|
-
require 'ferret/search/multi_term_query.rb'
|
15
|
-
require 'ferret/search/phrase_query.rb'
|
16
|
-
require 'ferret/search/multi_phrase_query.rb'
|
17
|
-
require 'ferret/search/prefix_query.rb'
|
18
|
-
require 'ferret/search/range_query.rb'
|
19
|
-
require 'ferret/search/filtered_term_enum.rb'
|
20
|
-
require 'ferret/search/wildcard_term_enum.rb'
|
21
|
-
require 'ferret/search/wildcard_query.rb'
|
22
|
-
require 'ferret/search/fuzzy_term_enum.rb'
|
23
|
-
require 'ferret/search/fuzzy_query.rb'
|
24
|
-
require 'ferret/search/phrase_positions.rb'
|
25
|
-
require 'ferret/search/phrase_scorer.rb'
|
26
|
-
require 'ferret/search/exact_phrase_scorer.rb'
|
27
|
-
require 'ferret/search/sloppy_phrase_scorer.rb'
|
28
|
-
require 'ferret/search/boolean_scorer.rb'
|
29
|
-
require 'ferret/search/explanation.rb'
|
30
|
-
require 'ferret/search/field_doc.rb'
|
31
|
-
require 'ferret/search/hit_collector.rb'
|
32
|
-
require 'ferret/search/hit_queue.rb'
|
33
|
-
require 'ferret/search/non_matching_scorer.rb'
|
34
|
-
require 'ferret/search/req_excl_scorer.rb'
|
35
|
-
require 'ferret/search/req_opt_sum_scorer.rb'
|
36
|
-
require 'ferret/search/score_doc.rb'
|
37
|
-
require 'ferret/search/score_doc_comparator.rb'
|
38
|
-
require 'ferret/search/sort_field.rb'
|
39
|
-
require 'ferret/search/sort.rb'
|
40
|
-
require 'ferret/search/field_cache.rb'
|
41
|
-
require 'ferret/search/field_sorted_hit_queue.rb'
|
42
|
-
require 'ferret/search/filter.rb'
|
43
|
-
require 'ferret/search/range_filter.rb'
|
44
|
-
require 'ferret/search/query_filter.rb'
|
45
|
-
require 'ferret/search/caching_wrapper_filter.rb'
|
46
|
-
require 'ferret/search/filtered_query.rb'
|
47
|
-
require 'ferret/search/match_all_query.rb'
|
48
|
-
require 'ferret/search/spans.rb'
|
49
|
-
require 'ferret/search/index_searcher.rb'
|
50
|
-
require 'ferret/search/multi_searcher.rb'
|
@@ -1,100 +0,0 @@
|
|
1
|
-
|
2
|
-
module Ferret::Search
|
3
|
-
|
4
|
-
# A clause in a BooleanQuery.
|
5
|
-
class BooleanClause
|
6
|
-
|
7
|
-
class Occur < Ferret::Utils::Parameter
|
8
|
-
|
9
|
-
def to_s()
|
10
|
-
return "+" if (self == MUST)
|
11
|
-
return "-" if (self == MUST_NOT)
|
12
|
-
return ""
|
13
|
-
end
|
14
|
-
|
15
|
-
# Use this operator for terms that _must_ appear in the matching
|
16
|
-
# documents.
|
17
|
-
MUST = Occur.new("MUST")
|
18
|
-
|
19
|
-
# Use this operator for terms that _should_ appear in the matching
|
20
|
-
# documents. For a BooleanQuery with two +SHOULD+ subqueries, at
|
21
|
-
# least one of the queries must appear in the matching documents.
|
22
|
-
SHOULD = Occur.new("SHOULD")
|
23
|
-
|
24
|
-
# Use this operator for terms that _must not_ appear in the matching
|
25
|
-
# documents. Note that it is not possible to search for queries that
|
26
|
-
# only consist of a +MUST_NOT+ query.
|
27
|
-
MUST_NOT = Occur.new("MUST_NOT")
|
28
|
-
end
|
29
|
-
|
30
|
-
# The query whose matching documents are combined by the boolean query.
|
31
|
-
attr_accessor :query
|
32
|
-
|
33
|
-
# If true, documents documents which _do not_ match this sub-query will
|
34
|
-
# _not_ match the boolean query.
|
35
|
-
attr_writer :required
|
36
|
-
def required?
|
37
|
-
@required
|
38
|
-
end
|
39
|
-
|
40
|
-
# If true, documents documents which _do_ match this sub-query will _not_
|
41
|
-
# match the boolean query.
|
42
|
-
attr_writer :prohibited
|
43
|
-
def prohibited?
|
44
|
-
@prohibited
|
45
|
-
end
|
46
|
-
|
47
|
-
# See BooleanQuery::Occur for values for this attribute
|
48
|
-
attr_reader :occur
|
49
|
-
def occur=(occur)
|
50
|
-
@occur = occur
|
51
|
-
set_fields(occur)
|
52
|
-
end
|
53
|
-
|
54
|
-
# Constructs a BooleanClause. Default value for occur is Occur::SHOULD
|
55
|
-
def initialize(query, occur = Occur::SHOULD)
|
56
|
-
@query = query
|
57
|
-
@occur = occur
|
58
|
-
set_fields(occur)
|
59
|
-
end
|
60
|
-
|
61
|
-
|
62
|
-
# Returns true iff +other+ is equal to this.
|
63
|
-
def eql?(other)
|
64
|
-
if not other.instance_of?(BooleanClause)
|
65
|
-
return false
|
66
|
-
end
|
67
|
-
return (@query == other.query and
|
68
|
-
@required == other.required? and
|
69
|
-
@prohibited == other.prohibited?)
|
70
|
-
end
|
71
|
-
alias :== :eql?
|
72
|
-
|
73
|
-
# Returns a hash code value for this object.
|
74
|
-
def hash()
|
75
|
-
return @query.hash() ^ (@required ? 1 : 0) ^ (@prohibited ? 2 : 0)
|
76
|
-
end
|
77
|
-
|
78
|
-
# represent a boolean clause as a string
|
79
|
-
def to_s()
|
80
|
-
return @occur.to_s() + @query.to_s()
|
81
|
-
end
|
82
|
-
|
83
|
-
private
|
84
|
-
|
85
|
-
def set_fields(occur)
|
86
|
-
if (occur == Occur::MUST)
|
87
|
-
@required = true
|
88
|
-
@prohibited = false
|
89
|
-
elsif (occur == Occur::SHOULD)
|
90
|
-
@required = false
|
91
|
-
@prohibited = false
|
92
|
-
elsif (occur == Occur::MUST_NOT)
|
93
|
-
@required = false
|
94
|
-
@prohibited = true
|
95
|
-
else
|
96
|
-
raise ArgumentError, "Unknown operator " + occur
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
@@ -1,299 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# A Query that matches documents matching boolean combinations of other
|
3
|
-
# queries, e.g. TermQuerys, PhraseQuerys or other BooleanQuerys.
|
4
|
-
class BooleanQuery < Query
|
5
|
-
|
6
|
-
# The maximum number of clauses permitted. Default value is 1024.
|
7
|
-
#
|
8
|
-
# TermQuery clauses are generated from for example prefix queries and
|
9
|
-
# fuzzy queries. Each TermQuery needs some buffer space during search,
|
10
|
-
# so this parameter indirectly controls the maximum buffer requirements
|
11
|
-
# for query search.
|
12
|
-
#
|
13
|
-
# When this parameter becomes a bottleneck for a Query one can use a
|
14
|
-
# Filter. For example instead of a RangeQuery one can use a RangeFilter.
|
15
|
-
#
|
16
|
-
# Attempts to add more than the permitted number of clauses cause
|
17
|
-
# TooManyClauses to be raisen.
|
18
|
-
attr_accessor :max_clause_count
|
19
|
-
attr_accessor :clauses
|
20
|
-
DEFAULT_MAX_CLAUSE_COUNT = 1024
|
21
|
-
|
22
|
-
@@max_clause_count = DEFAULT_MAX_CLAUSE_COUNT
|
23
|
-
def BooleanQuery.max_clause_count
|
24
|
-
return @@max_clause_count
|
25
|
-
end
|
26
|
-
def BooleanQuery.max_clause_count=(mcc)
|
27
|
-
@@max_clause_count = mcc
|
28
|
-
end
|
29
|
-
|
30
|
-
# Thrown when an attempt is made to add more than #max_clause_count()
|
31
|
-
# clauses. This typically happens if a PrefixQuery, FuzzyQuery,
|
32
|
-
# WildcardQuery, or RangeQuery is expanded to many terms during search.
|
33
|
-
class TooManyClauses < Exception
|
34
|
-
end
|
35
|
-
|
36
|
-
# Constructs an empty boolean query.
|
37
|
-
#
|
38
|
-
# Similarity#coord(int,int) may be disabled in scoring, as appropriate.
|
39
|
-
# For example, this score factor does not make sense for most automatically
|
40
|
-
# generated queries, like WildcardQuery and FuzzyQuery.
|
41
|
-
#
|
42
|
-
# coord_disabled:: disables Similarity#coord(int,int) in scoring.
|
43
|
-
def initialize(coord_disabled = false)
|
44
|
-
super()
|
45
|
-
@coord_disabled = coord_disabled
|
46
|
-
@clauses = []
|
47
|
-
end
|
48
|
-
|
49
|
-
# Returns true iff Similarity#coord(int,int) is disabled in scoring for
|
50
|
-
# this query instance.
|
51
|
-
# See #BooleanQuery(boolean)
|
52
|
-
def coord_disabled?()
|
53
|
-
return @coord_disabled
|
54
|
-
end
|
55
|
-
|
56
|
-
def similarity(searcher)
|
57
|
-
sim = super
|
58
|
-
if (@coord_disabled) # disable coord as requested
|
59
|
-
class <<sim
|
60
|
-
def coord(overlap, max_overlap)
|
61
|
-
return 1.0
|
62
|
-
end
|
63
|
-
end
|
64
|
-
end
|
65
|
-
return sim
|
66
|
-
end
|
67
|
-
|
68
|
-
# Adds a clause to a boolean query. Clauses may be:
|
69
|
-
#
|
70
|
-
# required:: which means that documents which _do not_ match this
|
71
|
-
# sub-query will _not_ match the boolean query
|
72
|
-
# prohibited:: which means that documents which _do_ match this
|
73
|
-
# sub-query will _not_ match the boolean query; or
|
74
|
-
# neither:: in which case matched documents are neither prohibited
|
75
|
-
# from nor required to match the sub-query. However, a
|
76
|
-
# document must match at least 1 sub-query to match the
|
77
|
-
# boolean query.
|
78
|
-
#
|
79
|
-
# * For +required+ use add(query, BooleanClause::Occur::MUST)
|
80
|
-
# * For +prohibited+ use add(query, BooleanClause::Occur::MUST_NOT)
|
81
|
-
# * For +neither+ use add(query, BooleanClause::Occur::SHOULD)
|
82
|
-
#
|
83
|
-
# raises:: TooManyClauses if the new number of clauses exceeds the
|
84
|
-
# maximum clause number #max_clause_count()
|
85
|
-
def add_query(query, occur=BooleanClause::Occur::SHOULD)
|
86
|
-
add_clause(BooleanClause.new(query, occur))
|
87
|
-
end
|
88
|
-
|
89
|
-
# Adds a clause to a boolean query.
|
90
|
-
# raises:: TooManyClauses if the new number of clauses exceeds the
|
91
|
-
# maximum clause number. See #max_clause_count()
|
92
|
-
def add_clause(clause)
|
93
|
-
if @clauses.size >= @@max_clause_count
|
94
|
-
raise TooManyClauses
|
95
|
-
end
|
96
|
-
|
97
|
-
@clauses << clause
|
98
|
-
self
|
99
|
-
end
|
100
|
-
alias :<< :add_clause
|
101
|
-
|
102
|
-
class BooleanWeight < Weight
|
103
|
-
attr_accessor :similarity
|
104
|
-
attr_accessor :weights
|
105
|
-
attr_reader :query
|
106
|
-
|
107
|
-
def initialize(query, searcher)
|
108
|
-
@query = query
|
109
|
-
@weights = []
|
110
|
-
|
111
|
-
@similarity = query.similarity(searcher)
|
112
|
-
query.clauses.each do |clause|
|
113
|
-
@weights << clause.query.create_weight(searcher)
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
def value()
|
118
|
-
return @query.boost()
|
119
|
-
end
|
120
|
-
|
121
|
-
def sum_of_squared_weights()
|
122
|
-
sum = 0
|
123
|
-
@weights.each_with_index do |weight, i|
|
124
|
-
clause = @query.clauses[i]
|
125
|
-
if not clause.prohibited?
|
126
|
-
sum += weight.sum_of_squared_weights() # sum sub weights
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
sum *= @query.boost() * @query.boost() # boost each sub-weight
|
131
|
-
|
132
|
-
return sum
|
133
|
-
end
|
134
|
-
|
135
|
-
|
136
|
-
def normalize(norm)
|
137
|
-
norm *= @query.boost()
|
138
|
-
@weights.each_with_index do |weight, i|
|
139
|
-
clause = @query.clauses[i]
|
140
|
-
if not clause.prohibited?
|
141
|
-
weight.normalize(norm)
|
142
|
-
end
|
143
|
-
end
|
144
|
-
end
|
145
|
-
|
146
|
-
# returns:: An alternative Scorer that uses and provides skip_to(),
|
147
|
-
# and scores documents in document number order.
|
148
|
-
def scorer(reader)
|
149
|
-
result = BooleanScorer.new(@similarity)
|
150
|
-
|
151
|
-
@weights.each_with_index do |weight, i|
|
152
|
-
clause = @query.clauses[i]
|
153
|
-
sub_scorer = weight.scorer(reader)
|
154
|
-
if (sub_scorer != nil)
|
155
|
-
result.add_scorer(sub_scorer, clause.occur)
|
156
|
-
elsif (clause.required?())
|
157
|
-
return nil
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
return result
|
162
|
-
end
|
163
|
-
|
164
|
-
def explain(reader, doc)
|
165
|
-
|
166
|
-
sum_expl = Explanation.new()
|
167
|
-
sum_expl.description = "sum of:"
|
168
|
-
coord = 0
|
169
|
-
max_coord = 0
|
170
|
-
sum = 0.0
|
171
|
-
|
172
|
-
@weights.each_with_index do |weight, i|
|
173
|
-
clause = @query.clauses[i]
|
174
|
-
explanation = weight.explain(reader, doc)
|
175
|
-
max_coord += 1 if not clause.prohibited?
|
176
|
-
if explanation.value > 0
|
177
|
-
if not clause.prohibited?
|
178
|
-
sum_expl << explanation
|
179
|
-
sum += explanation.value
|
180
|
-
coord += 1
|
181
|
-
else
|
182
|
-
return Explanation.new(0.0, "match prohibited")
|
183
|
-
end
|
184
|
-
elsif clause.required?
|
185
|
-
return Explanation.new(0.0, "match required")
|
186
|
-
end
|
187
|
-
end
|
188
|
-
sum_expl.value = sum
|
189
|
-
|
190
|
-
if (coord == 1) # only one clause matched
|
191
|
-
sum_expl = sum_expl.details[0] # eliminate wrapper
|
192
|
-
end
|
193
|
-
|
194
|
-
coord_factor = @similarity.coord(coord, max_coord)
|
195
|
-
if (coord_factor == 1.0) # coord is no-op
|
196
|
-
return sum_expl # eliminate wrapper
|
197
|
-
else
|
198
|
-
result = Explanation.new()
|
199
|
-
result.description = "product of:"
|
200
|
-
result << sum_expl
|
201
|
-
result << Explanation.new(coord_factor, "coord(#{coord}/#{max_coord})")
|
202
|
-
result.value = sum * coord_factor
|
203
|
-
return result
|
204
|
-
end
|
205
|
-
end
|
206
|
-
end #end BooleanWeight
|
207
|
-
|
208
|
-
def create_weight(searcher)
|
209
|
-
return BooleanWeight.new(self, searcher)
|
210
|
-
end
|
211
|
-
|
212
|
-
def rewrite(reader)
|
213
|
-
if @clauses.size == 1 # optimize 1-clause queries
|
214
|
-
clause = @clauses[0]
|
215
|
-
if not clause.prohibited? # just return clause
|
216
|
-
|
217
|
-
query = clause.query.rewrite(reader) # rewrite first
|
218
|
-
|
219
|
-
if boost() != 1.0 # incorporate boost
|
220
|
-
if query == clause.query # if rewrite was no-op
|
221
|
-
query = query.clone # then clone before boost
|
222
|
-
end
|
223
|
-
query.boost = boost() * query.boost()
|
224
|
-
end
|
225
|
-
|
226
|
-
return query
|
227
|
-
end
|
228
|
-
end
|
229
|
-
|
230
|
-
clone = nil # recursively rewrite
|
231
|
-
@clauses.each_with_index do |clause, i|
|
232
|
-
query = clause.query().rewrite(reader)
|
233
|
-
if query != clause.query() # clause rewrote: must clone
|
234
|
-
clone ||= clone()
|
235
|
-
clone.clauses[i] = BooleanClause.new(query, clause.occur)
|
236
|
-
end
|
237
|
-
end
|
238
|
-
if (clone != nil)
|
239
|
-
return clone # some clauses rewrote
|
240
|
-
else
|
241
|
-
return self # no clauses rewrote
|
242
|
-
end
|
243
|
-
end
|
244
|
-
|
245
|
-
def extract_terms(terms)
|
246
|
-
@clauses.each do |clause|
|
247
|
-
clause.query.extract_terms(terms)
|
248
|
-
end
|
249
|
-
end
|
250
|
-
|
251
|
-
def initialize_copy(o)
|
252
|
-
super
|
253
|
-
@clauses = o.clauses.clone
|
254
|
-
end
|
255
|
-
|
256
|
-
# Prints a user-readable version of this query.
|
257
|
-
def to_s(field = nil)
|
258
|
-
buffer = ""
|
259
|
-
buffer << "(" if boost != 1.0
|
260
|
-
|
261
|
-
@clauses.each_with_index do |clause, i|
|
262
|
-
if clause.prohibited?
|
263
|
-
buffer << "-"
|
264
|
-
elsif clause.required?
|
265
|
-
buffer << "+"
|
266
|
-
end
|
267
|
-
|
268
|
-
sub_query = clause.query
|
269
|
-
if sub_query.instance_of? BooleanQuery # wrap sub-bools in parens
|
270
|
-
buffer << "(#{clause.query.to_s(field)})"
|
271
|
-
else
|
272
|
-
buffer << clause.query.to_s(field)
|
273
|
-
end
|
274
|
-
|
275
|
-
if i != (@clauses.size - 1)
|
276
|
-
buffer << " "
|
277
|
-
end
|
278
|
-
end
|
279
|
-
|
280
|
-
buffer << ")^#{boost}" if boost() != 1.0
|
281
|
-
|
282
|
-
return buffer
|
283
|
-
end
|
284
|
-
|
285
|
-
# Returns true iff +o+ is equal to this.
|
286
|
-
def eql?(other)
|
287
|
-
if not other.instance_of?(BooleanQuery)
|
288
|
-
return false
|
289
|
-
end
|
290
|
-
return (boost() == other.boost() and @clauses == other.clauses)
|
291
|
-
end
|
292
|
-
alias :== :eql?
|
293
|
-
|
294
|
-
# Returns a hash code value for this object.
|
295
|
-
def hash()
|
296
|
-
return boost().hash ^ @clauses.hash
|
297
|
-
end
|
298
|
-
end
|
299
|
-
end
|