ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,294 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # Scorer for boolean combinations of sub-scorers.
  #
  # The actual matching and score summing is delegated to a tree assembled
  # from ConjunctionScorer, DisjunctionSumScorer, ReqOptSumScorer and
  # ReqExclScorer; this class adds the coordination factor on top.
  #
  # Implements skip_to(), and has no limitation on the number of added
  # scorers.
  class BooleanScorer < Scorer
    attr_reader :required_scorers, :coordinator

    # Keeps track of how many non-prohibited scorers matched the current
    # document and maps that count to a precomputed coordination factor.
    class Coordinator
      attr_accessor :max_coord, :nr_matchers

      # similarity:: object whose #coord(overlap, max_overlap) supplies the
      #              coordination factors.
      def initialize(similarity)
        @max_coord = 0 # to be increased for each non prohibited scorer
        @coord_factors = nil
        @similarity = similarity
      end

      # Precomputes the coordination factor for every possible matcher count
      # (0..max_coord). Use after all scorers have been added.
      def init
        @coord_factors = Array.new(@max_coord + 1) do |i|
          @similarity.coord(i, @max_coord)
        end
      end

      # Resets the matcher count; called before scoring each document.
      def init_doc
        @nr_matchers = 0
      end

      # Returns the coordination factor for the current matcher count.
      def coord_factor
        @coord_factors[@nr_matchers]
      end
    end

    # similarity:: passed to the Scorer superclass and to the Coordinator.
    #
    # The scorer to which all scoring is delegated (@counting_sum_scorer) is
    # built lazily, once all sub-scorers have been added.
    def initialize(similarity)
      super(similarity)
      @required_scorers = []
      @optional_scorers = []
      @prohibited_scorers = []
      @counting_sum_scorer = nil
      @coordinator = Coordinator.new(similarity)
    end

    # Registers a sub-scorer under the given occurrence type.
    #
    # scorer:: the sub-scorer to add.
    # occur::  one of BooleanClause::Occur::MUST, SHOULD or MUST_NOT. Any
    #          other value still bumps max_coord but stores nothing.
    def add_scorer(scorer, occur)
      # Prohibited scorers never contribute to the coordination factor.
      @coordinator.max_coord += 1 unless occur == BooleanClause::Occur::MUST_NOT

      # `then` is used instead of the `when X: expr` form, which only parses
      # on Ruby 1.8.
      case occur
      when BooleanClause::Occur::MUST     then @required_scorers << scorer
      when BooleanClause::Occur::SHOULD   then @optional_scorers << scorer
      when BooleanClause::Occur::MUST_NOT then @prohibited_scorers << scorer
      end
    end

    # Initialize the match counting scorer that sums all the scores.
    #
    # When "counting" is used in a name it means counting the number of
    # matching scorers. When "sum" is used in a name it means score value
    # summing over the matching scorers.
    def init_counting_sum_scorer
      @coordinator.init
      @counting_sum_scorer = make_counting_sum_scorer
    end

    # Wraps a scorer so that it is counted as a single match: every call to
    # #score adds one to the parent's coordinator matcher count. All other
    # calls are forwarded to the wrapped scorer unchanged.
    class SingleMatchScorer < Scorer
      def initialize(parent_scorer, scorer)
        super(scorer.similarity)
        @scorer = scorer
        @parent_scorer = parent_scorer
      end

      def score
        @parent_scorer.coordinator.nr_matchers += 1
        @scorer.score
      end

      def doc
        @scorer.doc
      end

      def next?
        @scorer.next?
      end

      def skip_to(doc_num)
        @scorer.skip_to(doc_num)
      end

      def explain(doc_num)
        @scorer.explain(doc_num)
      end
    end

    # DisjunctionSumScorer that adds its own matcher count for the current
    # document to the parent's coordinator each time it is scored.
    class CountingDisjunctionSumScorer < DisjunctionSumScorer
      def initialize(parent_scorer, scorers)
        super(scorers)
        @parent_scorer = parent_scorer
      end

      def score
        @parent_scorer.coordinator.nr_matchers += @nr_matchers
        super
      end
    end

    # Builds a disjunction over +scorers+ in which each scorer from the list
    # is counted as a single matcher.
    def counting_disjunction_sum_scorer(scorers)
      CountingDisjunctionSumScorer.new(self, scorers)
    end

    # ConjunctionScorer that credits the parent's coordinator with one match
    # per required scorer, at most once per document.
    class CountingConjunctionScorer < ConjunctionScorer
      def initialize(parent_scorer, similarity)
        super(similarity)
        @parent_scorer = parent_scorer
        @required_num_matchers = parent_scorer.required_scorers.size
        @last_scored_doc = -1
      end

      def score
        # Only count the required matchers the first time a document is
        # scored, so repeated score calls do not inflate the count.
        if @parent_scorer.doc > @last_scored_doc
          @last_scored_doc = @parent_scorer.doc
          @parent_scorer.coordinator.nr_matchers += @required_num_matchers
        end

        super
      end
    end

    # Builds a conjunction over +required_scorers+ in which each scorer from
    # the list is counted as a single matcher.
    #
    # required_scorers:: the scorers to add to the conjunction. The only
    #                    caller in this class passes @required_scorers, which
    #                    is also what CountingConjunctionScorer sizes its
    #                    matcher count from.
    def counting_conjunction_sum_scorer(required_scorers)
      ccs = CountingConjunctionScorer.new(self, Similarity.default)
      required_scorers.each { |scorer| ccs << scorer }
      ccs
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses required_scorers, optional_scorers and prohibited_scorers; each
    # scorer is counted as a single matcher.
    def make_counting_sum_scorer
      if @required_scorers.empty?
        if @optional_scorers.empty?
          NonMatchingScorer.new # only prohibited scorers
        elsif @optional_scorers.size == 1
          # the only optional scorer is required; no optional scorers left
          make_counting_sum_scorer2(
            SingleMatchScorer.new(self, @optional_scorers[0]),
            [])
        else
          # more than 1 optional scorer and no required scorers: at least one
          # optional scorer is required; no optional scorers left
          make_counting_sum_scorer2(
            counting_disjunction_sum_scorer(@optional_scorers),
            [])
        end
      elsif @required_scorers.size == 1 # 1 required
        make_counting_sum_scorer2(
          SingleMatchScorer.new(self, @required_scorers[0]),
          @optional_scorers)
      else # more required scorers
        make_counting_sum_scorer2(
          counting_conjunction_sum_scorer(@required_scorers),
          @optional_scorers)
      end
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses the arguments and prohibited_scorers.
    #
    # required_counting_sum_scorer:: A required scorer already built.
    # optional_scorers:: A list of optional scorers, possibly empty.
    def make_counting_sum_scorer2(required_counting_sum_scorer, optional_scorers)
      if optional_scorers.empty?
        if @prohibited_scorers.empty?
          required_counting_sum_scorer
        elsif @prohibited_scorers.size == 1
          ReqExclScorer.new(required_counting_sum_scorer,
                            @prohibited_scorers[0])
        else # no optional, more than 1 prohibited
          ReqExclScorer.new(
            required_counting_sum_scorer,
            DisjunctionSumScorer.new(@prohibited_scorers))
        end
      elsif optional_scorers.size == 1
        make_counting_sum_scorer3(
          required_counting_sum_scorer,
          SingleMatchScorer.new(self, optional_scorers[0]))
      else # more optional
        make_counting_sum_scorer3(
          required_counting_sum_scorer,
          counting_disjunction_sum_scorer(optional_scorers))
      end
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses the arguments and prohibited_scorers.
    #
    # required_counting_sum_scorer:: A required scorer already built.
    # optional_counting_sum_scorer:: An optional scorer already built.
    def make_counting_sum_scorer3(required_counting_sum_scorer,
                                  optional_counting_sum_scorer)
      if @prohibited_scorers.empty? # no prohibited
        ReqOptSumScorer.new(required_counting_sum_scorer,
                            optional_counting_sum_scorer)
      elsif @prohibited_scorers.size == 1 # 1 prohibited
        ReqOptSumScorer.new(
          ReqExclScorer.new(required_counting_sum_scorer,
                            @prohibited_scorers[0]),
          optional_counting_sum_scorer)
      else # more prohibited
        ReqOptSumScorer.new(
          ReqExclScorer.new(required_counting_sum_scorer,
                            DisjunctionSumScorer.new(@prohibited_scorers)),
          optional_counting_sum_scorer)
      end
    end

    # Expert: Iterates over all matching documents, yielding the document
    # number and the score.
    #
    # Builds the counting sum scorer on first use. Runs until the scorer is
    # exhausted; the return value is that of the +while+ loop and carries no
    # meaning.
    def each_hit # :yields: doc, score
      init_counting_sum_scorer if @counting_sum_scorer.nil?

      while @counting_sum_scorer.next?
        yield(@counting_sum_scorer.doc, score())
      end
    end

    # Expert: Iterates over matching documents in a range.
    #
    # NOTE: #next? needs to be called first; otherwise @counting_sum_scorer
    # is still nil and this method fails on the first line.
    #
    # max:: Do not score documents at or past this number. Defaults to
    #       MAX_DOCS (not defined in this file; presumably supplied by
    #       Scorer -- confirm).
    # returns:: true if more matching documents may remain, false once the
    #           underlying scorer is exhausted.
    def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
      doc_num = @counting_sum_scorer.doc
      while doc_num < max
        yield(doc_num, score())
        return false unless @counting_sum_scorer.next?

        doc_num = @counting_sum_scorer.doc
      end
      true
    end

    # Returns the document number of the current match.
    def doc
      @counting_sum_scorer.doc
    end

    # Advances to the next matching document, building the counting sum
    # scorer on first use.
    #
    # returns:: whatever the underlying scorer's #next? returns.
    def next?
      init_counting_sum_scorer if @counting_sum_scorer.nil?
      @counting_sum_scorer.next?
    end

    # Returns the summed score of the current document multiplied by the
    # coordination factor.
    def score
      @coordinator.init_doc
      # Scoring must happen before coord_factor is read: the counting
      # scorers update the coordinator's matcher count as a side effect.
      sum = @counting_sum_scorer.score
      sum * @coordinator.coord_factor
    end

    # Skips to the first match beyond the current whose document number is
    # greater than or equal to a given target.
    #
    # When this method is used the #explain(int) method should not be used.
    #
    # target:: The target document number.
    # returns:: true iff there is such a match.
    def skip_to(target)
      init_counting_sum_scorer if @counting_sum_scorer.nil?
      @counting_sum_scorer.skip_to(target)
    end

    # TODO: Implement an explanation of the coordination factor.
    #
    # doc:: The document number for the explanation.
    # raises:: NotImplementedError, always.
    def explain(doc)
      # Delegating to @counting_sum_scorer.explain(doc) would miss the coord
      # factor, so explanations are not supported yet.
      raise NotImplementedError
    end
  end
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
module Ferret::Search
  require 'monitor'

  # Decorates another filter with result caching, so that filters can focus
  # purely on filtering and gain caching by being wrapped in this class.
  # The caching behavior is like QueryFilter.
  class CachingWrapperFilter < Filter
    # filter:: Filter whose results should be cached
    def initialize(filter)
      @filter = filter
      @cache = nil # created on the first call to #bits
    end

    # Returns the bits for +reader+, computing them with the wrapped filter
    # only when no cached entry exists for that reader.
    #
    # reader:: used as the cache key and handed to the wrapped filter.
    def bits(reader)
      # NOTE(review): WeakKeyHash is defined elsewhere; it presumably holds
      # its keys weakly and provides #synchronize -- confirm.
      @cache ||= Ferret::Utils::WeakKeyHash.new

      # cache lookup
      @cache.synchronize do
        cached = @cache[reader]
        return cached if cached
      end

      # cache miss: compute outside the lock, then store
      fresh = @filter.bits(reader)
      @cache.synchronize { @cache[reader] = fresh }
      fresh
    end

    def to_s
      "CachingWrapperFilter(#{@filter})"
    end
  end
end
|
@@ -1,99 +0,0 @@
|
|
1
|
-
require 'set'
|
2
|
-
module Ferret::Search
  # Scorer for conjunctions: a set of sub-scorers that must all match the
  # same document.
  class ConjunctionScorer < Scorer
    def initialize(similarity)
      super
      @scorers = []
      @first_time = true # sub-scorers have not been positioned yet
      @more = true       # false once any sub-scorer is exhausted
    end

    # Appends a sub-scorer. Also available as #<<.
    def add(scorer)
      @scorers << scorer
    end
    alias :<< :add

    # The sub-scorer at the front of the list.
    def first
      @scorers.first
    end

    # The sub-scorer at the back of the list.
    def last
      @scorers.last
    end

    # Document number of the current match, taken from the first sub-scorer.
    def doc
      first.doc
    end

    # Advances to the next document matched by every sub-scorer.
    #
    # returns:: true iff such a document was found.
    def next?
      if @first_time
        init(true)
      elsif @more
        @more = last.next? # trigger further scanning
      end
      do_next
    end

    # Rotates the sub-scorers until the first and last agree on a document
    # or one of them runs out.
    #
    # returns:: true iff a document containing all clauses was found.
    def do_next
      while @more && first.doc < last.doc
        @more = first.skip_to(last.doc)  # bring the first scorer up to the last
        @scorers.push(@scorers.shift)    # rotate: old first becomes new last
      end
      @more
    end

    # Skips every sub-scorer to +target+, then looks for the next common
    # document.
    #
    # target:: the target document number.
    # returns:: true iff a matching document was found.
    def skip_to(target)
      init(false) if @first_time

      @scorers.each do |sub|
        break unless @more
        @more = sub.skip_to(target)
      end

      sort_scorers if @more # restore ordering by document number

      do_next
    end

    # Sums the scores of all of the scorers for the current document and
    # applies the coordination factor computed in #init.
    def score
      total = @scorers.inject(0.0) { |sum, sub| sum + sub.score }
      total * @coord
    end

    # One-time setup performed on the first #next? or #skip_to call.
    #
    # init_scorers:: when true, each sub-scorer is advanced to its first
    #                entry and the list is sorted.
    def init(init_scorers)
      count = @scorers.size

      # every clause is required, so overlap equals the maximum overlap
      @coord = similarity.coord(count, count)

      @more = count > 0

      if init_scorers
        # position each scorer on its first entry, stopping at the first
        # scorer that has none
        @scorers.each do |sub|
          break unless @more
          @more = sub.next?
        end
        sort_scorers if @more
      end

      @first_time = false
    end

    # Orders the sub-scorers in place by their current document number.
    def sort_scorers
      @scorers.sort! { |left, right| left.doc <=> right.doc }
    end

    # Explanations are not supported; always raises NotImplementedError.
    def explain(doc)
      raise NotImplementedError
    end
  end
end
|
@@ -1,205 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # A Scorer for OR like queries, counterpart of Lucene's +ConjunctionScorer+.
  # This Scorer implements Scorer#skip_to(int) and uses skip_to() on the
  # given Scorers.
  class DisjunctionSumScorer < Scorer
    # the sub-scorers
    attr_accessor :sub_scorers

    # Construct a +DisjunctionSumScorer+.
    #
    # sub_scorers:: A collection of at least two subscorers.
    #
    # minimum_nr_matchers:: The positive minimum number of subscorers that
    #                       should match to match this query.
    #
    #                       When +minimum_nr_matchers+ is bigger than the
    #                       number of +sub_scorers+, no matches will be
    #                       produced.
    #
    #                       When +minimum_nr_matchers+ equals the number of
    #                       +sub_scorers+, it is more efficient to use
    #                       +ConjunctionScorer+.
    #
    # raises:: ArgumentError if +minimum_nr_matchers+ is not positive or
    #          fewer than two sub-scorers are given.
    def initialize(sub_scorers, minimum_nr_matchers = 1)
      super(nil)

      # The number of subscorers.
      @nr_scorers = sub_scorers.size

      # The document number and summed score of the current match.
      @current_doc = -1
      @current_score = nil

      # The number of subscorers that provide the current match.
      @nr_matchers = -1

      if minimum_nr_matchers <= 0
        raise ArgumentError, "Minimum nr of matchers must be positive"
      end
      if @nr_scorers <= 1
        raise ArgumentError, "There must be at least 2 sub_scorers"
      end

      @minimum_nr_matchers = minimum_nr_matchers
      @sub_scorers = sub_scorers

      # The @scorer_queue contains all subscorers ordered by their current
      # doc, with the minimum at the top.
      #
      # The @scorer_queue is initialized the first time next? or skip_to() is
      # called.
      #
      # An exhausted scorer is immediately removed from the @scorer_queue.
      #
      # If fewer than @minimum_nr_matchers scorers remain in the
      # @scorer_queue, next? and skip_to() return false.
      #
      # After each call to next? or skip_to(), +@current_score+ is the total
      # score of the current matching doc, +@nr_matchers+ is the number of
      # matching scorers, and all scorers are after the matching doc, or are
      # exhausted.
      @scorer_queue = nil
    end

    # Called the first time next? or skip_to() is called to initialize
    # +@scorer_queue+. Sub-scorers with no first document are left out.
    def init_scorer_queue
      @scorer_queue = ScorerQueue.new(@nr_scorers)
      @sub_scorers.each do |sub_scorer|
        # advance first: doc() will be used for ordering in @scorer_queue
        @scorer_queue.insert(sub_scorer) if sub_scorer.next?
      end
    end

    # A +PriorityQueue+ that orders by Scorer#doc().
    class ScorerQueue < Ferret::Utils::PriorityQueue
      def less_than(scorer1, scorer2)
        scorer1.doc < scorer2.doc
      end
    end

    # Advances to the next matching document.
    #
    # returns:: true iff there is a match.
    def next?
      init_scorer_queue if @scorer_queue.nil?

      return false if @scorer_queue.size < @minimum_nr_matchers

      advance_after_current
    end

    # Advance all subscorers after the current document determined by the
    # top of the +@scorer_queue+.
    # Repeat until at least the minimum number of subscorers match on the
    # same document and all subscorers are after that document or are
    # exhausted.
    #
    # On entry the +@scorer_queue+ has at least +@minimum_nr_matchers+
    # available. At least the scorer with the minimum document number will be
    # advanced.
    #
    # returns:: true iff there is a match.
    #
    # In case there is a match, +@current_doc+, +@current_score+, and
    # +@nr_matchers+ describe the match.
    #
    # TODO Investigate whether it is possible to use skip_to() when the
    # minimum number of matchers is bigger than one, ie. begin and use the
    # character of ConjunctionScorer for the minimum number of matchers.
    def advance_after_current
      loop do # repeat until minimum nr of matchers
        top = @scorer_queue.top
        @current_doc = top.doc
        @current_score = top.score
        @nr_matchers = 1

        loop do # until all subscorers are after @current_doc
          if top.next?
            @scorer_queue.adjust_top
          else
            @scorer_queue.pop
            # Not enough subscorers left for a match on this document,
            # and also no more chance of any further match.
            return false if @scorer_queue.size < (@minimum_nr_matchers - @nr_matchers)

            # nothing more to advance, check for last match.
            break if @scorer_queue.size == 0
          end

          top = @scorer_queue.top
          # All remaining subscorers are after @current_doc.
          break if top.doc != @current_doc

          @current_score += top.score
          @nr_matchers += 1
        end

        if @nr_matchers >= @minimum_nr_matchers
          return true
        elsif @scorer_queue.size < @minimum_nr_matchers
          return false
        end
      end
    end

    # Returns the score of the current document matching the query.
    # Initially invalid, until #next? is called the first time.
    def score
      @current_score
    end

    # Returns the document number of the current document matching the query.
    # Initially invalid, until #next? is called the first time.
    def doc
      @current_doc
    end

    # Returns the number of subscorers matching the current document.
    # Initially invalid, until #next? is called the first time.
    def number_of_matchers
      @nr_matchers
    end

    # Skips to the first match beyond the current whose document number is
    # greater than or equal to a given target.
    #
    # When this method is used the #explain(int) method should not be used.
    #
    # The implementation uses the skip_to() method on the subscorers.
    #
    # target:: The target document number.
    # returns:: true iff there is such a match.
    def skip_to(target)
      init_scorer_queue if @scorer_queue.nil?

      return false if @scorer_queue.size < @minimum_nr_matchers

      # never move backwards: the result must lie beyond the current doc
      target = @current_doc + 1 if target <= @current_doc

      loop do
        top = @scorer_queue.top
        if top.doc >= target
          return advance_after_current
        elsif top.skip_to(target)
          @scorer_queue.adjust_top
        else
          @scorer_queue.pop
          return false if @scorer_queue.size < @minimum_nr_matchers
        end
      end
    end

    # Gives an explanation for the score of a given document.
    #
    # doc:: The document number for the explanation.
    # returns:: an Explanation whose details hold each sub-scorer's
    #           explanation for +doc+.
    #
    # TODO Show the resulting score. See BooleanScorer.explain() on how to
    # do this.
    def explain(doc)
      e = Explanation.new
      # Interpolate: String#+ with the Integer count would raise TypeError.
      e.description = "At least #{@minimum_nr_matchers} of"
      @sub_scorers.each do |sub_scorer|
        e.details << sub_scorer.explain(doc)
      end
      e
    end
  end
end
|