ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/search/phrase_scorer.rb
DELETED
@@ -1,152 +0,0 @@
-module Ferret::Search
-  class PhraseScorer < Scorer
-    attr_reader :first, :last
-    protected :first, :last
-
-    def initialize(weight, tps, positions, similarity, norms)
-      super(similarity)
-      @norms = norms
-      @weight = weight
-      @value = weight.value
-      @first_time = true
-      @more = true
-
-      # convert tps to a list
-      tps.length.times do |i|
-        pp = PhrasePositions.new(tps[i], positions[i])
-        if (@last != nil) # add next to end of list
-          @last.next = pp
-        else
-          @first = pp
-        end
-        @last = pp
-      end
-
-      @pq = PhraseQueue.new(tps.length) # construct empty pq
-    end
-
-    def doc()
-      return @first.doc
-    end
-
-    def next?
-      if (@first_time)
-        init()
-        @first_time = false
-      elsif (@more)
-        @more = @last.next? # trigger further scanning
-      end
-      return do_next()
-    end
-
-    # next without initial increment
-    def do_next()
-      while (@more)
-        while (@more and @first.doc < @last.doc) # find doc w/ all the terms
-          @more = @first.skip_to(@last.doc) # skip first upto last
-          first_to_last() # and move it to the end
-        end
-
-        if (@more)
-          # found a doc with all of the terms
-          @freq = phrase_freq() # check for phrase
-          if (@freq == 0.0) # no match
-            @more = @last.next? # trigger further scanning
-          else
-            return true # found a match
-          end
-        end
-      end
-      return false # no more matches
-    end
-
-    def each()
-      pp = @first
-      while (pp != nil)
-        yield pp
-        pp = pp.next
-      end
-    end
-
-    def score()
-      raw = similarity().tf(@freq) * @value # raw score
-      return raw * Similarity.decode_norm(@norms[@first.doc]) # normalize
-    end
-
-    def skip_to(target)
-      each() { |pp| break if not @more = pp.skip_to(target) }
-      sort() if @more # re-sort
-      return do_next()
-    end
-
-    def phrase_freq()
-      raise NotImplementedError
-    end
-
-    def init()
-      each do |pp|
-        break if not @more = pp.next?
-      end
-      if @more
-        sort()
-      end
-    end
-
-    def sort()
-      @pq.clear()
-      each() do |pp|
-        @pq.push(pp)
-      end
-      pq_to_list()
-    end
-
-    def pq_to_list()
-      @last = @first = nil
-      while (@pq.top() != nil)
-        pp = @pq.pop()
-        if (@last != nil) # add next to end of list
-          @last.next = pp
-        else
-          @first = pp
-        end
-        @last = pp
-        pp.next = nil
-      end
-    end
-
-    def first_to_last()
-      @last.next = @first # move first to end of list
-      @last = @first
-      @first = @first.next
-      @last.next = nil
-    end
-
-    def explain(doc)
-      tf_explanation = Explanation.new()
-
-      while (next? and doc() < doc)
-      end
-
-      phrase_freq = (doc() == doc) ? @freq : 0.0
-      tf_explanation.value = @similarity.tf(phrase_freq)
-      tf_explanation.description = "tf(phrase_freq=#{phrase_freq})"
-
-      return tf_explanation
-    end
-
-    def to_s() return "phrase_scorer(#{@weight})" end
-
-  end
-
-
-  class PhraseQueue < Ferret::Utils::PriorityQueue
-    def less_than(pp1, pp2)
-      if (pp1.doc == pp2.doc)
-        return pp1.position < pp2.position
-      else
-        return pp1.doc < pp2.doc
-      end
-    end
-  end
-
-end
data/lib/ferret/search/prefix_query.rb
DELETED
@@ -1,54 +0,0 @@
-module Ferret::Search
-  # A Query that matches documents containing terms with a specified prefix. A
-  # PrefixQuery is built by QueryParser for input like +app*+.
-  class PrefixQuery < Query
-    attr_reader :prefix
-    # Constructs a query for terms starting with +prefix+.
-    def initialize(prefix)
-      super()
-      @prefix = prefix
-    end
-
-    def rewrite(reader)
-      bq = BooleanQuery.new(true)
-      enumerator = reader.terms_from(@prefix)
-      begin
-        prefix_text = @prefix.text
-        prefix_length = prefix_text.length
-        prefix_field = @prefix.field
-        begin
-          term = enumerator.term
-          if (term.nil? or
-              term.field != prefix_field or
-              term.text[0,prefix_length] != prefix_text)
-            break
-          end
-          tq = TermQuery.new(term) # found a match
-          tq.boost = boost() # set the boost
-          bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
-          #puts("added " + term)
-        end while (enumerator.next?)
-      ensure
-        enumerator.close()
-      end
-      return bq
-    end
-
-    # Prints a user-readable version of this query.
-    def to_s(f)
-      buffer = ""
-      buffer << "#{@prefix.field}:" if @prefix.field != f
-      buffer << "#{@prefix.text}*"
-      buffer << "^#{boost()}" if boost() != 1.0
-      return buffer
-    end
-
-    def eql?(o)
-      (@prefix == o.prefix and boost() == o.boost)
-    end
-
-    def hash()
-      boost().hash ^ @prefix.hash
-    end
-  end
-end
data/lib/ferret/search/query.rb
DELETED
@@ -1,140 +0,0 @@
-module Ferret::Search
-  # The abstract base class for queries.
-  # Instantiable subclasses are:
-  # * TermQuery
-  # * MultiTermQuery
-  # * BooleanQuery
-  # * WildcardQuery
-  # * PhraseQuery
-  # * PrefixQuery
-  # * MultiPhraseQuery
-  # * FuzzyQuery
-  # * RangeQuery
-  # * Span::SpanQuery
-  #
-  # A parser for queries is contained in:
-  # * Ferret::QueryParser::QueryParser
-  #
-  class Query
-    # documents matching this query clause will (in addition to the normal
-    # weightings) have their score multiplied by the boost factor. It is
-    # 1.0 be default.
-    attr_accessor :boost
-
-    def initialize()
-      @boost = 1.0
-    end
-
-    # Prints a query to a string, with +field+ as the default field for
-    # terms. The representation used is one that is supposed to be readable
-    # by Ferret::QueryParser::QueryParser. However, there are the following
-    # limitations:
-    # * If the query was created by the parser, the printed representation
-    #   may not be exactly what was parsed. For example, characters that need
-    #   to be escaped will be represented without the required backslash.
-    # * Some of the more complicated queries (e.g. span queries)
-    #   don't have a representation that can be parsed by QueryParser.
-    def to_s(field=nil)
-      raise NotImplementedError
-    end
-
-    # Expert: Constructs an appropriate Weight implementation for this query.
-    #
-    # Only implemented by primitive queries, which re-write to themselves.
-    def create_weight(searcher)
-      raise NotImplementedError
-    end
-
-    # Expert: Constructs and initializes a Weight for a top-level query.
-    def weight(searcher)
-      query = searcher.rewrite(self)
-      weight = query.create_weight(searcher)
-      sum = weight.sum_of_squared_weights()
-      norm = similarity(searcher).query_norm(sum)
-      weight.normalize(norm)
-      return weight
-    end
-
-    # Expert: called to re-write queries into primitive queries.
-    def rewrite(reader)
-      return self
-    end
-
-    # Expert: called when re-writing queries under MultiSearcher.
-    #
-    # Create a single query suitable for use by all subsearchers (in 1-1
-    # correspondence with queries). This is an optimization of the OR of
-    # all queries. We handle the common optimization cases of equal
-    # queries and overlapping clauses of boolean OR queries (as generated
-    # by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
-    # Be careful overriding this method as queries[0] determines which
-    # method will be called and is not necessarily of the same type as
-    # the other queries.
-    def combine(queries)
-      uniques = Set.new
-      queries.each do |query|
-        clauses = []
-        # check if we can split the query into clauses
-        splittable = query.respond_to? :clauses
-        if splittable
-          splittable = query.coord_disabled?
-          clauses = query.clauses
-          clauses.each do |clause|
-            splittable = clause.occur == BooleanClause::Occur::SHOULD
-            break unless splittable
-          end
-        end
-        if splittable
-          clauses.each { |clause| uniques << clause.query }
-        else
-          uniques << query
-        end
-      end
-      # optimization: if we have just one query, just return it
-      if uniques.size == 1
-        uniques.each { |query| return query }
-      end
-
-      result = BooleanQuery.new(true)
-      uniques.each do |query|
-        result.add_query(query, BooleanClause::Occur::SHOULD)
-      end
-      return result
-    end
-
-    # Expert: adds all terms occuring in this query to the terms set
-    def extract_terms(terms)
-      raise NotImplementedError
-    end
-
-
-    # Expert: merges the clauses of a set of BooleanQuery's into a single
-    # BooleanQuery.
-    #
-    # A utility for use by #combine() implementations.
-    def merge_boolean_queries(queries)
-      all_clauses = Set.new
-      queries.each do |query|
-        query.clauses.each do |clause|
-          all_clauses << clause
-        end
-      end
-
-      coord_disabled = queries.size==0 ? false : queries[0].coord_disabled?
-      result = BooleanQuery.new(coord_disabled)
-      all_clauses.each do |clause|
-        result << clause
-      end
-      return result
-    end
-
-    # Expert: Returns the Similarity implementation to be used for this
-    # query. Subclasses may override this method to specify their own
-    # Similarity implementation, perhaps one that delegates through that of
-    # the Searcher. By default the Searcher's Similarity implementation is
-    # returned.
-    def similarity(searcher)
-      return searcher.similarity
-    end
-  end
-end
data/lib/ferret/search/query_filter.rb
DELETED
@@ -1,51 +0,0 @@
-module Ferret::Search
-  require 'monitor'
-  # Constrains search results to only match those which also match a provided
-  # query. Results are cached, so that searches after the first on the same
-  # index using this filter are much faster.
-  #
-  # This could be used, for example, with a RangeQuery on a suitably formatted
-  # date field to implement date filtering. One could re-use a single
-  # QueryFilter that matches, e.g., only documents modified within the last
-  # week. The QueryFilter and RangeQuery would only need to be reconstructed
-  # once per day.
-  class QueryFilter < Filter
-
-    # Constructs a filter which only matches documents matching
-    # +query+.
-    def initialize(query)
-      @query = query
-      @cache = nil
-    end
-
-    def bits(reader)
-
-      if (@cache == nil)
-        @cache = Ferret::Utils::WeakKeyHash.new
-      end
-
-      @cache.synchronize() do # check cache
-        bits = @cache[reader]
-        if bits
-          return bits
-        end
-      end
-
-      bits = Ferret::Utils::BitVector.new()
-
-      IndexSearcher.new(reader).search_each(@query) do |doc, score|
-        bits.set(doc) # set bit for hit
-      end
-
-      @cache.synchronize() do # update cache
-        @cache[reader] = bits
-      end
-
-      return bits
-    end
-
-    def to_s()
-      return "QueryFilter(#{@query})"
-    end
-  end
-end
data/lib/ferret/search/range_filter.rb
DELETED
@@ -1,103 +0,0 @@
-module Ferret::Search
-  # A Filter that restricts search results to a range of values in a given
-  # field.
-  #
-  # This code borrows heavily from RangeQuery, but is implemented as a Filter.
-  class RangeFilter < Filter
-    include Ferret::Index
-
-    # field_name:: The field this range applies to
-    # lower_term:: The lower bound on this range
-    # upper_term:: The upper bound on this range
-    # include_lower:: Does this range include the lower bound?
-    # include_upper:: Does this range include the upper bound?
-    def initialize(field_name, lower_term, upper_term, include_lower, include_upper)
-      @field_name = field_name
-      @lower_term = lower_term
-      @upper_term = upper_term
-      @include_lower = include_lower
-      @include_upper = include_upper
-
-      if (lower_term.nil? and upper_term.nil?)
-        raise ArgumentError, "At least one value must be non-nil"
-      end
-      if (include_lower and lower_term.nil?)
-        raise ArgumentError, "The lower bound must be non-nil to be inclusive"
-      end
-      if (include_upper and upper_term.nil?)
-        raise ArgumentError, "The upper bound must be non-nil to be inclusive"
-      end
-      if (upper_term and lower_term and upper_term < lower_term)
-        raise ArgumentError, "The lower bound must less than the upper bound"
-      end
-    end
-
-    # Constructs a filter for field +field_name+ matching less than or equal to
-    # +upper_term+.
-    def RangeFilter.new_less(field_name, upper_term, include_upper = true)
-      return RangeFilter.new(field_name, nil, upper_term, false, include_upper)
-    end
-
-    # Constructs a filter for field +field_name+ matching greater than or equal
-    # to +lower_term+.
-    def RangeFilter.new_more(field_name, lower_term, include_lower = true)
-      return RangeFilter.new(field_name, lower_term, nil, include_lower, false)
-    end
-
-    # Returns a BitVector with true for documents which should be permitted in
-    # search results, and false for those that should not.
-    def bits(reader)
-      bits = Ferret::Utils::BitVector.new()
-      term_enum = reader.terms_from(Term.new(@field_name, @lower_term||""))
-
-      begin
-        if (term_enum.term() == nil)
-          return bits
-        end
-        check_lower = !@include_lower # make adjustments to set to exclusive
-
-        term_docs = reader.term_docs
-        begin
-          begin
-            term = term_enum.term()
-            break if (term.nil? or term.field != @field_name)
-
-            if (!check_lower or @lower_term.nil? or term.text > @lower_term)
-              check_lower = false
-              if @upper_term
-                compare = @upper_term <=> term.text
-                # if beyond the upper term, or is exclusive and
-                # this is equal to the upper term, break out
-                if ((compare < 0) or (!@include_upper and compare == 0))
-                  break
-                end
-              end
-              # we have a good term, find the docs
-
-              term_docs.seek(term_enum)
-              while term_docs.next?
-                bits.set(term_docs.doc)
-              end
-            end
-          end while term_enum.next?
-        ensure
-          term_docs.close()
-        end
-      ensure
-        term_enum.close()
-      end
-
-      return bits
-    end
-
-    def to_s()
-      buffer = "#{@field_name}:"
-      buffer << "[" if @include_lower
-      buffer << @lower_term if @lower_term
-      buffer << "-"
-      buffer << @upper_term if @upper_term
-      buffer << @include_upper ? "]" : "end"
-      return buffer
-    end
-  end
-end