ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
module Ferret::Search
  # Matches documents containing any of the terms produced by a
  # FilteredTermEnum enumeration.
  #
  # +MultiTermQuery+ is abstract and is not meant to be used on its own:
  # it has no FilteredTermEnum of its own, so a concrete subclass must
  # supply one via #get_term_enum. For example, WildcardQuery and
  # FuzzyQuery extend +MultiTermQuery+ to provide WildcardTermEnum and
  # FuzzyTermEnum, respectively.
  class MultiTermQuery < Query
    attr_reader :term

    # Constructs a query whose rewrite expands terms matching +term+.
    def initialize(term)
      super()
      @term = term
    end

    # Subclasses return the term enumeration used to expand the pattern
    # term against +reader+.
    def get_term_enum(reader)
      raise NotImplementedError
    end

    # Expands this query into a BooleanQuery of SHOULD TermQuery clauses,
    # one per term produced by the enumerator, each boosted by this
    # query's boost scaled by the enumerator's difference measure.
    def rewrite(reader)
      enumerator = get_term_enum(reader)
      expanded = BooleanQuery.new(true)
      begin
        loop do
          matched = enumerator.term()
          unless matched.nil?
            clause = TermQuery.new(matched)                 # found a match
            clause.boost = boost() * enumerator.difference() # set the boost
            expanded.add_query(clause, BooleanClause::Occur::SHOULD)
          end
          break unless enumerator.next?
        end
      ensure
        enumerator.close() # always release the enumeration
      end
      expanded
    end

    # Prints a user-readable version of this query.
    def to_s(field = nil)
      str = ""
      str << "#{@term.field}:" unless @term.field == field
      str << @term.text
      str << "^#{boost()}" unless boost() == 1.0
      str
    end

    # Two MultiTermQuery objects are equal when their terms are equal.
    def eql?(o)
      o.instance_of?(MultiTermQuery) && term == o.term
    end
    alias :== :eql?

    def hash()
      term.hash()
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Ferret::Search
  # A scorer that matches no document at all.
  class NonMatchingScorer < Scorer
    def initialize()
      super(nil) # no similarity is ever consulted
    end

    # There is never a next match.
    def next?
      false
    end

    # Skipping can never reach a match either.
    def skip_to(target)
      false
    end

    # Explains (for any +doc+) why nothing matches.
    def explain(doc)
      explanation = Explanation.new()
      explanation.description = "No document matches."
      explanation
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Ferret::Search
  # Tracks the positions of a single phrase term during scoring. Each
  # instance wraps one term-positions enumerator and acts as a node in a
  # singly linked list maintained by the phrase scorers.
  class PhrasePositions
    attr_reader :doc, :position
    attr_accessor :next

    # tp_enum:: term-positions enumerator for this term
    # offset:: the term's relative position within the phrase
    def initialize(tp_enum, offset)
      @tp_enum = tp_enum
      @offset = offset
      @count = @position = @doc = -1
      @next = nil
    end

    # Advances to the next document containing the term.
    def next?()
      advance { @tp_enum.next? }
    end

    # Skips to the first document >= +target+ containing the term.
    def skip_to(target)
      advance { @tp_enum.skip_to(target) }
    end

    # Reads the position count for the current document and loads the
    # first position.
    def first_position()
      @count = @tp_enum.freq # number of positions in this doc
      next_position()
    end

    # Loads the next position (adjusted by the phrase offset); returns
    # false once all positions in the current document are consumed.
    def next_position()
      @count -= 1
      return false if @count < 0
      @position = @tp_enum.next_position() - @offset
      true
    end

    def to_s
      "pp->(doc => #{@doc}, position => #{position})"
    end

    private

    # Shared advance logic for #next? and #skip_to: on exhaustion the
    # stream is closed and @doc parked on a sentinel so this node sorts
    # after every real document.
    def advance
      unless yield
        @tp_enum.close()          # close stream
        @doc = Scorer::MAX_DOCS   # sentinel value
        return false
      end
      @doc = @tp_enum.doc
      @position = 0
      true
    end
  end
end
|
@@ -0,0 +1,217 @@
|
|
1
|
+
module Ferret::Search
  # A Query that matches documents containing a particular sequence of
  # terms. A PhraseQuery is built by QueryParser for input like
  # +"new york"+.
  #
  # This query may be combined with other terms or queries with a
  # BooleanQuery.
  class PhraseQuery < Query
    def initialize()
      super
      @slop = 0
      @terms = []
      @positions = []
      @field = nil
    end

    # The number of other words permitted between words in the query
    # phrase. Zero means an exact phrase search; larger values behave
    # like a +WITHIN+ or +NEAR+ operator.
    #
    # The slop is an edit-distance measured in moves of query terms out
    # of position: swapping two adjacent words costs two moves, so a slop
    # of at least two is needed to permit re-orderings. More exact
    # matches score higher than sloppier ones. Defaults to zero.
    attr_accessor :slop
    attr_reader :terms, :positions, :field

    # Appends +term+ to the query phrase.
    #
    # Unless +position+ is given explicitly, the term is placed
    # +pos_inc+ positions after the previously added term. Explicit
    # positions allow several terms at one position or gaps (e.g. for
    # stopwords).
    #
    # term:: the term to search for
    # position:: the relative position of the term within the query
    def add(term, position = nil, pos_inc = 1)
      if position.nil?
        position = @positions.empty? ? 0 : @positions.last + pos_inc
      end

      if @terms.empty?
        @field = term.field
      elsif term.field != @field
        raise ArgumentError, "All phrase terms must be in the same field: #{term}"
      end

      @terms << term
      @positions << position
    end

    # Shovel-style alias for #add; returns self for chaining.
    def <<(term)
      add(term)
      self
    end

    # Weight implementation for phrase queries: combines the phrase idf
    # with the query boost and produces an exact or sloppy phrase scorer.
    class PhraseWeight < Weight
      attr_reader :query, :value

      def initialize(query, searcher)
        @query = query
        @similarity = query.similarity(searcher)
        @idf = @similarity.idf_phrase(@query.terms, searcher)
      end

      def to_s
        "phrase_weight(#{@value})"
      end

      def sum_of_squared_weights()
        @query_weight = @idf * @query.boost() # compute query weight
        @query_weight * @query_weight         # square it
      end

      def normalize(query_norm)
        @query_norm = query_norm
        @query_weight *= query_norm  # normalize query weight
        @value = @query_weight * @idf # idf for document
      end

      # Builds the scorer for +reader+, or nil when any phrase term is
      # absent (no document can match) or the phrase is empty.
      def scorer(reader)
        phrase_terms = @query.terms
        return nil if phrase_terms.empty? # optimize zero-term case

        tps = []
        phrase_terms.each do |term|
          tp = reader.term_positions_for(term)
          return nil if tp.nil?
          tps << tp
        end

        if @query.slop == 0 # optimize exact case
          ExactPhraseScorer.new(self, tps, @query.positions, @similarity,
                                reader.get_norms(@query.field))
        else
          SloppyPhraseScorer.new(self, tps, @query.positions, @similarity,
                                 @query.slop, reader.get_norms(@query.field))
        end
      end

      # Builds an Explanation tree for the score of +doc+.
      def explain(reader, doc)
        result = Explanation.new()
        result.description = "weight(#{@query} in #{doc}), product of:"

        doc_freqs = @query.terms.map do |term|
          "#{term.text}=#{reader.doc_freq(term)}"
        end.join(", ")

        idf_expl = Explanation.new(@idf, "idf(#{@query.field}:<#{doc_freqs}>)")

        # explain the query-weight factor
        query_expl = Explanation.new()
        query_expl.description = "query_weight(#{@query}), product of:"

        boost = @query.boost()
        query_expl << Explanation.new(boost, "boost") if boost != 1.0
        query_expl << idf_expl

        query_norm_expl = Explanation.new(@query_norm, "query_norm")
        query_expl << query_norm_expl
        query_expl.value = boost * @idf * query_norm_expl.value
        result << query_expl

        # explain the field-weight factor
        field_expl = Explanation.new()
        field_expl.description =
          "field_weight(#{query} in #{doc}), product of:"

        tf_expl = scorer(reader).explain(doc)
        field_expl << tf_expl
        field_expl << idf_expl

        field_norm_expl = Explanation.new()
        field_norms = reader.get_norms(@query.field)
        field_norm =
          field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
        field_norm_expl.value = field_norm
        field_norm_expl.description =
          "field_norm(field=#{@query.field}, doc=#{doc})"
        field_expl << field_norm_expl

        field_expl.value = tf_expl.value * @idf * field_norm
        result << field_expl

        # combine the two factors
        result.value = query_expl.value * field_expl.value
        query_expl.value == 1.0 ? field_expl : result
      end
    end

    # Builds the Weight; a one-term phrase is rewritten as a TermQuery.
    def create_weight(searcher)
      return PhraseWeight.new(self, searcher) unless @terms.size == 1

      single = TermQuery.new(@terms.first) # optimize one-term case
      single.boost = boost()
      single.create_weight(searcher)
    end

    # See Query#extract_terms()
    def extract_terms(query_terms)
      query_terms.add_all(@terms)
    end

    # Prints a user-readable version of this query; position gaps are
    # rendered as "<>" markers.
    def to_s(f = nil)
      buffer = ""
      buffer << "#{@field}:" if @field != f
      buffer << '"'
      last_pos = -1
      @terms.each_with_index do |term, i|
        pos = @positions[i]
        last_pos.upto(pos - 2) { buffer << "<> " }
        last_pos = pos
        buffer << "#{term.text} "
      end
      buffer.rstrip!
      buffer << '"'
      buffer << "~#{slop}" if slop != 0
      buffer << "^#{boost()}" if boost() != 1.0
      buffer
    end

    # Returns true iff +o+ is equal to this.
    def eql?(o)
      return false unless o.instance_of? PhraseQuery
      boost() == o.boost() and @slop == o.slop and
        @terms == o.terms and @positions == o.positions
    end
    alias :== :eql?

    # Returns a hash code value for this object.
    def hash()
      boost().hash ^ slop.hash ^ @terms.hash ^ @positions.hash
    end
  end
end
|
@@ -0,0 +1,153 @@
|
|
1
|
+
module Ferret::Search
  # Base scorer for phrase queries. Keeps one PhrasePositions node per
  # phrase term in a linked list, re-ordered through a PhraseQueue so the
  # node on the smallest document/position is always at the head, and
  # reports documents where every term co-occurs.
  class PhraseScorer < Scorer
    attr_reader :first, :last
    protected :first, :last

    def initialize(weight, tps, positions, similarity, norms)
      super(similarity)
      @norms = norms
      @weight = weight
      @value = weight.value
      @first_time = true
      @more = true

      # link the term-position enumerators into a list
      tps.each_with_index do |tp, i|
        node = PhrasePositions.new(tp, positions[i])
        if @last
          @last.next = node # append to the tail
        else
          @first = node     # start the list
        end
        @last = node
      end

      @pq = PhraseQueue.new(tps.length) # construct empty pq
    end

    # Document id of the current candidate match.
    def doc()
      @first.doc
    end

    def next?
      if @first_time
        init()
        @first_time = false
      elsif @more
        @more = @last.next? # trigger further scanning
      end
      do_next()
    end

    # Advance to the next phrase match without the initial increment.
    def do_next()
      while @more
        # align every term on the same document
        while @more and @first.doc < @last.doc
          @more = @first.skip_to(@last.doc) # skip first upto last
          first_to_last()                   # and move it to the end
        end

        if @more
          # all terms are present in this doc; check for the phrase
          @freq = phrase_freq()
          if @freq == 0.0         # no phrase here
            @more = @last.next?   # trigger further scanning
          else
            return true           # found a match
          end
        end
      end
      false # no more matches
    end

    # Yields each PhrasePositions node in list order.
    def each()
      node = @first
      while node
        yield node
        node = node.next
      end
    end

    def score()
      raw = similarity().tf(@freq) * @value                # raw score
      raw * Similarity.decode_norm(@norms[@first.doc])     # normalize
    end

    def skip_to(target)
      each() { |pp| break if not @more = pp.skip_to(target) }
      sort() if @more # re-sort
      do_next()
    end

    # Subclasses compute the phrase frequency within the current doc.
    def phrase_freq()
      raise NotImplementedError
    end

    # Prime every enumerator; matching only proceeds if all succeed.
    def init()
      each do |pp|
        break if not @more = pp.next?
      end
      sort() if @more
    end

    # Re-order the linked list by pushing all nodes through the queue.
    def sort()
      @pq.clear()
      each() { |pp| @pq.push(pp) }
      pq_to_list()
    end

    # Drain the queue back into the linked list, smallest node first.
    def pq_to_list()
      @last = @first = nil
      until @pq.top().nil?
        node = @pq.pop()
        if @last
          @last.next = node # append to the tail
        else
          @first = node
        end
        @last = node
        node.next = nil
      end
    end

    # Rotate the head of the list to the tail.
    def first_to_last()
      @last.next = @first # move first to end of list
      @last = @first
      @first = @first.next
      @last.next = nil
    end

    # Explains the tf contribution for +doc+ by scanning forward to it.
    def explain(doc)
      tf_explanation = Explanation.new()

      while next? and doc() < doc
      end

      phrase_freq = (doc() == doc) ? @freq : 0.0
      tf_explanation.value = @similarity.tf(phrase_freq)
      tf_explanation.description = "tf(phrase_freq=#{phrase_freq})"

      tf_explanation
    end

    def to_s
      "phrase_scorer(#{@weight})"
    end
  end

  # Orders PhrasePositions nodes by document, breaking ties by position.
  class PhraseQueue < Ferret::Utils::PriorityQueue
    def less_than(pp1, pp2)
      if pp1.doc == pp2.doc
        pp1.position < pp2.position
      else
        pp1.doc < pp2.doc
      end
    end
  end
end
|