ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
data/lib/ferret/search/req_excl_scorer.rb
@@ -0,0 +1,125 @@
+module Ferret::Search
+  # A Scorer for queries with a required subscorer and an excluding (prohibited)
+  # subscorer.
+  #
+  # This +Scorer+ implements Scorer#skip_to(int), and it uses the skip_to() on
+  # the given scorers.
+  class ReqExclScorer < Scorer
+    # Construct a +ReqExclScorer+.
+    # req_scorer::  The scorer that must match, except where
+    # excl_scorer:: indicates exclusion.
+    def initialize(req_scorer, excl_scorer)
+      super(nil) # No similarity used.
+      @req_scorer = req_scorer
+      @excl_scorer = excl_scorer
+
+      @first_time = true
+    end
+
+
+    def next?
+      if @first_time
+        if not @excl_scorer.next?
+          @excl_scorer = nil # exhausted at start
+        end
+        @first_time = false
+      end
+      if @req_scorer == nil
+        return false
+      end
+      if not @req_scorer.next?
+        @req_scorer = nil # exhausted, nothing left
+        return false
+      end
+      if @excl_scorer == nil
+        return true # @req_scorer.next? already returned true
+      end
+      return to_non_excluded()
+    end
+
+    # Advance to a non-excluded doc.
+    # On entry:
+    #
+    # * @req_scorer != nil
+    # * @excl_scorer != nil
+    # * @req_scorer was advanced once via next? or skip_to() and
+    #   @req_scorer.doc() may still be excluded.
+    #
+    # Advances @req_scorer to a non-excluded required doc, if any.
+    #
+    # returns:: true iff there is a non-excluded required doc.
+    def to_non_excluded()
+      excl_doc = @excl_scorer.doc
+      begin
+        req_doc = @req_scorer.doc # may be excluded
+        if (req_doc < excl_doc)
+          return true # @req_scorer advanced to before @excl_scorer, ie. not excluded
+        elsif (req_doc > excl_doc)
+          unless @excl_scorer.skip_to(req_doc)
+            @excl_scorer = nil # exhausted, no more exclusions
+            return true
+          end
+          excl_doc = @excl_scorer.doc
+          if excl_doc > req_doc
+            return true # not excluded
+          end
+        end
+      end while @req_scorer.next?
+      @req_scorer = nil # exhausted, nothing left
+      return false
+    end
+
+    # @req_scorer may be nil when next? or skip_to() has already returned false,
+    # so only call this when you know that a doc exists.
+    def doc()
+      return @req_scorer.doc
+    end
+
+    # Returns the score of the current document matching the query.
+    #
+    # Initially invalid, until #next? is called the first time.
+    #
+    # returns:: The score of the required scorer.
+    def score()
+      return @req_scorer.score()
+    end
+
+    # Skips to the first match beyond the current one whose document number is
+    # greater than or equal to a given target.
+    #
+    # When this method is used the #explain(int) method should not be used.
+    #
+    # target::  The target document number.
+    # returns:: true iff there is such a match.
+    def skip_to(target)
+      if (@first_time)
+        @first_time = false
+        if (! @excl_scorer.skip_to(target))
+          @excl_scorer = nil # exhausted
+        end
+      end
+      if (@req_scorer == nil)
+        return false
+      end
+      if (@excl_scorer == nil)
+        return @req_scorer.skip_to(target)
+      end
+      if (! @req_scorer.skip_to(target))
+        @req_scorer = nil
+        return false
+      end
+      return to_non_excluded()
+    end
+
+    def explain(doc)
+      e = Explanation.new()
+      if @excl_scorer.skip_to(doc) and @excl_scorer.doc == doc
+        e.description = "excluded"
+      else
+        e.description = "not excluded"
+        e.details << @req_scorer.explain(doc)
+      end
+      return e
+    end
+  end
+end
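The walk that to_non_excluded performs above can be pictured with two sorted arrays of document numbers standing in for the required and excluded scorers' match streams. This is a plain-Ruby sketch of that logic, not Ferret's API; non_excluded_docs is a hypothetical helper:

  # Keep every required doc that the excluded stream does not land on exactly.
  def non_excluded_docs(required, excluded)
    e = 0
    required.reject do |req_doc|
      # advance the excluded stream until it reaches or passes req_doc
      e += 1 while e < excluded.length && excluded[e] < req_doc
      e < excluded.length && excluded[e] == req_doc
    end
  end

  p non_excluded_docs([1, 3, 5, 8, 9], [2, 5, 9])   # => [1, 3, 8]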
data/lib/ferret/search/req_opt_sum_scorer.rb
@@ -0,0 +1,70 @@
+module Ferret::Search
+  # A Scorer for queries with a required part and an optional part.
+  # Delays skip_to() on the optional part until a score() is needed.
+  #
+  # This +Scorer+ implements Scorer#skip_to(int).
+  class ReqOptSumScorer < Scorer
+    # The scorers passed from the constructor.
+    # These are set to nil as soon as their next? or skip_to() returns false.
+    #
+    # Construct a +ReqOptScorer+.
+    # req_scorer:: The required scorer. This must match.
+    # opt_scorer:: The optional scorer. This is used for scoring only.
+    def initialize(req_scorer, opt_scorer)
+      super(nil) # No similarity used.
+      @req_scorer = req_scorer
+      @opt_scorer = opt_scorer
+
+      @first_time_opt_scorer = true
+    end
+
+
+    def next?
+      return @req_scorer.next?
+    end
+
+    def skip_to(target)
+      return @req_scorer.skip_to(target)
+    end
+
+    def doc()
+      return @req_scorer.doc()
+    end
+
+    # Returns the score of the current document matching the query.
+    # Initially invalid, until #next? is called the first time.
+    #
+    # returns:: The score of the required scorer, eventually increased by the
+    #           score of the optional scorer when it also matches the current
+    #           document.
+    def score()
+      cur_doc = @req_scorer.doc
+      req_score = @req_scorer.score
+      if @first_time_opt_scorer
+        @first_time_opt_scorer = false
+        if not @opt_scorer.skip_to(cur_doc)
+          @opt_scorer = nil
+          return req_score
+        end
+      elsif @opt_scorer.nil?
+        return req_score
+      elsif @opt_scorer.doc < cur_doc and not @opt_scorer.skip_to(cur_doc)
+        @opt_scorer = nil
+        return req_score
+      end
+      # assert (@opt_scorer != nil) and (@opt_scorer.doc() >= cur_doc)
+      return (@opt_scorer.doc == cur_doc) ? req_score + @opt_scorer.score() : req_score
+    end
+
+    # Explain the score of a document.
+    # @todo Also show the total score.
+    # See BooleanScorer.explain() on how to do this.
+    def explain(doc)
+      e = Explanation.new()
+      e.description = "required, optional"
+      e.details << @req_scorer.explain(doc)
+      e.details << @opt_scorer.explain(doc)
+      return e
+    end
+  end
+end
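As the code above shows, the required scorer alone decides which documents match; the optional scorer only adds to the score when it lands on the same document. A rough stand-alone illustration using hashes of doc => score in place of the two scorers (req_opt_score is a hypothetical helper, not part of Ferret):

  def req_opt_score(doc, req_scores, opt_scores)
    # the required part must contain doc; the optional part adds only on a shared doc
    req_scores.fetch(doc) + (opt_scores[doc] || 0.0)
  end

  req = { 4 => 1.5, 7 => 0.75 }
  opt = { 7 => 0.5 }
  p req_opt_score(4, req, opt)   # => 1.5
  p req_opt_score(7, req, opt)   # => 1.25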
data/lib/ferret/search/score_doc.rb
@@ -0,0 +1,38 @@
+module Ferret::Search
+  # Expert: Returned by low-level search implementations.
+  # See TopDocs
+  class ScoreDoc
+    include Comparable
+    # Expert: The score of this document for the query.
+    attr_accessor :score
+
+    # Expert: A hit document's number.
+    attr_accessor :doc
+
+    # Expert: Constructs a ScoreDoc.
+    def initialize(doc, score)
+      @doc = doc
+      @score = score
+    end
+
+    # returns a hash value for storage in a Hash
+    def hash()
+      return 100 * doc * score
+    end
+
+    # score_docA < score_docB if score_docA.score < score_docB.score or
+    # score_docA.doc > score_docB.doc
+    def <=>(other)
+      result = @score.<=>(other.score)
+      if (result == 0)
+        return other.doc.<=>(@doc)
+      else
+        return result
+      end
+    end
+
+    def to_s
+      "#{@doc} -> %0.2f" % @score
+    end
+  end
+end
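Sorted in descending order, this comparison puts the highest-scoring hit first and breaks score ties in favour of the lower document number. A self-contained illustration of the same ordering (Hit is a stand-in Struct, not a Ferret class):

  Hit = Struct.new(:doc, :score) do
    include Comparable
    def <=>(other)
      result = score <=> other.score
      result.zero? ? (other.doc <=> doc) : result
    end
  end

  hits = [Hit.new(3, 0.5), Hit.new(1, 0.9), Hit.new(2, 0.9)]
  p hits.sort.reverse.map(&:doc)   # => [1, 2, 3]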
data/lib/ferret/search/score_doc_comparator.rb
@@ -0,0 +1,114 @@
+module Ferret::Search
+  # Expert: Compares two ScoreDoc objects for sorting.
+  class ScoreDocComparator
+
+    # Special comparator for sorting hits according to computed relevance (score).
+    RELEVANCE = ScoreDocComparator.new()
+    class <<RELEVANCE
+      def compare(i, j)
+        return -(i.score <=> j.score)
+      end
+      def sort_value(i)
+        return i.score
+      end
+      def sort_type()
+        return SortField::SortType::SCORE
+      end
+    end
+
+
+    # Special comparator for sorting hits according to index order (number).
+    INDEX_ORDER = ScoreDocComparator.new()
+    class <<INDEX_ORDER
+      def compare(i, j)
+        return i.doc <=> j.doc
+      end
+      def sort_value(i)
+        return i.doc
+      end
+      def sort_type()
+        return SortField::SortType::DOC
+      end
+    end
+
+
+    # Compares two ScoreDoc objects and returns a result indicating their
+    # sort order.
+    # i:: First ScoreDoc
+    # j:: Second ScoreDoc
+    # returns:: +-1+ if +i+ should come before +j+
+    #           +1+ if +i+ should come after +j+
+    #           +0+ if they are equal
+    def compare(i, j)
+      return NotImplementedError
+    end
+
+
+    # Returns the value used to sort the given document. The object returned
+    # must implement the java.io.Serializable interface. This is used by
+    # multisearchers to determine how to collate results from their searchers.
+    #
+    # See FieldDoc
+    # i:: Document
+    # returns:: Serializable object
+    def sort_value(i)
+      return NotImplementedError
+    end
+
+
+    # Returns the type of sort. Should return +SortField.SCORE+,
+    # +SortField.DOC+, +SortField.STRING+, +SortField.INTEGER+,
+    # +SortField.FLOAT+ or +SortField.CUSTOM+. It is not valid to return
+    # +SortField.AUTO+.
+    # This is used by multisearchers to determine how to collate results from
+    # their searchers. returns:: One of the constants in SortField.
+    # See SortField
+    def sort_type()
+      return NotImplementedError
+    end
+  end
+
+  class SimpleFieldComparator < ScoreDocComparator
+    def initialize(index, sort_type)
+      @index = index
+      @sort_type = sort_type
+    end
+
+    def compare(j, i)
+      return @index[i.doc] <=> @index[j.doc]
+    end
+    def sort_value(i)
+      return @index[i.doc]
+    end
+    def sort_type()
+      return @sort_type
+    end
+  end
+
+  class SpecialFieldComparator < SimpleFieldComparator
+    def initialize(index, sort_type, comparator)
+      super(index, sort_type)
+      @comparator = comparator
+    end
+    def compare(j, i)
+      return @comparator.call(@index[i.doc], @index[j.doc])
+    end
+  end
+
+  class StringFieldComparator < ScoreDocComparator
+    def initialize(index)
+      @str_index = index.str_index
+      @str_map = index.str_map
+    end
+
+    def compare(i, j)
+      return @str_index[i.doc] <=> @str_index[j.doc]
+    end
+    def sort_value(i)
+      return @str_map[@str_index[i.doc]]
+    end
+    def sort_type()
+      return SortField::SortType::STRING
+    end
+  end
+end
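Each concrete comparator above boils down to looking up a per-document value and comparing those values. With a plain array standing in for @index (document number => field value), the effect is the same as sorting document numbers by their field value (illustrative only, not Ferret's API):

  field_values = ["cherry", "apple", "banana"]    # value for doc 0, 1, 2
  docs = [0, 1, 2]
  p docs.sort_by { |doc| field_values[doc] }      # => [1, 2, 0]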
data/lib/ferret/search/scorer.rb
@@ -0,0 +1,91 @@
+module Ferret::Search
+  # Expert: Common scoring functionality for different types of queries.
+  #
+  # A +Scorer+ either iterates over documents matching a query, or provides an
+  # explanation of the score for a query for a given document.
+  #
+  # Document scores are computed using a given +Similarity+ implementation.
+  class Scorer
+    attr_reader :similarity
+    MAX_DOCS = 0x7FFFFFFF
+
+    # Constructs a Scorer.
+    # similarity:: The +Similarity+ implementation used by this scorer.
+    def initialize(similarity)
+      @similarity = similarity
+    end
+
+    # Expert: Iterates over all matching documents, yielding the document
+    # number and the score.
+    #
+    # returns:: true if more matching documents may remain.
+    def each_hit() # :yields: doc, score
+      while next?
+        yield(doc(), score())
+      end
+    end
+
+    # Expert: Iterates over matching documents in a range.
+    #
+    # max:: Do not score documents past this. The default searches all
+    #       documents available.
+    # returns:: true if more matching documents may remain.
+    def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
+      while (next? and doc() < max)
+        yield(doc(), score())
+      end
+      return doc() < max
+    end
+
+    # Advances to the next document matching the query.
+    # returns:: true iff there is another document matching the query.
+    # When this method is used the #explain(int) method should not be used.
+    def next?()
+      raise NotImplementedError
+    end
+
+    # Returns the current document number matching the query.
+    # Initially invalid, until #next?() is called the first time.
+    def doc()
+      raise NotImplementedError
+    end
+
+    # Returns the score for the current document matching the query.
+    # Initially invalid, until #next?() is called the first time.
+    def score()
+      raise NotImplementedError
+    end
+
+    # Skips to the first match beyond the current one whose document number is
+    # greater than or equal to a given target.
+    #
+    # When this method is used the #explain(int) method should not be used.
+    #
+    # target::  The target document number.
+    # returns:: true iff there is such a match.
+    #
+    # Behaves as if written:
+    #
+    #   def skip_to(target)
+    #     begin
+    #       return false if not next?()
+    #     end while (target > doc())
+    #     return true
+    #   end
+    #
+    # Most implementations are considerably more efficient than that.
+    def skip_to(target)
+      raise NotImplementedError
+    end
+
+    # Returns an explanation of the score for a document.
+    #
+    # When this method is used, the #next?(), #skip_to(int) and
+    # #score(HitCollector) methods should not be used.
+    #
+    # doc:: The document number for the explanation.
+    def explain(doc)
+      raise NotImplementedError
+    end
+  end
+end
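Concrete scorers only have to supply next?, doc and score; the each_hit template method above then drives the iteration. A minimal stand-in showing that contract (ArrayScorer is hypothetical and not part of Ferret):

  class ArrayScorer
    # hits: array of [doc, score] pairs, sorted by doc
    def initialize(hits)
      @hits = hits
      @pos  = -1
    end

    def next?()  (@pos += 1) < @hits.length  end
    def doc()    @hits[@pos][0]              end
    def score()  @hits[@pos][1]              end

    def each_hit
      yield(doc, score) while next?
    end
  end

  ArrayScorer.new([[2, 0.4], [5, 1.1]]).each_hit { |d, s| puts "#{d} -> #{s}" }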
data/lib/ferret/search/similarity.rb
@@ -0,0 +1,278 @@
+module Ferret::Search
+  # Expert: Scoring API.
+  # Subclasses implement search scoring.
+  #
+  # The score of query *q* for document *d* is defined
+  # in terms of these methods as follows:
+  #
+  # <table cellpadding="0" cellspacing="0" border="0">
+  #  <tr>
+  #    <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
+  #    <td valign="middle" align="center">
+  #      <big><big><big><big><big>&Sigma</big></big></big></big></big></td>
+  #    <td valign="middle"><small>
+  #      #tf(int) tf(t in d)#
+  #      #idf_term(Term,Searcher) idf(t)#
+  #      Field#getBoost getBoost(t.field in d)#
+  #      #length_norm(String,int) length_norm(t.field in d)
+  #    </small></td>
+  #    <td valign="middle" rowspan="2"> *
+  #      #coord(int,int) coord(q,d)#
+  #      #query_norm(float) query_norm(q)
+  #    </td>
+  #  </tr>
+  #  <tr>
+  #    <td valign="top" align="right">
+  #      <small>t in q</small>
+  #    </td>
+  #  </tr>
+  # </table>
+  #
+  # See #set_default
+  # See IndexWriter#set_similarity
+  # See Searcher#set_similarity
+  class Similarity
+
+    def Similarity.byte_to_float(b)
+      if (b == 0)
+        return 0.0
+      end
+      mantissa = b & 0x07        # 0x07 = 7 = 0b00000111
+      exponent = (b >> 3) & 0x1F # 0x1f = 31 = 0b00011111
+      return [0, 0, (mantissa << 5), (exponent + 48)].pack("cccc").unpack("f")[0]
+    end
+
+    def Similarity.float_to_byte(f)
+      if (f <= 0.0) then return 0 end
+
+      bits = [f].pack("f").unpack("cccc")
+      mantissa = (bits[2] & 0xEf) >> 5
+      exponent = (bits[3] - 48)
+
+      if (exponent > 0x1f)
+        exponent = 0x1f # 0x1f = 31 = 0b00011111
+        mantissa = 0x07 # 0x07 = 7 = 0b00000111
+      end
+
+      if (exponent < 0)
+        exponent = 0
+        mantissa = 1
+      end
+
+      return ((exponent << 3) | mantissa)
+    end
+
+    # Cache of decoded bytes
+    NORM_TABLE = Array.new(256) { |i| Similarity.byte_to_float(i) }
+
+    # Decodes a normalization factor stored in an index.
+    # See Similarity#encode_norm(float)
+    def Similarity.decode_norm(b)
+      return NORM_TABLE[b & 0xFF]
+    end
+
+    # Decodes a normalization factor stored in an index.
+    # See Similarity#encode_norm(float)
+    def decode_norm(b)
+      return self.class.decode_norm(b)
+    end
+
+    # Computes the normalization value for a field given the total number of
+    # terms contained in a field. These values, together with field boosts, are
+    # stored in an index and multiplied into scores for hits on each field by
+    # the search code.
+    #
+    # Matches in longer fields are less precise, so implementations of this
+    # method usually return smaller values when *num_tokens* is large,
+    # and larger values when *num_tokens* is small.
+    #
+    # Note that these values are computed under
+    # IndexWriter#add_document and then stored using
+    # #encode_norm(float). Thus they have limited precision, and documents
+    # must be re-indexed if this method is altered.
+    #
+    # field::      the name of the field
+    # num_tokens:: the total number of tokens contained in fields named
+    #              _field_ of _doc_.
+    #
+    # See Field#set_boost
+    def length_norm
+      raise NotImplementedError
+    end
+
+    # Computes the normalization value for a query given the sum of the squared
+    # weights of each of the query terms. This value is then multiplied into the
+    # weight of each query term.
+    #
+    # This does not affect ranking, but rather just attempts to make scores
+    # from different queries comparable.
+    #
+    # sum_of_squared_weights:: the sum of the squares of query term weights
+    # Return:: a normalization factor for query weights
+    def query_norm
+      raise NotImplementedError
+    end
+
+    # Encodes a normalization factor for storage in an index.
+    #
+    # The encoding uses a five-bit exponent and three-bit mantissa, thus
+    # representing values from around 7x10^9 to 2x10^-9 with about one
+    # significant decimal digit of accuracy. Zero is also represented.
+    # Negative numbers are rounded up to zero. Values too large to represent
+    # are rounded down to the largest representable value. Positive values too
+    # small to represent are rounded up to the smallest positive representable
+    # value.
+    #
+    # See Field#boost=
+    def Similarity.encode_norm(f)
+      return Similarity.float_to_byte(f)
+    end
+
+    def encode_norm(f)
+      return self.class.float_to_byte(f)
+    end
+
+    # Computes a score factor based on a term or phrase's frequency in a
+    # document. This value is multiplied by the #idf_term(Term, Searcher)
+    # factor for each term in the query and these products are then summed to
+    # form the initial score for a document.
+    #
+    # Terms and phrases repeated in a document indicate the topic of the
+    # document, so implementations of this method usually return larger values
+    # when _freq_ is large, and smaller values when _freq_
+    # is small.
+    #
+    # The default implementation calls #tf(float)
+    #
+    # freq:: the frequency of a term within a document
+    # Return:: a score factor based on a term's within-document frequency
+    def tf
+      raise NotImplementedError
+    end
+
+    # Computes the amount of a sloppy phrase match, based on an edit distance.
+    # This value is summed for each sloppy phrase match in a document to form
+    # the frequency that is passed to #tf(float).
+    #
+    # A phrase match with a small edit distance to a document passage more
+    # closely matches the document, so implementations of this method usually
+    # return larger values when the edit distance is small and smaller values
+    # when it is large.
+    #
+    # See PhraseQuery#slop(int)
+    # distance:: the edit distance of this sloppy phrase match
+    # Return:: the frequency increment for this match
+    def sloppy_freq
+      raise NotImplementedError
+    end
+
+    # Computes a score factor for a simple term.
+    #
+    # The default implementation is:
+    #   return idf(searcher.doc_freq(term), searcher.max_doc())
+    #
+    # Note that Searcher#max_doc() is used instead of
+    # IndexReader#num_docs() because it is proportional to
+    # Searcher#doc_freq(Term), i.e., when one is inaccurate,
+    # so is the other, and in the same direction.
+    #
+    # term::     the term in question
+    # searcher:: the document collection being searched
+    # Return::   a score factor for the term
+    def idf_term(term, searcher)
+      return idf(searcher.doc_freq(term), searcher.max_doc())
+    end
+
+    # Computes a score factor for a phrase.
+    #
+    # The default implementation sums the #idf(Term,Searcher) factor
+    # for each term in the phrase.
+    #
+    # terms::    the terms in the phrase
+    # searcher:: the document collection being searched
+    # Return::   a score factor for the phrase
+    def idf_phrase(terms, searcher)
+      idf = 0.0
+      terms.each { |term| idf += idf_term(term, searcher) }
+      return idf
+    end
+
+    # Computes a score factor based on a term's document frequency (the number
+    # of documents which contain the term). This value is multiplied by the
+    # #tf(int) factor for each term in the query and these products are
+    # then summed to form the initial score for a document.
+    #
+    # Terms that occur in fewer documents are better indicators of topic, so
+    # implementations of this method usually return larger values for rare
+    # terms, and smaller values for common terms.
+    #
+    # doc_freq:: the number of documents which contain the term
+    # num_docs:: the total number of documents in the collection
+    # Return::   a score factor based on the term's document frequency
+    def idf
+      raise NotImplementedError
+    end
+
+    # Computes a score factor based on the fraction of all query terms that a
+    # document contains. This value is multiplied into scores.
+    #
+    # The presence of a large portion of the query terms indicates a better
+    # match with the query, so implementations of this method usually return
+    # larger values when the ratio between these parameters is large and smaller
+    # values when the ratio between them is small.
+    #
+    # overlap::     the number of query terms matched in the document
+    # max_overlap:: the total number of terms in the query
+    # Return::      a score factor based on term overlap with the query
+    def coord
+      raise NotImplementedError
+    end
+  end
+
+  # Expert: Default scoring implementation.
+  class DefaultSimilarity < Similarity
+    # See source
+    def length_norm(field, num_terms)
+      return 1.0 / Math.sqrt(num_terms)
+    end
+
+    # See source
+    def query_norm(sum_of_squared_weights)
+      return 1.0 / Math.sqrt(sum_of_squared_weights)
+    end
+
+    # See source
+    def tf(freq)
+      return Math.sqrt(freq)
+    end
+
+    # See source
+    def sloppy_freq(distance)
+      return 1.0 / (distance + 1)
+    end
+
+    # See source
+    def idf(doc_freq, num_docs)
+      return 0.0 if num_docs == 0
+      return Math.log(num_docs.to_f / (doc_freq + 1)) + 1.0
+    end
+
+    # See source
+    def coord(overlap, max_overlap)
+      return overlap.to_f / max_overlap
+    end
+  end
+
+  class Similarity
+    # The Similarity implementation used by default.
+    @@default = DefaultSimilarity.new()
+
+    def Similarity.default
+      return @@default
+    end
+
+    def Similarity.default=(default)
+      @@default = default
+    end
+  end
+end
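Pulling the DefaultSimilarity pieces together: a single term's contribution to score(q, d) is roughly tf * idf * length_norm, before boosts and the query-wide coord and query_norm factors. A worked example with made-up numbers, computed directly rather than through Ferret's classes:

  num_docs  = 1000.0   # documents in the collection
  doc_freq  = 9        # documents containing the term
  freq      = 4        # occurrences of the term in this document
  num_terms = 25       # tokens in the field

  tf   = Math.sqrt(freq)                             # => 2.0
  idf  = Math.log(num_docs / (doc_freq + 1)) + 1.0   # => ~5.61
  norm = 1.0 / Math.sqrt(num_terms)                  # length_norm => 0.2

  puts tf * idf * norm                               # one term's contribution, ~2.24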