ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,65 @@
1
module Ferret::Search
  # A Query that matches documents containing a subset of terms provided
  # by a FilteredTermEnum enumeration.
  #
  # +MultiTermQuery+ is not designed to be used by itself, because it is
  # not initialized with a FilteredTermEnum enumeration — a subclass must
  # provide one. For example, WildcardQuery and FuzzyQuery extend
  # +MultiTermQuery+ to provide WildcardTermEnum and FuzzyTermEnum,
  # respectively.
  class MultiTermQuery < Query
    attr_reader :term

    # Constructs a query for terms matching +term+.
    def initialize(term)
      super()
      @term = term
    end

    # Construct the enumeration to be used, expanding the pattern term.
    # Subclasses must override this to supply their FilteredTermEnum.
    def get_term_enum(reader)
      raise NotImplementedError
    end

    # Expands this query into a BooleanQuery containing one SHOULD
    # TermQuery clause for every term produced by the enumeration.
    # The enumeration is always closed, even if an error is raised.
    def rewrite(reader)
      enumerator = get_term_enum(reader)
      rewritten = BooleanQuery.new(true)
      begin
        # do-while: the enumerator is positioned on its first term already
        loop do
          current = enumerator.term()
          unless current.nil?
            clause_query = TermQuery.new(current)        # found a match
            clause_query.boost = boost() * enumerator.difference()
            rewritten.add_query(clause_query, BooleanClause::Occur::SHOULD)
          end
          break unless enumerator.next?
        end
      ensure
        enumerator.close()
      end
      rewritten
    end

    # Prints a user-readable version of this query. The field prefix is
    # omitted when it matches the +field+ argument.
    def to_s(field = nil)
      buffer = ""
      buffer << "#{@term.field}:" unless @term.field == field
      buffer << @term.text
      buffer << "^#{boost()}" unless boost() == 1.0
      buffer
    end

    # Two MultiTermQuery instances are equal when their terms are equal.
    def eql?(o)
      o.instance_of?(MultiTermQuery) && term == o.term
    end
    alias :== :eql?

    # Hash is delegated to the term, consistent with #eql?.
    def hash()
      term.hash()
    end
  end
end
@@ -0,0 +1,22 @@
1
module Ferret::Search
  # A scorer that matches no document at all.
  class NonMatchingScorer < Scorer
    def initialize
      super(nil) # no similarity is needed since nothing ever matches
    end

    # There is never a next matching document.
    def next?
      false
    end

    # Skipping can never land on a match.
    def skip_to(target)
      false
    end

    # Returns an Explanation stating that no document matches.
    def explain(doc)
      explanation = Explanation.new()
      explanation.description = "No document matches."
      explanation
    end
  end
end
@@ -0,0 +1,55 @@
1
+ module Ferret::Search
2
+ class PhrasePositions
3
+ attr_reader :doc, :position
4
+ attr_accessor :next
5
+
6
+ def initialize(tp_enum, offset)
7
+ @tp_enum = tp_enum
8
+ @offset = offset
9
+ @count = @position = @doc = -1
10
+ @next = nil
11
+ end
12
+
13
+ def next?()
14
+ if not @tp_enum.next?
15
+ @tp_enum.close() # close stream
16
+ @doc = Scorer::MAX_DOCS # sentinel value
17
+ return false
18
+ end
19
+ @doc = @tp_enum.doc
20
+ @position = 0
21
+ return true
22
+ end
23
+
24
+ def skip_to(target)
25
+ if not @tp_enum.skip_to(target)
26
+ @tp_enum.close() # close stream
27
+ @doc = Scorer::MAX_DOCS # sentinel value
28
+ return false
29
+ end
30
+ @doc = @tp_enum.doc
31
+ @position = 0
32
+ return true
33
+ end
34
+
35
+
36
+ def first_position()
37
+ @count = @tp_enum.freq # read first pos
38
+ next_position()
39
+ end
40
+
41
+ def next_position()
42
+ @count -= 1
43
+ if @count >= 0 # read subsequent pos's
44
+ @position = @tp_enum.next_position() - @offset
45
+ return true
46
+ else
47
+ return false
48
+ end
49
+ end
50
+
51
+ def to_s
52
+ "pp->(doc => #{@doc}, position => #{position})"
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,217 @@
1
module Ferret::Search
  # A Query that matches documents containing a particular sequence of terms.
  # A PhraseQuery is built by QueryParser for input like +"new york"+.
  #
  # This query may be combined with other terms or queries with a BooleanQuery.
  class PhraseQuery < Query
    def initialize()
      super
      @slop = 0        # edit-distance allowance; 0 means exact phrase
      @terms = []      # phrase terms, in order of addition
      @positions = []  # relative position of each term, parallel to @terms
      @field = nil     # set by the first added term; all terms must share it
    end

    # Sets the number of other words permitted between words in query phrase.
    # If zero, then this is an exact phrase search. For larger values this
    # works like a +WITHIN+ or +NEAR+ operator.
    #
    # The slop is in fact an edit-distance, where the units correspond to
    # moves of terms in the query phrase out of position. For example, to
    # switch the order of two words requires two moves (the first move places
    # the words atop one another), so to permit re-orderings of phrases, the
    # slop must be at least two.
    #
    # More exact matches are scored higher than sloppier matches, thus search
    # results are sorted by exactness.
    #
    # The slop is zero by default, requiring exact matches.
    attr_accessor :slop
    attr_reader :terms, :positions, :field

    # Adds a term to the end of the query phrase.
    #
    # The relative position of the term is the one immediately after the last
    # term added, unless explicitly specified. By specifying explicitly,
    # you can have phrases with more than one term at the same position or
    # phrases with gaps (e.g. in connection with stopwords).
    #
    # term:: the term to search for
    # position:: the relative position of the term to the rest of the terms
    #            int the query.
    # pos_inc:: increment applied to the previous position when +position+
    #           is not given explicitly
    def add(term, position = nil, pos_inc = 1)
      if position.nil?
        position = (@positions.size > 0) ? (@positions[-1] + pos_inc) : 0
      end

      # the first term fixes the field; later terms must agree with it
      if @terms.size == 0
        @field = term.field
      elsif (term.field != @field)
        raise ArgumentError, "All phrase terms must be in the same field: #{term}"
      end

      @terms << term
      @positions << position
    end

    # Appends +term+ at the next sequential position; returns self so calls
    # can be chained.
    def <<(term)
      add(term)
      return self
    end

    # Weight implementation for PhraseQuery: computes the idf-based query
    # weight and builds the appropriate phrase scorer for a reader.
    class PhraseWeight < Weight
      attr_reader :query, :value

      def initialize(query, searcher)
        @query = query
        @similarity = query.similarity(searcher)
        @idf = @similarity.idf_phrase(@query.terms, searcher)
      end

      def to_s() return "phrase_weight(#{@value})" end

      # Contribution of this weight to the query normalization factor.
      def sum_of_squared_weights()
        @query_weight = @idf * @query.boost() # compute query weight
        return @query_weight * @query_weight # square it
      end

      # Applies the searcher-wide normalization factor to this weight.
      def normalize(query_norm)
        @query_norm = query_norm
        @query_weight *= query_norm # normalize query weight
        @value = @query_weight * @idf # idf for document
      end

      # Builds an ExactPhraseScorer (slop == 0) or SloppyPhraseScorer over
      # the term-positions of each phrase term. Returns nil when the query
      # has no terms or any term is absent from the index.
      def scorer(reader)
        return nil if @query.terms.size == 0 # optimize zero-term case

        tps = []
        @query.terms.each do |term|
          tp = reader.term_positions_for(term)
          return nil if tp.nil?
          tps << tp
        end

        if (@query.slop == 0) # optimize exact case
          return ExactPhraseScorer.new(self, tps, @query.positions,
                                       @similarity,
                                       reader.get_norms(@query.field))
        else
          return SloppyPhraseScorer.new(self, tps, @query.positions,
                                        @similarity,
                                        @query.slop,
                                        reader.get_norms(@query.field))
        end
      end

      # Produces a human-readable Explanation of the score for +doc+ as the
      # product of the query weight (boost * idf * query_norm) and the field
      # weight (tf * idf * field_norm).
      def explain(reader, doc)
        result = Explanation.new()
        result.description = "weight(#{@query} in #{doc}), product of:"

        doc_freqs = @query.terms.map do |term|
          "#{term.text}=#{reader.doc_freq(term)}"
        end.join(", ")

        idf_expl = Explanation.new(@idf, "idf(#{@query.field}:<#{doc_freqs}>)")

        # explain query weight
        query_expl = Explanation.new()
        query_expl.description = "query_weight(#{@query}), product of:"

        boost = @query.boost()
        if boost != 1.0 # only show boost when it differs from the default
          boost_expl = Explanation.new(boost, "boost")
          query_expl << boost_expl
        end
        query_expl << idf_expl

        query_norm_expl = Explanation.new(@query_norm, "query_norm")
        query_expl << query_norm_expl

        query_expl.value = boost * @idf * query_norm_expl.value

        result << query_expl

        # explain field weight
        field_expl = Explanation.new()
        field_expl.description =
          "field_weight(#{query} in #{doc}), product of:"

        tf_expl = scorer(reader).explain(doc)
        field_expl << tf_expl
        field_expl << idf_expl

        field_norm_expl = Explanation.new()
        field_norms = reader.get_norms(@query.field)
        field_norm =
          field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
        field_norm_expl.value = field_norm
        field_norm_expl.description =
          "field_norm(field=#{@query.field}, doc=#{doc})"
        field_expl << field_norm_expl

        field_expl.value = tf_expl.value * @idf * field_norm

        result << field_expl

        # combine them
        result.value = query_expl.value * field_expl.value

        # when the query weight contributes nothing, the field explanation
        # alone is the whole story
        if query_expl.value == 1.0
          return field_expl
        else
          return result
        end
      end
    end

    # Returns the Weight for this query. A single-term phrase is rewritten
    # as a plain TermQuery (with this query's boost) for efficiency.
    def create_weight(searcher)
      if @terms.size == 1 # optimize one-term case
        term = @terms[0]
        tq = TermQuery.new(term)
        tq.boost = boost()
        return tq.create_weight(searcher)
      end
      return PhraseWeight.new(self, searcher)
    end

    # See Query#extract_terms()
    def extract_terms(query_terms)
      query_terms.add_all(@terms)
    end

    # Prints a user-readable version of this query. Position gaps are shown
    # as +<>+ placeholders; slop and boost are appended when non-default.
    def to_s(f=nil)
      buffer = ""
      buffer << "#{@field}:" if @field != f
      buffer << '"'
      last_pos = -1
      @terms.each_index do |i|
        term = @terms[i]
        pos = @positions[i]
        last_pos.upto(pos-2) {buffer << "<> "} # one "<>" per skipped position
        last_pos = pos
        buffer << "#{term.text} "
      end
      buffer.rstrip!
      buffer << '"'
      buffer << "~#{slop}" if (slop != 0)
      buffer << "^#{boost()}" if boost() != 1.0
      return buffer
    end

    # Returns true iff +o+ is equal to this.
    def eql?(o)
      if not o.instance_of? PhraseQuery
        return false
      end
      return (boost() == o.boost() and @slop == o.slop and
              @terms == o.terms and @positions == o.positions)
    end
    alias :== :eql?

    # Returns a hash code value for this object.
    def hash()
      return boost().hash ^ slop.hash ^ @terms.hash ^ @positions.hash
    end
  end
end
@@ -0,0 +1,153 @@
1
module Ferret::Search
  # Abstract base scorer for phrase queries. Keeps one PhrasePositions per
  # phrase term in a singly linked list (@first..@last), kept ordered via a
  # PhraseQueue so that @first is the least-advanced term. Subclasses
  # (ExactPhraseScorer, SloppyPhraseScorer) implement #phrase_freq.
  class PhraseScorer < Scorer
    attr_reader :first, :last
    protected :first, :last

    # weight:: the PhraseWeight that created this scorer
    # tps:: term-positions enumerators, one per phrase term
    # positions:: each term's offset within the phrase (parallel to tps)
    # similarity:: scoring similarity implementation
    # norms:: encoded field norms for the phrase's field
    def initialize(weight, tps, positions, similarity, norms)
      super(similarity)
      @norms = norms
      @weight = weight
      @value = weight.value
      @first_time = true # init() is deferred until the first next?
      @more = true       # false once any term's enum is exhausted

      # convert tps to a list
      tps.length.times do |i|
        pp = PhrasePositions.new(tps[i], positions[i])
        if (@last != nil) # add next to end of list
          @last.next = pp
        else
          @first = pp
        end
        @last = pp
      end

      @pq = PhraseQueue.new(tps.length) # construct empty pq
    end

    # Current document — that of the least-advanced term.
    def doc()
      return @first.doc
    end

    # Advances to the next document containing the phrase.
    def next?
      if (@first_time)
        init()
        @first_time = false
      elsif (@more)
        @more = @last.next? # trigger further scanning
      end
      return do_next()
    end

    # next without initial increment
    def do_next()
      while (@more)
        while (@more and @first.doc < @last.doc) # find doc w/ all the terms
          @more = @first.skip_to(@last.doc) # skip first upto last
          first_to_last() # and move it to the end
        end

        if (@more)
          # found a doc with all of the terms
          @freq = phrase_freq() # check for phrase
          if (@freq == 0.0) # no match
            @more = @last.next? # trigger further scanning
          else
            return true # found a match
          end
        end
      end
      return false # no more matches
    end

    # Yields each PhrasePositions element in list order.
    def each()
      pp = @first
      while (pp != nil)
        yield pp
        pp = pp.next
      end
    end

    # Score for the current document: tf(freq) * weight value, normalized
    # by the document's decoded field norm.
    def score()
      #puts("scoring #{@first.doc}")
      raw = similarity().tf(@freq) * @value # raw score
      return raw * Similarity.decode_norm(@norms[@first.doc]) # normalize
    end

    # Skips all terms to the first document >= target, then resumes the
    # normal matching loop.
    def skip_to(target)
      each() { |pp| break if not @more = pp.skip_to(target) }
      sort() if @more # re-sort
      return do_next()
    end

    # Frequency of the phrase in the current document. Subclasses must
    # override (exact vs. sloppy matching).
    def phrase_freq()
      raise NotImplementedError
    end

    # Positions every term enum on its first document and sorts the list.
    def init()
      each do |pp|
        break if not @more = pp.next?
      end
      if @more
        sort()
      end
    end

    # Rebuilds the linked list in (doc, position) order via the queue.
    def sort()
      @pq.clear()
      each() do |pp|
        @pq.push(pp)
      end
      pq_to_list()
    end

    # Drains the priority queue back into the @first..@last linked list.
    def pq_to_list()
      @last = @first = nil
      while (@pq.top() != nil)
        pp = @pq.pop()
        if (@last != nil) # add next to end of list
          @last.next = pp
        else
          @first = pp
        end
        @last = pp
        pp.next = nil
      end
    end

    # Rotates the list: the least-advanced element just caught up, so it
    # becomes the most-advanced and moves to the tail.
    def first_to_last()
      @last.next = @first # move first to end of list
      @last = @first
      @first = @first.next
      @last.next = nil
    end

    # Explains the tf component of the score for +doc+ by scanning forward
    # to it. NOTE(review): this consumes the scorer's position — it should
    # only be called on a fresh scorer.
    def explain(doc)
      tf_explanation = Explanation.new()

      while (next? and doc() < doc)
      end

      phrase_freq = (doc() == doc) ? @freq : 0.0
      tf_explanation.value = @similarity.tf(phrase_freq)
      tf_explanation.description = "tf(phrase_freq=#{phrase_freq})"

      return tf_explanation
    end

    def to_s() return "phrase_scorer(#{@weight})" end

  end


  # Priority queue ordering PhrasePositions by document, breaking ties by
  # within-document position, so the least-advanced element is on top.
  class PhraseQueue < Ferret::Utils::PriorityQueue
    def less_than(pp1, pp2)
      if (pp1.doc == pp2.doc)
        return pp1.position < pp2.position
      else
        return pp1.doc < pp2.doc
      end
    end
  end

end