ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,65 @@
1
module Ferret::Search
  # A Query matching documents that contain a subset of the terms provided
  # by a FilteredTermEnum enumeration.
  #
  # +MultiTermQuery+ is not designed to be used by itself: it is never
  # initialized with a FilteredTermEnum, so a subclass must supply one by
  # overriding #get_term_enum. For example, WildcardQuery and FuzzyQuery
  # extend +MultiTermQuery+ to provide WildcardTermEnum and FuzzyTermEnum
  # respectively.
  class MultiTermQuery < Query
    attr_reader :term

    # Constructs a query for terms matching +term+.
    def initialize(term)
      super()
      @term = term
    end

    # Subclass hook: construct the enumeration used to expand the pattern
    # term. Must be overridden.
    def get_term_enum(reader)
      raise NotImplementedError
    end

    # Expands the pattern term into a BooleanQuery of SHOULD TermQuery
    # clauses, one per matching term, each boosted by the enumerator's
    # difference measure.
    def rewrite(reader)
      enumerator = get_term_enum(reader)
      query = BooleanQuery.new(true)
      begin
        loop do
          matched = enumerator.term()
          unless matched.nil?
            term_query = TermQuery.new(matched)            # found a match
            term_query.boost = boost() * enumerator.difference() # set the boost
            query.add_query(term_query, BooleanClause::Occur::SHOULD) # add to query
          end
          break unless enumerator.next?
        end
      ensure
        enumerator.close()
      end
      return query
    end

    # Prints a user-readable version of this query.
    def to_s(field = nil)
      buffer = ""
      buffer << "#{@term.field}:" unless @term.field == field
      buffer << @term.text
      buffer << "^#{boost()}" unless boost() == 1.0
      return buffer
    end

    # Two MultiTermQueries are equal when they wrap an equal term.
    def eql?(o)
      return false unless o.instance_of? MultiTermQuery
      return term == o.term
    end
    alias :== :eql?

    def hash()
      return term.hash()
    end
  end
end
@@ -0,0 +1,22 @@
1
module Ferret::Search
  # A scorer that never matches any document at all.
  class NonMatchingScorer < Scorer
    def initialize()
      super(nil) # no similarity needed since nothing is ever scored
    end

    # There is never a next matching document.
    def next?
      false
    end

    # Skipping can never land on a match.
    def skip_to(target)
      false
    end

    # Explains the (non-)match for +doc+: no document ever matches.
    def explain(doc)
      explanation = Explanation.new()
      explanation.description = "No document matches."
      explanation
    end
  end
end
@@ -0,0 +1,55 @@
1
module Ferret
  module Search
    # Tracks one phrase term's document and position state while a phrase
    # scorer walks several term-position enumerations in parallel.
    # Instances are chained into a singly linked list via #next.
    class PhrasePositions
      attr_reader :doc, :position
      attr_accessor :next

      # tp_enum:: the term-positions enumeration for one phrase term
      # offset::  the term's offset within the phrase; subtracted from raw
      #           positions so that aligned terms report equal positions
      def initialize(tp_enum, offset)
        @tp_enum = tp_enum
        @offset = offset
        @count = @position = @doc = -1
        @next = nil
      end

      # Advances to the next document. Returns false (after closing the
      # stream and parking @doc on a sentinel) when exhausted.
      def next?()
        return exhausted() unless @tp_enum.next?
        on_new_doc()
        true
      end

      # Skips to the first document >= +target+. Returns false (after
      # closing the stream and parking @doc on a sentinel) when none exists.
      def skip_to(target)
        return exhausted() unless @tp_enum.skip_to(target)
        on_new_doc()
        true
      end

      # Loads the position count for the current doc and reads the first
      # position.
      def first_position()
        @count = @tp_enum.freq # read first pos
        next_position()
      end

      # Reads the next position for the current doc, normalized by the
      # phrase offset; returns false once all positions are consumed.
      def next_position()
        @count -= 1
        if @count >= 0 # read subsequent pos's
          @position = @tp_enum.next_position() - @offset
          true
        else
          false
        end
      end

      def to_s
        "pp->(doc => #{@doc}, position => #{position})"
      end

      private

      # Closes the underlying stream and parks @doc beyond any real doc id
      # so sorted scanning naturally skips this entry.
      def exhausted()
        @tp_enum.close() # close stream
        @doc = Scorer::MAX_DOCS # sentinel value
        false
      end

      # Records the enumeration's current doc and resets the position.
      def on_new_doc()
        @doc = @tp_enum.doc
        @position = 0
      end
    end
  end
end
@@ -0,0 +1,217 @@
1
module Ferret::Search
  # A Query that matches documents containing a particular sequence of
  # terms. A PhraseQuery is built by QueryParser for input like
  # +"new york"+.
  #
  # This query may be combined with other terms or queries with a
  # BooleanQuery.
  class PhraseQuery < Query
    def initialize()
      super
      @slop = 0
      @terms = []
      @positions = []
      @field = nil
    end

    # Sets the number of other words permitted between words in query phrase.
    # If zero, then this is an exact phrase search. For larger values this
    # works like a +WITHIN+ or +NEAR+ operator.
    #
    # The slop is in fact an edit-distance, where the units correspond to
    # moves of terms in the query phrase out of position. For example, to
    # switch the order of two words requires two moves (the first move places
    # the words atop one another), so to permit re-orderings of phrases, the
    # slop must be at least two.
    #
    # More exact matches are scored higher than sloppier matches, thus search
    # results are sorted by exactness.
    #
    # The slop is zero by default, requiring exact matches.
    attr_accessor :slop
    attr_reader :terms, :positions, :field

    # Adds a term to the end of the query phrase.
    #
    # The relative position of the term is the one immediately after the last
    # term added, unless explicitly specified. By specifying explicitly,
    # you can have phrases with more than one term at the same position or
    # phrases with gaps (e.g. in connection with stopwords).
    #
    # term:: the term to search for
    # position:: the relative position of the term to the rest of the terms
    #            int the query.
    def add(term, position = nil, pos_inc = 1)
      if position.nil?
        position = @positions.empty? ? 0 : @positions.last + pos_inc
      end

      if @terms.empty?
        @field = term.field
      elsif term.field != @field
        raise ArgumentError, "All phrase terms must be in the same field: #{term}"
      end

      @terms << term
      @positions << position
    end

    def <<(term)
      add(term)
      self
    end

    # Weight for a PhraseQuery: combines the phrase idf with the query
    # boost and normalization, and builds the appropriate phrase scorer.
    class PhraseWeight < Weight
      attr_reader :query, :value

      def initialize(query, searcher)
        @query = query
        @similarity = query.similarity(searcher)
        @idf = @similarity.idf_phrase(@query.terms, searcher)
      end

      def to_s() return "phrase_weight(#{@value})" end

      def sum_of_squared_weights()
        @query_weight = @idf * @query.boost() # compute query weight
        @query_weight * @query_weight         # square it
      end

      def normalize(query_norm)
        @query_norm = query_norm
        @query_weight *= query_norm   # normalize query weight
        @value = @query_weight * @idf # idf for document
      end

      # Builds an Exact- or SloppyPhraseScorer depending on slop; nil when
      # the phrase is empty or any term is absent from the index.
      def scorer(reader)
        return nil if @query.terms.empty? # optimize zero-term case

        tps = @query.terms.map do |term|
          tp = reader.term_positions_for(term)
          return nil if tp.nil? # a missing term means no match is possible
          tp
        end

        if @query.slop == 0 # optimize exact case
          ExactPhraseScorer.new(self, tps, @query.positions,
                                @similarity,
                                reader.get_norms(@query.field))
        else
          SloppyPhraseScorer.new(self, tps, @query.positions,
                                 @similarity,
                                 @query.slop,
                                 reader.get_norms(@query.field))
        end
      end

      # Produces a nested Explanation of the score for +doc+.
      def explain(reader, doc)
        result = Explanation.new()
        result.description = "weight(#{@query} in #{doc}), product of:"

        doc_freqs = @query.terms.map do |term|
          "#{term.text}=#{reader.doc_freq(term)}"
        end.join(", ")

        idf_expl = Explanation.new(@idf, "idf(#{@query.field}:<#{doc_freqs}>)")

        # explain query weight
        query_expl = Explanation.new()
        query_expl.description = "query_weight(#{@query}), product of:"

        boost = @query.boost()
        query_expl << Explanation.new(boost, "boost") unless boost == 1.0
        query_expl << idf_expl

        query_norm_expl = Explanation.new(@query_norm, "query_norm")
        query_expl << query_norm_expl

        query_expl.value = boost * @idf * query_norm_expl.value

        result << query_expl

        # explain field weight
        field_expl = Explanation.new()
        field_expl.description =
          "field_weight(#{query} in #{doc}), product of:"

        tf_expl = scorer(reader).explain(doc)
        field_expl << tf_expl
        field_expl << idf_expl

        field_norm_expl = Explanation.new()
        field_norms = reader.get_norms(@query.field)
        field_norm =
          field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
        field_norm_expl.value = field_norm
        field_norm_expl.description =
          "field_norm(field=#{@query.field}, doc=#{doc})"
        field_expl << field_norm_expl

        field_expl.value = tf_expl.value * @idf * field_norm

        result << field_expl

        # combine them
        result.value = query_expl.value * field_expl.value

        # collapse to the field explanation when the query part is a no-op
        query_expl.value == 1.0 ? field_expl : result
      end
    end

    # A one-term "phrase" degenerates to a plain TermQuery weight.
    def create_weight(searcher)
      if @terms.size == 1 # optimize one-term case
        tq = TermQuery.new(@terms[0])
        tq.boost = boost()
        return tq.create_weight(searcher)
      end
      PhraseWeight.new(self, searcher)
    end

    # See Query#extract_terms()
    def extract_terms(query_terms)
      query_terms.add_all(@terms)
    end

    # Prints a user-readable version of this query; position gaps render
    # as "<>" placeholders.
    def to_s(f = nil)
      buffer = ""
      buffer << "#{@field}:" unless @field == f
      buffer << '"'
      last_pos = -1
      @terms.zip(@positions) do |term, pos|
        last_pos.upto(pos - 2) { buffer << "<> " }
        last_pos = pos
        buffer << "#{term.text} "
      end
      buffer.rstrip!
      buffer << '"'
      buffer << "~#{slop}" unless slop == 0
      buffer << "^#{boost()}" unless boost() == 1.0
      buffer
    end

    # Returns true iff +o+ is equal to this.
    def eql?(o)
      o.instance_of?(PhraseQuery) &&
        boost() == o.boost() && @slop == o.slop &&
        @terms == o.terms && @positions == o.positions
    end
    alias :== :eql?

    # Returns a hash code value for this object.
    def hash()
      boost().hash ^ slop.hash ^ @terms.hash ^ @positions.hash
    end
  end
end
@@ -0,0 +1,153 @@
1
module Ferret::Search
  # Base class for phrase scorers. Keeps the per-term PhrasePositions
  # both as a singly linked list (@first/@last) and, while sorting, in a
  # priority queue ordered by (doc, position). Subclasses supply
  # #phrase_freq to decide whether the aligned terms form a phrase.
  class PhraseScorer < Scorer
    attr_reader :first, :last
    protected :first, :last

    def initialize(weight, tps, positions, similarity, norms)
      super(similarity)
      @norms = norms
      @weight = weight
      @value = weight.value
      @first_time = true
      @more = true

      # convert tps to a linked list, one node per phrase term
      tps.length.times do |i|
        pp = PhrasePositions.new(tps[i], positions[i])
        if @last.nil?
          @first = pp
        else
          @last.next = pp # add next to end of list
        end
        @last = pp
      end

      @pq = PhraseQueue.new(tps.length) # construct empty pq
    end

    def doc()
      @first.doc
    end

    def next?
      if @first_time
        init()
        @first_time = false
      elsif @more
        @more = @last.next? # trigger further scanning
      end
      do_next()
    end

    # next without initial increment
    def do_next()
      while @more
        # line every enumerator up on a doc containing all the terms
        while @more && @first.doc < @last.doc
          @more = @first.skip_to(@last.doc) # skip first upto last
          first_to_last()                   # and move it to the end
        end

        if @more
          # found a doc with all of the terms
          @freq = phrase_freq()        # check for phrase
          return true if @freq != 0.0  # found a match
          @more = @last.next?          # no match; trigger further scanning
        end
      end
      false # no more matches
    end

    # Yields each PhrasePositions node in list order.
    def each()
      pp = @first
      until pp.nil?
        yield pp
        pp = pp.next
      end
    end

    def score()
      raw = similarity().tf(@freq) * @value            # raw score
      raw * Similarity.decode_norm(@norms[@first.doc]) # normalize
    end

    def skip_to(target)
      each() { |pp| break if not @more = pp.skip_to(target) }
      sort() if @more # re-sort
      do_next()
    end

    # Subclass hook: frequency of the phrase in the current doc.
    def phrase_freq()
      raise NotImplementedError
    end

    # Primes every enumerator and sorts the list for the first scan.
    def init()
      each { |pp| break if not @more = pp.next? }
      sort() if @more
    end

    # Rebuilds the linked list in (doc, position) order via the queue.
    def sort()
      @pq.clear()
      each { |pp| @pq.push(pp) }
      pq_to_list()
    end

    # Drains the priority queue back into the @first/@last linked list.
    def pq_to_list()
      @last = @first = nil
      until @pq.top().nil?
        pp = @pq.pop()
        if @last.nil?
          @first = pp
        else
          @last.next = pp # add next to end of list
        end
        @last = pp
        pp.next = nil
      end
    end

    # Rotates the list head to the tail.
    def first_to_last()
      @last.next = @first # move first to end of list
      @last = @first
      @first = @first.next
      @last.next = nil
    end

    # Explains the tf component of the score for +doc+ by scanning
    # forward to it.
    def explain(doc)
      tf_explanation = Explanation.new()

      while next? && doc() < doc
      end

      phrase_freq = (doc() == doc) ? @freq : 0.0
      tf_explanation.value = @similarity.tf(phrase_freq)
      tf_explanation.description = "tf(phrase_freq=#{phrase_freq})"

      tf_explanation
    end

    def to_s() return "phrase_scorer(#{@weight})" end
  end

  # Orders PhrasePositions by document id, breaking ties by position.
  class PhraseQueue < Ferret::Utils::PriorityQueue
    def less_than(pp1, pp2)
      if pp1.doc == pp2.doc
        pp1.position < pp2.position
      else
        pp1.doc < pp2.doc
      end
    end
  end
end