ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,294 @@
1
module Ferret::Search
  # Scorer for boolean combinations of sub-scorers.
  #
  # The actual matching and score summing is delegated to a tree built from
  # ConjunctionScorer, DisjunctionSumScorer, ReqOptSumScorer and
  # ReqExclScorer. This class only adds the coordination factor on top.
  #
  # Implements skip_to(), and has no limitations on the numbers of added
  # scorers.
  class BooleanScorer < Scorer
    attr_reader :required_scorers, :coordinator

    # Counts how many non-prohibited sub-scorers matched the document being
    # scored and turns that count into a coordination factor.
    class Coordinator
      attr_accessor :max_coord, :nr_matchers

      # similarity:: object answering coord(overlap, max_overlap)
      def initialize(similarity)
        @max_coord = 0 # to be increased for each non prohibited scorer
        @coord_factors = nil
        @similarity = similarity
      end

      # Precomputes the coordination factor for every possible number of
      # matchers (0..max_coord). Use after all scorers have been added.
      def init
        @coord_factors = Array.new(@max_coord + 1) do |matchers|
          @similarity.coord(matchers, @max_coord)
        end
      end

      # Resets the matcher count. Called before each document is scored.
      def init_doc
        @nr_matchers = 0
      end

      # Returns the precomputed factor for the current matcher count.
      def coord_factor
        @coord_factors[@nr_matchers]
      end
    end

    # similarity:: handed to Scorer and used by the Coordinator to compute
    #              coordination factors.
    def initialize(similarity)
      super(similarity)
      @required_scorers = []
      @optional_scorers = []
      @prohibited_scorers = []
      # The scorer to which all scoring will be delegated, except for
      # computing and using the coordination factor. Built on first use by
      # #next?, #skip_to or #each_hit.
      @counting_sum_scorer = nil
      @coordinator = Coordinator.new(similarity)
    end

    # Registers a sub-scorer.
    #
    # scorer:: the sub-scorer to add
    # occur::  one of the BooleanClause::Occur constants. Every scorer that
    #          is not MUST_NOT raises the coordinator's max_coord by one.
    def add_scorer(scorer, occur)
      @coordinator.max_coord += 1 unless occur == BooleanClause::Occur::MUST_NOT

      # NOTE: "when X then" replaces the Ruby 1.8-only "when X:" form, which
      # is a syntax error on Ruby 1.9 and later.
      case occur
      when BooleanClause::Occur::MUST     then @required_scorers << scorer
      when BooleanClause::Occur::SHOULD   then @optional_scorers << scorer
      when BooleanClause::Occur::MUST_NOT then @prohibited_scorers << scorer
      end
    end

    # Initialize the match counting scorer that sums all the scores.
    #
    # When "counting" is used in a name it means counting the number of
    # matching scorers. When "sum" is used in a name it means score value
    # summing over the matching scorers.
    def init_counting_sum_scorer
      @coordinator.init
      @counting_sum_scorer = make_counting_sum_scorer
    end

    # Wraps one scorer and counts it as a single match on the parent's
    # coordinator every time it is scored. Everything else is delegated.
    class SingleMatchScorer < Scorer
      def initialize(parent_scorer, scorer)
        super(scorer.similarity)
        @scorer = scorer
        @parent_scorer = parent_scorer
      end

      def score
        @parent_scorer.coordinator.nr_matchers += 1
        @scorer.score
      end

      def doc
        @scorer.doc
      end

      def next?
        @scorer.next?
      end

      def skip_to(doc_nr)
        @scorer.skip_to(doc_nr)
      end

      def explain(doc_nr)
        @scorer.explain(doc_nr)
      end
    end

    # A DisjunctionSumScorer that, when scored, adds its own number of
    # matching sub-scorers to the parent's coordinator.
    class CountingDisjunctionSumScorer < DisjunctionSumScorer
      def initialize(parent_scorer, scorers)
        super(scorers)
        @parent_scorer = parent_scorer
      end

      def score
        # @nr_matchers is maintained by DisjunctionSumScorer
        @parent_scorer.coordinator.nr_matchers += @nr_matchers
        super
      end
    end

    # Builds a disjunction in which each scorer from the list is counted as
    # a single matcher.
    def counting_disjunction_sum_scorer(scorers)
      CountingDisjunctionSumScorer.new(self, scorers)
    end

    # A ConjunctionScorer that, the first time each document is scored, adds
    # the number of required scorers to the parent's coordinator.
    class CountingConjunctionScorer < ConjunctionScorer
      def initialize(parent_scorer, similarity)
        super(similarity)
        @parent_scorer = parent_scorer
        @required_nr_matchers = parent_scorer.required_scorers.size
        @last_scored_doc = -1
      end

      def score
        # Only count the required matchers once per document, even if the
        # score is requested several times.
        if @parent_scorer.doc > @last_scored_doc
          @last_scored_doc = @parent_scorer.doc
          @parent_scorer.coordinator.nr_matchers += @required_nr_matchers
        end

        super
      end
    end

    # Builds a conjunction in which each scorer from the list is counted as
    # a single matcher.
    #
    # required_scorers:: the scorers to combine. The matcher count used by
    #                    the result is taken from this scorer's own
    #                    required_scorers, so callers are expected to pass
    #                    that same list.
    def counting_conjunction_sum_scorer(required_scorers)
      ccs = CountingConjunctionScorer.new(self, Similarity.default)
      required_scorers.each { |scorer| ccs << scorer }
      ccs
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses required_scorers, optional_scorers and prohibited_scorers.
    # Each scorer is counted as a single matcher.
    def make_counting_sum_scorer
      if @required_scorers.empty?
        # only prohibited scorers
        return NonMatchingScorer.new if @optional_scorers.empty?

        # With no required scorers at least one optional scorer has to
        # match, so the optional scorers take the "required" slot and no
        # optional scorers are left.
        promoted =
          if @optional_scorers.size == 1
            SingleMatchScorer.new(self, @optional_scorers[0])
          else
            counting_disjunction_sum_scorer(@optional_scorers)
          end
        make_counting_sum_scorer2(promoted, [])
      elsif @required_scorers.size == 1
        make_counting_sum_scorer2(
          SingleMatchScorer.new(self, @required_scorers[0]),
          @optional_scorers)
      else
        make_counting_sum_scorer2(
          counting_conjunction_sum_scorer(@required_scorers),
          @optional_scorers)
      end
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses the arguments and prohibited_scorers.
    #
    # required_counting_sum_scorer:: A required scorer already built.
    # optional_scorers::             A list of optional scorers, possibly
    #                                empty.
    def make_counting_sum_scorer2(required_counting_sum_scorer, optional_scorers)
      if optional_scorers.empty?
        exclude_prohibited(required_counting_sum_scorer)
      elsif optional_scorers.size == 1
        make_counting_sum_scorer3(
          required_counting_sum_scorer,
          SingleMatchScorer.new(self, optional_scorers[0]))
      else
        make_counting_sum_scorer3(
          required_counting_sum_scorer,
          counting_disjunction_sum_scorer(optional_scorers))
      end
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses the arguments and prohibited_scorers.
    #
    # required_counting_sum_scorer:: A required scorer already built.
    # optional_counting_sum_scorer:: An optional scorer already built.
    def make_counting_sum_scorer3(required_counting_sum_scorer,
                                  optional_counting_sum_scorer)
      ReqOptSumScorer.new(
        exclude_prohibited(required_counting_sum_scorer),
        optional_counting_sum_scorer)
    end

    # Expert: Iterates over all matching documents, yielding the document
    # number and the score. Builds the counting sum scorer on first use.
    def each_hit # :yields: doc, score
      init_counting_sum_scorer if @counting_sum_scorer.nil?

      while @counting_sum_scorer.next?
        yield(@counting_sum_scorer.doc, score)
      end
    end

    # Expert: Iterates over matching documents in a range.
    #
    # NOTE: #next? needs to be called first; otherwise the counting sum
    # scorer has not been built yet and this method fails on nil.
    #
    # max::     Do not score documents past this. Default will search all
    #           documents available.
    # returns:: true if more matching documents may remain.
    def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
      doc_nr = @counting_sum_scorer.doc
      while doc_nr < max
        yield(doc_nr, score)
        return false unless @counting_sum_scorer.next?
        doc_nr = @counting_sum_scorer.doc
      end
      true
    end

    # Returns the current document number of the delegate scorer.
    def doc
      @counting_sum_scorer.doc
    end

    # Advances to the next matching document, building the delegate scorer
    # on first use.
    def next?
      init_counting_sum_scorer if @counting_sum_scorer.nil?
      @counting_sum_scorer.next?
    end

    # Returns the summed score of the current document multiplied by the
    # coordination factor. Scoring the delegate updates the coordinator's
    # matcher count as a side effect, so the count is reset first.
    def score
      @coordinator.init_doc
      sum = @counting_sum_scorer.score
      sum * @coordinator.coord_factor
    end

    # Skips to the first match beyond the current whose document number is
    # greater than or equal to a given target.
    #
    # When this method is used the #explain(int) method should not be used.
    #
    # target::  The target document number.
    # returns:: true iff there is such a match.
    def skip_to(target)
      init_counting_sum_scorer if @counting_sum_scorer.nil?
      @counting_sum_scorer.skip_to(target)
    end

    # TODO: Implement an explanation of the coordination factor.
    #
    # doc::    The document number for the explanation.
    # raises:: NotImplementedError
    def explain(doc)
      raise NotImplementedError
      # How to explain the coordination factor?
      #init_counting_sum_scorer()
      #return @counting_sum_scorer.explain(doc); # misses coord factor.
    end

    private

    # Wraps +required+ so that documents matched by any prohibited scorer
    # are excluded. Returns +required+ untouched when nothing is prohibited.
    def exclude_prohibited(required)
      case @prohibited_scorers.size
      when 0 then required
      when 1 then ReqExclScorer.new(required, @prohibited_scorers[0])
      else
        ReqExclScorer.new(required,
                          DisjunctionSumScorer.new(@prohibited_scorers))
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
module Ferret::Search
  require 'monitor'

  # Wraps another filter's result and caches it. The caching
  # behavior is like QueryFilter. The purpose is to allow
  # filters to simply filter, and then wrap with this class to add
  # caching, keeping the two concerns decoupled yet composable.
  class CachingWrapperFilter < Filter
    # filter:: Filter to cache results of
    def initialize(filter)
      @filter = filter
      # The cache is created up front rather than lazily inside #bits: a
      # lazy, unsynchronized creation lets two threads each build their own
      # cache (and monitor), so one of them would store its result in a hash
      # that is then thrown away.
      @cache = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
    end

    # Returns the wrapped filter's bits for +reader+, computing them only
    # when no cached value exists for that reader.
    #
    # The wrapped filter is deliberately invoked outside the lock, so two
    # threads may compute the bits for the same reader concurrently; the
    # last one to finish wins.
    def bits(reader)
      cached = @cache.synchronize { @cache[reader] } # check cache
      return cached if cached

      bits = @filter.bits(reader)

      @cache.synchronize { @cache[reader] = bits } # update cache

      bits
    end

    def to_s
      "CachingWrapperFilter(#{@filter})"
    end
  end
end
@@ -0,0 +1,99 @@
1
+ require 'set'
2
module Ferret::Search
  # Scorer for conjunctions, sets of queries, all of which are required.
  #
  # The sub-scorers are kept in an array ordered by their current document;
  # a match is found when the first and the last scorer sit on the same
  # document.
  class ConjunctionScorer < Scorer

    def initialize(similarity)
      super(similarity)
      @scorers = []
      @first_time = true
      @more = true
    end

    # Appends a required sub-scorer. Also available as <<.
    def add(scorer)
      @scorers << scorer
    end
    alias :<< :add

    # The sub-scorer currently at the front of the list.
    def first
      @scorers.first
    end

    # The sub-scorer currently at the back of the list.
    def last
      @scorers.last
    end

    # Document number of the front sub-scorer.
    def doc
      first.doc
    end

    # Moves to the next document on which every sub-scorer matches.
    def next?
      if @first_time
        init(true)
      elsif @more
        # trigger further scanning
        @more = last.next?
      end
      do_next
    end

    # Leapfrogs the trailing scorer up to the leading one until both ends of
    # the list agree on a document, or a scorer runs out.
    #
    # returns:: whether a document with all clauses was found
    def do_next
      while @more && first.doc < last.doc
        # skip first up to last, then rotate it to the back
        @more = first.skip_to(last.doc)
        @scorers << @scorers.shift
      end
      @more
    end

    # Skips every sub-scorer to +target+, then looks for the next document
    # they all share.
    def skip_to(target)
      init(false) if @first_time

      # once a scorer is exhausted the remaining ones are left untouched
      @scorers.each { |scorer| @more &&= scorer.skip_to(target) }

      sort_scorers if @more # restore ordering by document

      do_next
    end

    # Sums the scores of all of the scorers for the current document and
    # applies the coordination factor computed in #init.
    def score
      total = @scorers.inject(0.0) { |sum, scorer| sum + scorer.score }
      total * @coord
    end

    # One-off setup run by the first call to #next? or #skip_to.
    #
    # init_scorers:: when true, each sub-scorer is advanced to its first
    #                entry and the list is sorted.
    def init(init_scorers)
      nr_scorers = @scorers.size

      # compute coord factor
      @coord = similarity.coord(nr_scorers, nr_scorers)

      @more = nr_scorers > 0

      if init_scorers
        # move each scorer to its first entry, stopping at the first
        # exhausted one
        @scorers.each { |scorer| @more &&= scorer.next? }
        sort_scorers if @more
      end

      @first_time = false
    end

    # Orders the sub-scorers in place by their current document number.
    def sort_scorers
      @scorers.sort! { |left, right| left.doc <=> right.doc }
    end

    # Not supported by this scorer.
    def explain(doc)
      raise NotImplementedError
    end
  end
end
@@ -0,0 +1,203 @@
1
module Ferret::Search
  # A Scorer for OR like queries, counterpart of +ConjunctionScorer+.
  # This Scorer implements Scorer#skip_to(int) and uses skip_to() on the
  # given Scorers.
  class DisjunctionSumScorer < Scorer
    # the sub-scorers
    attr_accessor :sub_scorers

    # A +PriorityQueue+ that orders by Scorer#doc().
    class ScorerQueue < Ferret::Utils::PriorityQueue
      def less_than(scorer1, scorer2)
        scorer1.doc < scorer2.doc
      end
    end

    # Construct a +DisjunctionSumScorer+.
    #
    # sub_scorers::         A collection of at least two subscorers.
    # minimum_nr_matchers:: The positive minimum number of subscorers that
    #                       should match to match this query.
    #                       When +minimum_nr_matchers+ is bigger than the
    #                       number of +sub_scorers+, no matches will be
    #                       produced.
    #                       When +minimum_nr_matchers+ equals the number of
    #                       +sub_scorers+, it is more efficient to use
    #                       +ConjunctionScorer+.
    # raises::              ArgumentError if +minimum_nr_matchers+ is not
    #                       positive or fewer than two subscorers are given.
    def initialize(sub_scorers, minimum_nr_matchers = 1)
      super(nil)

      # The number of subscorers.
      @nr_scorers = sub_scorers.size

      # The document number of the current match.
      @current_doc = -1
      # The summed score of the current match.
      @current_score = nil
      # The number of subscorers that provide the current match.
      @nr_matchers = -1

      if minimum_nr_matchers <= 0
        raise ArgumentError, "Minimum nr of matchers must be positive"
      end
      if @nr_scorers <= 1
        raise ArgumentError, "There must be at least 2 sub_scorers"
      end

      @minimum_nr_matchers = minimum_nr_matchers
      @sub_scorers = sub_scorers

      # The @scorer_queue contains all subscorers ordered by their current
      # doc, with the minimum at the top.
      #
      # The @scorer_queue is initialized the first time next? or skip_to()
      # is called.
      #
      # An exhausted scorer is immediately removed from the @scorer_queue.
      #
      # If less than the @minimum_nr_matchers scorers remain in the
      # @scorer_queue next? and skip_to() return false.
      #
      # After each call to next? or skip_to()
      # +@current_score+ is the total score of the current matching doc,
      # +@nr_matchers+ is the number of matching scorers,
      # and all scorers are after the matching doc, or are exhausted.
      @scorer_queue = nil
    end

    # Called the first time next? or skip_to() is called to
    # initialize +@scorer_queue+. Subscorers that have no first document
    # are left out of the queue.
    def init_scorer_queue
      @scorer_queue = ScorerQueue.new(@nr_scorers)
      @sub_scorers.each do |sub_scorer|
        # doc() method will be used in @scorer_queue.
        @scorer_queue.insert(sub_scorer) if sub_scorer.next?
      end
    end

    # Advances to the next document matched by at least
    # +@minimum_nr_matchers+ subscorers.
    #
    # returns:: true iff there is such a document.
    def next?
      init_scorer_queue if @scorer_queue.nil?

      return false if @scorer_queue.size < @minimum_nr_matchers
      advance_after_current
    end

    # Advance all subscorers after the current document determined by the
    # top of the +@scorer_queue+.
    # Repeat until at least the minimum number of subscorers match on the
    # same document and all subscorers are after that document or are
    # exhausted.
    #
    # On entry the +@scorer_queue+ has at least +@minimum_nr_matchers+
    # available. At least the scorer with the minimum document number will
    # be advanced.
    #
    # returns:: true iff there is a match.
    #
    # In case there is a match, +@current_doc+, +@current_score+,
    # and +@nr_matchers+ describe the match.
    #
    # TODO Investigate whether it is possible to use skip_to() when
    # the minimum number of matchers is bigger than one, ie. try and use the
    # character of ConjunctionScorer for the minimum number of matchers.
    def advance_after_current
      loop do # repeat until minimum nr of matchers
        top = @scorer_queue.top
        @current_doc = top.doc
        @current_score = top.score
        @nr_matchers = 1

        loop do # until all subscorers are after @current_doc
          if top.next?
            @scorer_queue.adjust_top
          else
            @scorer_queue.pop
            if @scorer_queue.size < (@minimum_nr_matchers - @nr_matchers)
              # Not enough subscorers left for a match on this document,
              # and also no more chance of any further match.
              return false
            end
            # nothing more to advance, check for last match.
            break if @scorer_queue.size == 0
          end

          top = @scorer_queue.top
          # All remaining subscorers are after @current_doc.
          break if top.doc != @current_doc

          @current_score += top.score
          @nr_matchers += 1
        end

        return true if @nr_matchers >= @minimum_nr_matchers
        return false if @scorer_queue.size < @minimum_nr_matchers
      end
    end

    # Returns the score of the current document matching the query.
    # Initially invalid, until #next? is called the first time.
    def score
      @current_score
    end

    # Returns the document number of the current document matching the
    # query. Initially invalid, until #next? is called the first time.
    def doc
      @current_doc
    end

    # Returns the number of subscorers matching the current document.
    # Initially invalid, until #next? is called the first time.
    def number_of_matchers
      @nr_matchers
    end

    # Skips to the first match beyond the current whose document number is
    # greater than or equal to a given target.
    #
    # When this method is used the #explain(int) method should not be used.
    #
    # The implementation uses the skip_to() method on the subscorers.
    #
    # target::  The target document number.
    # returns:: true iff there is such a match.
    def skip_to(target)
      init_scorer_queue if @scorer_queue.nil?

      return false if @scorer_queue.size < @minimum_nr_matchers

      # never move backwards or stay on the current match
      target = @current_doc + 1 if target <= @current_doc

      loop do
        top = @scorer_queue.top
        if top.doc >= target
          return advance_after_current
        elsif top.skip_to(target)
          @scorer_queue.adjust_top
        else
          @scorer_queue.pop
          return false if @scorer_queue.size < @minimum_nr_matchers
        end
      end
    end

    # Gives an explanation for the score of a given document.
    # TODO Show the resulting score. See BooleanScorer.explain() on how to
    # do this.
    def explain(doc)
      e = Explanation.new
      # Interpolation is required here: "String + Integer" raises TypeError.
      e.description = "At least #{@minimum_nr_matchers} of"
      @sub_scorers.each do |sub_scorer|
        e.details << sub_scorer.explain(doc)
      end
      e
    end
  end
end