ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,294 @@
module Ferret::Search
  # Scorer for boolean queries built from required (MUST), optional (SHOULD)
  # and prohibited (MUST_NOT) sub-scorers.
  #
  # Matching and score summing are delegated to a scorer tree assembled from
  # ConjunctionScorer, DisjunctionSumScorer, ReqOptSumScorer and
  # ReqExclScorer; this class multiplies the summed score by a coordination
  # factor.
  #
  # Implements skip_to(), and has no limitations on the numbers of added
  # scorers.
  class BooleanScorer < Scorer
    attr_reader :required_scorers, :coordinator

    # Counts the sub-scorers that matched the current document and maps that
    # count to a coordination factor obtained from the similarity.
    class Coordinator
      attr_accessor :max_coord, :nr_matchers

      # similarity:: object whose #coord(overlap, max_overlap) supplies the
      #              coordination factors.
      def initialize(similarity)
        @max_coord = 0 # to be increased for each non prohibited scorer
        @nr_matchers = 0
        @coord_factors = nil
        @similarity = similarity
      end

      # Precomputes one coordination factor for every possible number of
      # matchers (0..max_coord). Use after all scorers have been added.
      def init
        @coord_factors = (0..@max_coord).map do |i|
          @similarity.coord(i, @max_coord)
        end
      end

      # Resets the matcher count before a new document is scored.
      def init_doc
        @nr_matchers = 0
      end

      # Returns the precomputed factor for the current matcher count.
      def coord_factor
        return @coord_factors[@nr_matchers]
      end
    end

    # similarity:: passed on to Scorer and used by the Coordinator.
    def initialize(similarity)
      super(similarity)
      @required_scorers = []
      @optional_scorers = []
      @prohibited_scorers = []
      # The scorer to which all scoring will be delegated, except for
      # computing and using the coordination factor. Built lazily by
      # #init_counting_sum_scorer.
      @counting_sum_scorer = nil
      @coordinator = Coordinator.new(similarity)
    end

    # Registers a sub-scorer.
    #
    # scorer:: the sub-scorer to add.
    # occur::  one of BooleanClause::Occur::MUST, SHOULD or MUST_NOT. Every
    #          value other than MUST_NOT raises the coordinator's max_coord.
    def add_scorer(scorer, occur)
      unless occur == BooleanClause::Occur::MUST_NOT
        @coordinator.max_coord += 1
      end

      # "when X then" rather than the "when X:" form, which Ruby 1.9 removed.
      case occur
      when BooleanClause::Occur::MUST then @required_scorers << scorer
      when BooleanClause::Occur::SHOULD then @optional_scorers << scorer
      when BooleanClause::Occur::MUST_NOT then @prohibited_scorers << scorer
      end
    end

    # Initialize the match counting scorer that sums all the scores.
    #
    # When "counting" is used in a name it means counting the number of
    # matching scorers. When "sum" is used in a name it means score value
    # summing over the matching scorers.
    def init_counting_sum_scorer
      @coordinator.init
      @counting_sum_scorer = make_counting_sum_scorer
    end

    # Count a scorer as a single match: every call to #score adds one to the
    # parent's coordinator before delegating to the wrapped scorer.
    class SingleMatchScorer < Scorer
      def initialize(parent_scorer, scorer)
        super(scorer.similarity)
        @scorer = scorer
        @parent_scorer = parent_scorer
      end

      def score
        @parent_scorer.coordinator.nr_matchers += 1
        return @scorer.score
      end

      def doc
        return @scorer.doc
      end

      def next?
        return @scorer.next?
      end

      def skip_to(doc_nr)
        return @scorer.skip_to(doc_nr)
      end

      def explain(doc_nr)
        return @scorer.explain(doc_nr)
      end
    end

    # A DisjunctionSumScorer that, when scored, adds its own number of
    # matching sub-scorers to the parent's coordinator.
    class CountingDisjunctionSumScorer < DisjunctionSumScorer
      def initialize(parent_scorer, scorers)
        super(scorers)
        @parent_scorer = parent_scorer
      end

      def score
        @parent_scorer.coordinator.nr_matchers += @nr_matchers
        return super
      end
    end

    # Builds a disjunction over +scorers+ in which each scorer from the list
    # is counted as a single matcher.
    def counting_disjunction_sum_scorer(scorers)
      return CountingDisjunctionSumScorer.new(self, scorers)
    end

    # A ConjunctionScorer that, when scored, adds the number of required
    # scorers to the parent's coordinator -- at most once per document.
    class CountingConjunctionScorer < ConjunctionScorer
      def initialize(parent_scorer, similarity)
        super(similarity)
        @parent_scorer = parent_scorer
        @required_nr_matchers = parent_scorer.required_scorers.size
        @last_scored_doc = -1
      end

      def score
        # Only count the matchers the first time a given document is scored.
        if @parent_scorer.doc > @last_scored_doc
          @last_scored_doc = @parent_scorer.doc
          @parent_scorer.coordinator.nr_matchers += @required_nr_matchers
        end

        return super
      end
    end

    # Builds a conjunction over +required_scorers+ in which each scorer from
    # the list is counted as a single matcher.
    def counting_conjunction_sum_scorer(required_scorers)
      ccs = CountingConjunctionScorer.new(self, Similarity.default)
      required_scorers.each do |scorer|
        ccs << scorer
      end
      return ccs
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses required_scorers, optional_scorers and prohibited_scorers.
    def make_counting_sum_scorer
      if @required_scorers.empty?
        # Without required scorers the optional ones become the requirement.
        if @optional_scorers.empty?
          return NonMatchingScorer.new # only prohibited scorers
        elsif @optional_scorers.size == 1
          return make_counting_sum_scorer2( # the only optional scorer is required
            SingleMatchScorer.new(self, @optional_scorers[0]),
            []) # no optional scorers left
        else
          return make_counting_sum_scorer2( # at least one optional scorer is required
            counting_disjunction_sum_scorer(@optional_scorers),
            []) # no optional scorers left
        end
      elsif @required_scorers.size == 1
        return make_counting_sum_scorer2(
          SingleMatchScorer.new(self, @required_scorers[0]),
          @optional_scorers)
      else # more required scorers
        return make_counting_sum_scorer2(
          counting_conjunction_sum_scorer(@required_scorers),
          @optional_scorers)
      end
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses the arguments and prohibited_scorers.
    #
    # required_counting_sum_scorer:: A required scorer already built.
    # optional_scorers:: A list of optional scorers, possibly empty.
    def make_counting_sum_scorer2(required_counting_sum_scorer, optional_scorers)
      if optional_scorers.empty?
        return exclude_prohibited(required_counting_sum_scorer)
      elsif optional_scorers.size == 1
        return make_counting_sum_scorer3(
          required_counting_sum_scorer,
          SingleMatchScorer.new(self, optional_scorers[0]))
      else # more optional
        return make_counting_sum_scorer3(
          required_counting_sum_scorer,
          counting_disjunction_sum_scorer(optional_scorers))
      end
    end

    # Returns the scorer to be used for match counting and score summing.
    # Uses the arguments and prohibited_scorers.
    #
    # required_counting_sum_scorer:: A required scorer already built.
    # optional_counting_sum_scorer:: An optional scorer already built.
    def make_counting_sum_scorer3(required_counting_sum_scorer,
                                  optional_counting_sum_scorer)
      return ReqOptSumScorer.new(
        exclude_prohibited(required_counting_sum_scorer),
        optional_counting_sum_scorer)
    end

    # Expert: Iterates over all matching documents, yielding the document
    # number and the score. Builds the counting sum scorer on first use.
    def each_hit # :yields: doc, score
      init_counting_sum_scorer if @counting_sum_scorer.nil?
      while @counting_sum_scorer.next?
        yield(@counting_sum_scorer.doc, score)
      end
    end

    # Expert: Iterates over matching documents in a range.
    #
    # NOTE: #next? needs to be called first; otherwise the counting sum
    # scorer has not been built yet and this method fails on nil.
    #
    # max:: Do not score documents past this. Default will search all
    #       documents available.
    # returns:: true if more matching documents may remain.
    def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
      doc_nr = @counting_sum_scorer.doc
      while doc_nr < max
        yield(doc_nr, score)
        return false unless @counting_sum_scorer.next?
        doc_nr = @counting_sum_scorer.doc
      end
      return true
    end

    # Returns the current document number of the counting sum scorer.
    def doc
      return @counting_sum_scorer.doc
    end

    # Advances to the next matching document, building the counting sum
    # scorer on first use.
    def next?
      init_counting_sum_scorer if @counting_sum_scorer.nil?
      return @counting_sum_scorer.next?
    end

    # Returns the summed score of the current document multiplied by the
    # coordination factor. The matcher count is reset first and is then
    # incremented by the counting wrappers while the sum is computed.
    def score
      @coordinator.init_doc
      sum = @counting_sum_scorer.score
      return sum * @coordinator.coord_factor
    end

    # Skips to the first match beyond the current whose document number is
    # greater than or equal to a given target.
    #
    # When this method is used the #explain(int) method should not be used.
    #
    # target:: The target document number.
    # returns:: true iff there is such a match.
    def skip_to(target)
      init_counting_sum_scorer if @counting_sum_scorer.nil?
      return @counting_sum_scorer.skip_to(target)
    end

    # TODO: Implement an explanation of the coordination factor.
    #
    # doc:: The document number for the explanation.
    # raises:: NotImplementedError
    def explain(doc)
      raise NotImplementedError
      # How to explain the coordination factor?
      #init_counting_sum_scorer()
      #return @counting_sum_scorer.explain(doc); # misses coord factor.
    end

    private

    # Wraps +required_counting_sum_scorer+ so that documents matched by any
    # prohibited scorer are excluded. Returns the scorer unchanged when there
    # are no prohibited scorers.
    def exclude_prohibited(required_counting_sum_scorer)
      if @prohibited_scorers.empty?
        return required_counting_sum_scorer
      elsif @prohibited_scorers.size == 1
        return ReqExclScorer.new(required_counting_sum_scorer,
                                 @prohibited_scorers[0])
      else # more than 1 prohibited
        return ReqExclScorer.new(
          required_counting_sum_scorer,
          DisjunctionSumScorer.new(@prohibited_scorers))
      end
    end
  end
end
@@ -0,0 +1,40 @@
require 'monitor'

module Ferret::Search
  # Wraps another filter's result and caches it. The caching
  # behavior is like QueryFilter. The purpose is to allow
  # filters to simply filter, and then wrap with this class to add
  # caching, keeping the two concerns decoupled yet composable.
  class CachingWrapperFilter < Filter
    # filter:: Filter to cache results of
    def initialize(filter)
      @filter = filter
      # The cache (and the monitor guarding it) is created up front rather
      # than lazily inside #bits, so that concurrent first calls cannot each
      # build and lock on their own cache object.
      # NOTE(review): going by its name WeakKeyHash presumably holds its keys
      # weakly -- confirm against Ferret::Utils::WeakKeyHash.
      @cache = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
    end

    # Returns the wrapped filter's bits for +reader+, computing them only
    # when no truthy entry is cached for that reader.
    #
    # reader:: passed through to the wrapped filter and used as cache key.
    def bits(reader)
      cached = @cache.synchronize { @cache[reader] } # check cache
      return cached if cached

      # The wrapped filter runs outside the lock; two threads missing the
      # cache at the same time may both compute the bits.
      fresh = @filter.bits(reader)

      @cache.synchronize { @cache[reader] = fresh } # update cache

      return fresh
    end

    def to_s
      return "CachingWrapperFilter(#{@filter})"
    end
  end
end
@@ -0,0 +1,99 @@
require 'set'
module Ferret::Search
  # Scorer for conjunctions: a set of sub-scorers that must all match the
  # same document for that document to be a hit.
  class ConjunctionScorer < Scorer

    def initialize(similarity)
      super
      @scorers = []
      @first_time = true
      @more = true
    end

    # Appends a required sub-scorer. Also available as #<<.
    def add(scorer)
      @scorers.push(scorer)
    end
    alias :<< :add

    # The sub-scorer currently at the front of the list.
    def first
      @scorers.first
    end

    # The sub-scorer currently at the back of the list.
    def last
      @scorers.last
    end

    # Document number of the front sub-scorer.
    def doc
      first.doc
    end

    # Moves to the next document on which every sub-scorer agrees.
    # The very first call initialises the sub-scorers instead of advancing.
    def next?
      if @first_time
        init(true)
      elsif @more
        @more = last.next? # trigger further scanning
      end
      do_next
    end

    # Keeps skipping the front scorer up to the back scorer's document and
    # rotating it to the back, until both ends sit on the same document or a
    # sub-scorer is exhausted.
    def do_next
      while @more && first.doc < last.doc # find doc w/ all clauses
        @more = first.skip_to(last.doc)   # skip first upto last
        @scorers.push(@scorers.shift)     # move first to last
      end
      @more # found a doc with all clauses
    end

    # Skips every sub-scorer to +target+, re-sorts them and then looks for
    # the next document they all share.
    def skip_to(target)
      init(false) if @first_time

      advance_scorers { |scorer| scorer.skip_to(target) }
      sort_scorers if @more # resort the scorers

      do_next
    end

    # Sums the scores of all of the scorers for the current document and
    # scales the sum by the coord factor computed in #init.
    def score
      sum = @scorers.inject(0.0) { |acc, scorer| acc + scorer.score }
      sum * @coord
    end

    # One-time setup performed by the first #next? or #skip_to.
    #
    # init_scorers:: when true, each sub-scorer is moved to its first entry
    #                and the list is sorted.
    def init(init_scorers)
      # compute coord factor
      nr_scorers = @scorers.size
      @coord = similarity.coord(nr_scorers, nr_scorers)

      @more = nr_scorers > 0

      if init_scorers
        # move each scorer to its first entry
        advance_scorers { |scorer| scorer.next? }
        sort_scorers if @more
      end

      @first_time = false
    end

    # Orders the sub-scorers in place by their current document number.
    def sort_scorers
      @scorers.sort! { |left, right| left.doc <=> right.doc }
    end

    def explain(doc)
      raise NotImplementedError
    end

    private

    # Yields each sub-scorer in turn, storing the block's result in @more
    # and stopping as soon as @more is false.
    def advance_scorers
      @scorers.each do |scorer|
        break unless @more
        @more = yield(scorer)
      end
    end
  end
end
@@ -0,0 +1,203 @@
module Ferret::Search
  # A Scorer for OR like queries, counterpart of Lucene's +ConjunctionScorer+.
  # This Scorer implements Scorer#skip_to(int) and uses skip_to() on the
  # given Scorers.
  class DisjunctionSumScorer < Scorer
    # the sub-scorers
    attr_accessor :sub_scorers

    # Construct a +DisjunctionSumScorer+.
    #
    # sub_scorers:: A collection of at least two subscorers.
    # minimum_nr_matchers:: The positive minimum number of subscorers that
    #                       should match to match this query.
    #                       When +minimum_nr_matchers+ is bigger than the
    #                       number of +sub_scorers+, no matches will be
    #                       produced. When it equals the number of
    #                       sub_scorers, it is more efficient to use
    #                       +ConjunctionScorer+.
    # raises:: ArgumentError if +minimum_nr_matchers+ is not positive or
    #          fewer than two sub_scorers are given.
    def initialize(sub_scorers, minimum_nr_matchers = 1)
      super(nil)

      # The number of subscorers.
      @nr_scorers = sub_scorers.size

      # The document number of the current match.
      @current_doc = -1
      # The summed score of the current match.
      @current_score = nil
      # The number of subscorers that provide the current match.
      @nr_matchers = -1

      if minimum_nr_matchers <= 0
        raise ArgumentError, "Minimum nr of matchers must be positive"
      end
      if @nr_scorers <= 1
        raise ArgumentError, "There must be at least 2 sub_scorers"
      end

      @minimum_nr_matchers = minimum_nr_matchers
      @sub_scorers = sub_scorers

      # The @scorer_queue contains all subscorers ordered by their current
      # doc, with the minimum at the top.
      #
      # The @scorer_queue is initialized the first time next? or skip_to() is
      # called.
      #
      # An exhausted scorer is immediately removed from the @scorer_queue.
      #
      # If less than the @minimum_nr_matchers scorers remain in the
      # @scorer_queue next? and skip_to() return false.
      #
      # After each call to next? or skip_to()
      # +@current_score+ is the total score of the current matching doc,
      # +@nr_matchers+ is the number of matching scorers,
      # and all scorers are after the matching doc, or are exhausted.
      @scorer_queue = nil
    end

    # Called the first time next? or skip_to() is called to
    # initialize +@scorer_queue+. Only subscorers that have a first
    # document are inserted.
    def init_scorer_queue
      @scorer_queue = ScorerQueue.new(@nr_scorers)
      @sub_scorers.each do |sub_scorer|
        # doc() method will be used in @scorer_queue.
        @scorer_queue.insert(sub_scorer) if sub_scorer.next?
      end
    end

    # A +PriorityQueue+ that orders by Scorer#doc().
    class ScorerQueue < Ferret::Utils::PriorityQueue
      def less_than(scorer1, scorer2)
        return scorer1.doc < scorer2.doc
      end
    end

    # Advances to the next matching document.
    #
    # returns:: true iff there is a match.
    def next?
      init_scorer_queue if @scorer_queue.nil?

      return false if @scorer_queue.size < @minimum_nr_matchers
      return advance_after_current
    end

    # Advance all subscorers after the current document determined by the
    # top of the +@scorer_queue+.
    # Repeat until at least the minimum number of subscorers match on the same
    # document and all subscorers are after that document or are exhausted.
    #
    # On entry the +@scorer_queue+ has at least +@minimum_nr_matchers+
    # available. At least the scorer with the minimum document number will be
    # advanced.
    #
    # returns:: true iff there is a match.
    #
    # In case there is a match, +@current_doc+, +@current_score+,
    # and +@nr_matchers+ describe the match.
    #
    # TODO Investigate whether it is possible to use skip_to() when
    # the minimum number of matchers is bigger than one, i.e. try to use the
    # character of ConjunctionScorer for the minimum number of matchers.
    def advance_after_current
      while true # repeat until minimum nr of matchers
        top = @scorer_queue.top
        @current_doc = top.doc
        @current_score = top.score
        @nr_matchers = 1

        while true # until all subscorers are after @current_doc
          if top.next?
            @scorer_queue.adjust_top
          else
            @scorer_queue.pop
            if @scorer_queue.size < (@minimum_nr_matchers - @nr_matchers)
              # Not enough subscorers left for a match on this document,
              # and also no more chance of any further match.
              return false
            end
            # nothing more to advance, check for last match.
            break if @scorer_queue.size == 0
          end

          top = @scorer_queue.top
          # All remaining subscorers are after @current_doc.
          break if top.doc != @current_doc

          @current_score += top.score
          @nr_matchers += 1
        end

        return true if @nr_matchers >= @minimum_nr_matchers
        return false if @scorer_queue.size < @minimum_nr_matchers
      end
    end

    # Returns the score of the current document matching the query.
    # Initially invalid, until #next? is called the first time.
    def score
      return @current_score
    end

    # Returns the document number of the current document matching the query.
    # Initially invalid, until #next? is called the first time.
    def doc
      return @current_doc
    end

    # Returns the number of subscorers matching the current document.
    # Initially invalid, until #next? is called the first time.
    def number_of_matchers
      return @nr_matchers
    end

    # Skips to the first match beyond the current whose document number is
    # greater than or equal to a given target.
    #
    # When this method is used the #explain(int) method should not be used.
    #
    # The implementation uses the skip_to() method on the subscorers.
    #
    # target:: The target document number.
    # returns:: true iff there is such a match.
    def skip_to(target)
      init_scorer_queue if @scorer_queue.nil?
      return false if @scorer_queue.size < @minimum_nr_matchers

      # Never move backwards: the match must be beyond the current one.
      target = @current_doc + 1 if target <= @current_doc

      while true
        top = @scorer_queue.top
        if top.doc >= target
          return advance_after_current
        elsif top.skip_to(target)
          @scorer_queue.adjust_top
        else
          @scorer_queue.pop
          return false if @scorer_queue.size < @minimum_nr_matchers
        end
      end
    end

    # Gives an explanation for the score of a given document: a description
    # naming the minimum number of matchers, with the explanation of every
    # subscorer appended as a detail.
    #
    # TODO Show the resulting score. See BooleanScorer.explain() on how to do
    # this.
    def explain(doc)
      e = Explanation.new
      # Interpolation, because String#+ raises TypeError for an Integer.
      e.description = "At least #{@minimum_nr_matchers} of"
      @sub_scorers.each do |sub_scorer|
        e.details << sub_scorer.explain(doc)
      end
      return e
    end
  end
end