ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,363 @@
1
+ module Ferret
2
+ module Index
3
+ # An IndexReader which reads multiple indexes, appending their content.
4
+ class MultiReader < IndexReader
5
+ attr_reader :max_doc
6
+
7
+ # Construct a MultiReader aggregating the named set of (sub)readers.
8
+ # Directory locking for delete, undeleteAll, and set_norm operations is
9
+ # left to the subreaders.
10
+ #
11
+ # Note that all subreaders are closed if this Multireader is closed.
12
+ # sub_readers:: set of (sub)readers
13
+ # raises:: IOException
14
+ def initialize(sub_readers, directory = nil, sis = nil, close_dir = false)
15
+ if (directory)
16
+ super(directory, sis, close_dir)
17
+ else
18
+ super(sub_readers.length == 0 ? nil : sub_readers[0].directory())
19
+ end
20
+
21
+ @max_doc = 0
22
+ @num_docs = -1
23
+ @has_deletions = false
24
+
25
+ @sub_readers = sub_readers
26
+ @starts = Array.new(@sub_readers.length + 1) # build starts array
27
+ @sub_readers.each_with_index do |sub_reader, i|
28
+ @starts[i] = @max_doc
29
+ @max_doc += sub_reader.max_doc # compute maxDocs
30
+
31
+ if @sub_readers[i].has_deletions?
32
+ @has_deletions = true
33
+ end
34
+ end
35
+ @starts[@sub_readers.length] = @max_doc
36
+ @norms_cache = {}
37
+ end
38
+
39
+
40
+ # Return an array of term frequency vectors for the specified document. The
41
+ # array contains a vector for each vectorized field in the document. Each
42
+ # vector vector contains term numbers and frequencies for all terms in a
43
+ # given vectorized field. If no such fields existed, the method returns
44
+ # nil.
45
+ def get_term_vectors(n)
46
+ i = reader_index(n) # find segment num
47
+ return @sub_readers[i].get_term_vectors(n - @starts[i]); # dispatch to segment
48
+ end
49
+
50
+ def get_term_vector(n, field)
51
+ i = reader_index(n) # find segment num
52
+ return @sub_readers[i].get_term_vector(n - @starts[i], field)
53
+ end
54
+
55
+ def num_docs()
56
+ synchronize do
57
+ if (@num_docs == -1) # check cache
58
+ n = 0 # cache miss -= 1recompute
59
+ @sub_readers.each {|reader| n += reader.num_docs()}
60
+ @num_docs = n
61
+ end
62
+ return @num_docs
63
+ end
64
+ end
65
+
66
+ def get_document(n)
67
+ i = reader_index(n) # find segment num
68
+ return @sub_readers[i].get_document(n - @starts[i]) # dispatch to segment reader
69
+ end
70
+
71
+ def deleted?(n)
72
+ i = reader_index(n) # find segment num
73
+ return @sub_readers[i].deleted?(n - @starts[i]) # dispatch to segment reader
74
+ end
75
+
76
+ def has_deletions?()
77
+ return @has_deletions
78
+ end
79
+
80
+ def do_delete(n)
81
+ @num_docs = -1 # invalidate cache
82
+ i = reader_index(n) # find segment num
83
+ @sub_readers[i].delete(n - @starts[i]) # dispatch to segment reader
84
+ @has_deletions = true
85
+ end
86
+
87
+ def do_undelete_all()
88
+ @num_docs = -1 # invalidate cache
89
+ @sub_readers.each {|reader| reader.undelete_all() }
90
+ @has_deletions = false
91
+ end
92
+
93
+ def reader_index(n) # find reader for doc n:
94
+ lo = 0 # search @starts array
95
+ hi = @sub_readers.length - 1 # for first element less
96
+
97
+ while (hi >= lo)
98
+ mid = (lo + hi) >> 1
99
+ mid_value = @starts[mid]
100
+ if (n < mid_value)
101
+ hi = mid - 1
102
+ elsif (n > mid_value)
103
+ lo = mid + 1
104
+ else # found a match
105
+ while (mid+1 < @sub_readers.length and @starts[mid+1] == mid_value)
106
+ mid += 1 # scan to last match
107
+ end
108
+ return mid
109
+ end
110
+ end
111
+ return hi
112
+ end
113
+
114
+ def get_norms(field)
115
+ synchronize do
116
+ bytes = @norms_cache[field]
117
+ if (bytes != nil)
118
+ return bytes # cache hit
119
+ end
120
+
121
+ bytes = " " * @max_doc
122
+ @sub_readers.length.times do |i|
123
+ @sub_readers[i].get_norms_into(field, bytes, @starts[i])
124
+ end
125
+ @norms_cache[field] = bytes # update cache
126
+ return bytes
127
+ end
128
+ end
129
+
130
+ def get_norms_into(field, buf, offset)
131
+ bytes = @norms_cache[field]
132
+ if (bytes != nil) # cache hit
133
+ buf[offset ,@max_doc] = bytes[0, @max_doc]
134
+ return
135
+ end
136
+
137
+ @sub_readers.length.times do |i|
138
+ @sub_readers[i].get_norms_into(field, buf, offset + @starts[i])
139
+ end
140
+ end
141
+
142
+ def do_set_norm(n, field, value)
143
+ @norms_cache.delete(field) # clear cache
144
+ i = reader_index(n) # find segment num
145
+ @sub_readers[i].set_norm(n-@starts[i], field, value); # dispatch
146
+ end
147
+
148
+ def terms()
149
+ return MultiTermEnum.new(@sub_readers, @starts, nil)
150
+ end
151
+
152
+ def terms_from(term)
153
+ return MultiTermEnum.new(@sub_readers, @starts, term)
154
+ end
155
+
156
+ def doc_freq(t)
157
+ total = 0 # sum freqs in segments
158
+ @sub_readers.each {|reader| total += reader.doc_freq(t)}
159
+ return total
160
+ end
161
+
162
+ def term_docs()
163
+ return MultiTermDocEnum.new(@sub_readers, @starts)
164
+ end
165
+
166
+ def term_positions()
167
+ return MultiTermDocPosEnum.new(@sub_readers, @starts)
168
+ end
169
+
170
+ def do_commit()
171
+ @sub_readers.each {|reader| reader.commit() }
172
+ end
173
+
174
+ def do_close()
175
+ synchronize do
176
+ @sub_readers.each {|reader| reader.close() }
177
+ end
178
+ end
179
+
180
+ # See IndexReader#get_field_names
181
+ def get_field_names(field_option)
182
+ # maintain a unique set of field names
183
+ field_set = Set.new
184
+ @sub_readers.each do |reader|
185
+ field_set |= reader.get_field_names(field_option)
186
+ end
187
+ return field_set
188
+ end
189
+ end
190
+
191
+ class MultiTermEnum < TermEnum
192
+
193
+ attr_reader :doc_freq, :term
194
+
195
+ def initialize(readers, starts, t)
196
+ @queue = SegmentMergeQueue.new(readers.length)
197
+ readers.each_index do |i|
198
+ reader = readers[i]
199
+ term_enum = nil
200
+ if (t != nil)
201
+ term_enum = reader.terms_from(t)
202
+ else
203
+ term_enum = reader.terms()
204
+ end
205
+ smi = SegmentMergeInfo.new(starts[i], term_enum, reader)
206
+
207
+ if (t == nil and smi.next?) or term_enum.term
208
+ @queue.push(smi); # initialize queue
209
+ else
210
+ smi.close()
211
+ end
212
+ end
213
+
214
+ if (t != nil and @queue.size() > 0)
215
+ next?()
216
+ end
217
+ end
218
+
219
+ def next?()
220
+ top = @queue.top()
221
+ if (top == nil)
222
+ @term = nil
223
+ return false
224
+ end
225
+
226
+ @term = top.term
227
+ @doc_freq = 0
228
+
229
+ while top and @term == top.term
230
+ @queue.pop()
231
+ @doc_freq += top.term_enum.doc_freq() # increment freq
232
+ if (top.next?)
233
+ @queue.push(top) # restore queue
234
+ else
235
+ top.close() # done with a segment
236
+ end
237
+ top = @queue.top()
238
+ end
239
+ return true
240
+ end
241
+
242
+ def close()
243
+ @queue.close()
244
+ end
245
+ end
246
+
247
+ class MultiTermDocEnum < TermDocEnum
248
+ attr_accessor :readers, :starts, :term, :base, :pointer, :current
249
+
250
+ def initialize(readers, starts)
251
+ @readers = readers
252
+ @starts = starts
253
+ @base = 0
254
+ @pointer = 0
255
+
256
+ @reader_term_docs = Array.new(readers.length)
257
+ end
258
+
259
+ def doc
260
+ return @base + @current.doc()
261
+ end
262
+
263
+ def freq
264
+ return @current.freq()
265
+ end
266
+
267
+ def seek(term)
268
+ @term = term
269
+ @base = 0
270
+ @pointer = 0
271
+ @current = nil
272
+ end
273
+
274
+ def next?
275
+ if @current and @current.next?
276
+ return true
277
+ elsif @pointer < @readers.length
278
+ @base = @starts[@pointer]
279
+ @current = term_docs(@pointer)
280
+ @pointer += 1
281
+ return next?()
282
+ else
283
+ return false
284
+ end
285
+ end
286
+
287
+ # Optimized implementation. Unlike the Java version, this method
288
+ # always returns as many results as it can read.
289
+ def read(docs, freqs)
290
+ got = 0
291
+ last_got = 0
292
+ needed = docs.length
293
+
294
+ while (true)
295
+ while @current.nil?
296
+ if @pointer < @readers.length # begin next segment
297
+ @base = @starts[@pointer]
298
+ @current = term_docs(@pointer)
299
+ @pointer += 1
300
+ else
301
+ return got
302
+ end
303
+ end
304
+ got = @current.read(docs, freqs, got)
305
+ if (got == last_got) # none left in segment
306
+ @current = nil
307
+ else # got some
308
+ b = @base # adjust doc numbers
309
+ (last_got...got).each {|i| docs[i] += b}
310
+ if got == needed
311
+ return got
312
+ else
313
+ last_got = got
314
+ end
315
+ end
316
+ end
317
+ end
318
+
319
+ # As yet unoptimized implementation.
320
+ def skip_to(target)
321
+ begin
322
+ return false if not next?
323
+ end while target > doc()
324
+ return true
325
+ end
326
+
327
+ def term_docs(i)
328
+ return nil if (@term == nil)
329
+ result = @reader_term_docs[i]
330
+ if (result == nil)
331
+ result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
332
+ end
333
+ result.seek(@term)
334
+ return result
335
+ end
336
+
337
+ def term_docs_from_reader(reader)
338
+ return reader.term_docs()
339
+ end
340
+
341
+ def close()
342
+ @reader_term_docs.compact.each do |rtd|
343
+ rtd.close()
344
+ end
345
+ end
346
+ end
347
+
348
+ class MultiTermDocPosEnum < MultiTermDocEnum
349
+ def initialize(r, s)
350
+ super(r,s)
351
+ end
352
+
353
+ def term_docs_from_reader(reader)
354
+ return reader.term_positions()
355
+ end
356
+
357
+ def next_position()
358
+ return @current.next_position()
359
+ end
360
+
361
+ end
362
+ end
363
+ end
@@ -0,0 +1,105 @@
1
+ module Ferret::Index
2
+ # Describe class +MultipleTermPositions+ here.
3
+ #
4
+ # @author Anders Nielsen
5
+ class MultipleTermDocPosEnum < TermDocEnum
6
+
7
+ class TermPositionsQueue < Ferret::Utils::PriorityQueue
8
+ def initialize(term_positions)
9
+ super(term_positions.size)
10
+
11
+ term_positions.each do |tp|
12
+ push(tp) if tp.next?
13
+ end
14
+ end
15
+
16
+ def less_than(tp1, tp2)
17
+ return tp1.doc < tp2.doc
18
+ end
19
+ end
20
+
21
+ # Creates a new +MultipleTermPositions+ instance.
22
+ #
23
+ # @exception IOException
24
+ def initialize(reader, terms)
25
+ term_positions = []
26
+
27
+ terms.each do |term|
28
+ term_positions << reader.term_positions_for(term)
29
+ end
30
+
31
+ @tps_queue = TermPositionsQueue.new(term_positions)
32
+ @pos_list = []
33
+ end
34
+
35
+ def next?
36
+ return false if (@tps_queue.size == 0)
37
+
38
+ @pos_list.clear()
39
+ @doc = @tps_queue.top.doc
40
+
41
+ tps = nil
42
+ begin
43
+ tps = @tps_queue.top()
44
+
45
+ tps.freq.times do |i|
46
+ @pos_list << tps.next_position()
47
+ end
48
+
49
+ if tps.next?
50
+ @tps_queue.adjust_top()
51
+ else
52
+ @tps_queue.pop()
53
+ tps.close()
54
+ end
55
+ end while (@tps_queue.size > 0 and @tps_queue.top.doc == @doc)
56
+
57
+ @pos_list.sort!()
58
+ @freq = @pos_list.size
59
+
60
+ return true
61
+ end
62
+
63
+ def next_position()
64
+ return @pos_list.shift()
65
+ end
66
+
67
+ def skip_to(target)
68
+ while (@tps_queue.top != nil and target > @tps_queue.top.doc)
69
+ tps = @tps_queue.pop()
70
+ if (tps.skip_to(target))
71
+ @tps_queue.push(tps)
72
+ else
73
+ tps.close()
74
+ end
75
+ end
76
+ return next?
77
+ end
78
+
79
+ def doc()
80
+ return @doc
81
+ end
82
+
83
+ def freq()
84
+ return @freq
85
+ end
86
+
87
+ def close()
88
+ while (tps = @tps_queue.pop())
89
+ tps.close()
90
+ end
91
+ end
92
+
93
+ # Not implemented.
94
+ # raises:: NotImplementedError
95
+ def seek(term)
96
+ raise NotImplementedError
97
+ end
98
+
99
+ # Not implemented.
100
+ # raises:: NotImplementedError
101
+ def read(docs, freqs)
102
+ raise NotImplementedError
103
+ end
104
+ end
105
+ end