ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,377 +0,0 @@
1
- module Ferret::Index
2
- # An IndexReader which reads multiple indexes, appending their content.
3
- class MultiReader < IndexReader
4
- attr_reader :max_doc
5
-
6
- # Construct a MultiReader aggregating the named set of (sub)readers.
7
- # Directory locking for delete, undeleteAll, and set_norm operations is
8
- # left to the subreaders.
9
- #
10
- # Note that all subreaders are closed if this Multireader is closed.
11
- # sub_readers:: set of (sub)readers
12
- # raises:: IOException
13
- def initialize(sub_readers, directory = nil, sis = nil, close_dir = false)
14
- if (directory)
15
- super(directory, sis, close_dir)
16
- else
17
- super(sub_readers.length == 0 ? nil : sub_readers[0].directory())
18
- end
19
-
20
- @max_doc = 0
21
- @num_docs = -1
22
- @has_deletions = false
23
-
24
- @sub_readers = sub_readers
25
- @starts = Array.new(@sub_readers.length + 1) # build starts array
26
- @sub_readers.each_with_index do |sub_reader, i|
27
- @starts[i] = @max_doc
28
- @max_doc += sub_reader.max_doc # compute max_docs
29
-
30
- if @sub_readers[i].has_deletions?
31
- @has_deletions = true
32
- end
33
- end
34
- @starts[@sub_readers.length] = @max_doc
35
- @norms_cache = {}
36
- end
37
-
38
-
39
- # Return an array of term frequency vectors for the specified document. The
40
- # array contains a vector for each vectorized field in the document. Each
41
- # vector vector contains term numbers and frequencies for all terms in a
42
- # given vectorized field. If no such fields existed, the method returns
43
- # nil.
44
- def get_term_vectors(n)
45
- i = reader_index(n) # find segment num
46
- return @sub_readers[i].get_term_vectors(n - @starts[i]); # dispatch to segment
47
- end
48
-
49
- def get_term_vector(n, field)
50
- i = reader_index(n) # find segment num
51
- return @sub_readers[i].get_term_vector(n - @starts[i], field)
52
- end
53
-
54
- def num_docs()
55
- synchronize do
56
- if (@num_docs == -1) # check cache
57
- n = 0 # cache miss -= 1recompute
58
- @sub_readers.each {|reader| n += reader.num_docs()}
59
- @num_docs = n
60
- end
61
- return @num_docs
62
- end
63
- end
64
-
65
- def get_document(n)
66
- i = reader_index(n) # find segment num
67
- return @sub_readers[i].get_document(n - @starts[i]) # dispatch to segment reader
68
- end
69
-
70
- def deleted?(n)
71
- i = reader_index(n) # find segment num
72
- return @sub_readers[i].deleted?(n - @starts[i]) # dispatch to segment reader
73
- end
74
-
75
- def has_deletions?()
76
- return @has_deletions
77
- end
78
-
79
- def do_delete(n)
80
- @num_docs = -1 # invalidate cache
81
- i = reader_index(n) # find segment num
82
- @sub_readers[i].delete(n - @starts[i]) # dispatch to segment reader
83
- @has_deletions = true
84
- end
85
-
86
- def do_undelete_all()
87
- @num_docs = -1 # invalidate cache
88
- @sub_readers.each {|reader| reader.undelete_all() }
89
- @has_deletions = false
90
- end
91
-
92
- def reader_index(n) # find reader for doc n:
93
- lo = 0 # search @starts array
94
- hi = @sub_readers.length - 1 # for first element less
95
-
96
- while (hi >= lo)
97
- mid = (lo + hi) >> 1
98
- mid_value = @starts[mid]
99
- if (n < mid_value)
100
- hi = mid - 1
101
- elsif (n > mid_value)
102
- lo = mid + 1
103
- else # found a match
104
- while (mid+1 < @sub_readers.length and @starts[mid+1] == mid_value)
105
- mid += 1 # scan to last match
106
- end
107
- return mid
108
- end
109
- end
110
- return hi
111
- end
112
-
113
- def has_norms?(field)
114
- @sub_readers.each {|reader| return true if reader.has_norms?(field)}
115
- return false
116
- end
117
-
118
- def fake_norms()
119
- return @ones ||= SegmentReader.create_fake_norms(max_doc())
120
- end
121
-
122
- def get_norms(field)
123
- synchronize do
124
- bytes = @norms_cache[field]
125
- return bytes if bytes
126
- return fake_norms if not has_norms?(field)
127
-
128
- bytes = " " * @max_doc
129
- @sub_readers.length.times do |i|
130
- @sub_readers[i].get_norms_into(field, bytes, @starts[i])
131
- end
132
- @norms_cache[field] = bytes # update cache
133
- return bytes
134
- end
135
- end
136
-
137
- def get_norms_into(field, buf, offset)
138
- synchronize do
139
- bytes = @norms_cache[field]
140
- bytes = fake_norms() if (bytes.nil? and not has_norms?(field))
141
-
142
- if (bytes) # cache hit
143
- buf[offset ,@max_doc] = bytes[0, @max_doc]
144
- return
145
- end
146
-
147
- @sub_readers.length.times do |i|
148
- @sub_readers[i].get_norms_into(field, buf, offset + @starts[i])
149
- end
150
- end
151
- end
152
-
153
- def do_set_norm(n, field, value)
154
- @norms_cache.delete(field) # clear cache
155
- i = reader_index(n) # find segment num
156
- @sub_readers[i].set_norm(n-@starts[i], field, value); # dispatch
157
- end
158
-
159
- def terms()
160
- return MultiTermEnum.new(@sub_readers, @starts, nil)
161
- end
162
-
163
- def terms_from(term)
164
- return MultiTermEnum.new(@sub_readers, @starts, term)
165
- end
166
-
167
- def doc_freq(t)
168
- total = 0 # sum freqs in segments
169
- @sub_readers.each {|reader| total += reader.doc_freq(t)}
170
- return total
171
- end
172
-
173
- def term_docs()
174
- return MultiTermDocEnum.new(@sub_readers, @starts)
175
- end
176
-
177
- def term_positions()
178
- return MultiTermDocPosEnum.new(@sub_readers, @starts)
179
- end
180
-
181
- def do_commit()
182
- @sub_readers.each {|reader| reader.commit() }
183
- end
184
-
185
- def do_close()
186
- synchronize do
187
- @sub_readers.each {|reader| reader.close() }
188
- end
189
- end
190
-
191
- # See IndexReader#get_field_names
192
- def get_field_names(field_option = IndexReader::FieldOption::ALL)
193
- # maintain a unique set of field names
194
- field_set = Set.new
195
- @sub_readers.each do |reader|
196
- field_set |= reader.get_field_names(field_option)
197
- end
198
- return field_set
199
- end
200
- end
201
-
202
- class MultiTermEnum < TermEnum
203
-
204
- attr_reader :doc_freq, :term
205
-
206
- def initialize(readers, starts, t)
207
- @queue = SegmentMergeQueue.new(readers.length)
208
- readers.each_index do |i|
209
- reader = readers[i]
210
- term_enum = nil
211
- if (t != nil)
212
- term_enum = reader.terms_from(t)
213
- else
214
- term_enum = reader.terms()
215
- end
216
- smi = SegmentMergeInfo.new(starts[i], term_enum, reader)
217
-
218
- if (t == nil and smi.next?) or term_enum.term
219
- @queue.push(smi); # initialize queue
220
- else
221
- smi.close()
222
- end
223
- end
224
-
225
- if (t != nil and @queue.size() > 0)
226
- next?()
227
- end
228
- end
229
-
230
- def next?()
231
- top = @queue.top()
232
- if (top == nil)
233
- @term_buffer = nil
234
- return false
235
- end
236
-
237
- @term = top.term_buffer.term
238
- @doc_freq = 0
239
-
240
- while top and @term == top.term_buffer
241
- @queue.pop()
242
- @doc_freq += top.term_enum.doc_freq() # increment freq
243
- if (top.next?)
244
- @queue.push(top) # restore queue
245
- else
246
- top.close() # done with a segment
247
- end
248
- top = @queue.top()
249
- end
250
- return true
251
- end
252
-
253
- #def term()
254
- # @term_buffer.term if @term_buffer
255
- #end
256
-
257
- def close()
258
- @queue.close()
259
- end
260
- end
261
-
262
- class MultiTermDocEnum < TermDocEnum
263
- attr_accessor :readers, :starts, :term, :base, :pointer, :current
264
-
265
- def initialize(readers, starts)
266
- @readers = readers
267
- @starts = starts
268
- @base = 0
269
- @pointer = 0
270
-
271
- @reader_term_docs = Array.new(readers.length)
272
- end
273
-
274
- def doc
275
- return @base + @current.doc()
276
- end
277
-
278
- def freq
279
- return @current.freq()
280
- end
281
-
282
- def seek(term)
283
- @term = term
284
- @base = 0
285
- @pointer = 0
286
- @current = nil
287
- end
288
-
289
- def next?
290
- if @current and @current.next?
291
- return true
292
- elsif @pointer < @readers.length
293
- @base = @starts[@pointer]
294
- @current = term_docs(@pointer)
295
- @pointer += 1
296
- return next?()
297
- else
298
- return false
299
- end
300
- end
301
-
302
- # Optimized implementation. Unlike the Java version, this method
303
- # always returns as many results as it can read.
304
- def read(docs, freqs)
305
- got = 0
306
- last_got = 0
307
- needed = docs.length
308
-
309
- while (true)
310
- while @current.nil?
311
- if @pointer < @readers.length # try next segment
312
- @base = @starts[@pointer]
313
- @current = term_docs(@pointer)
314
- @pointer += 1
315
- else
316
- return got
317
- end
318
- end
319
- got = @current.read(docs, freqs, got)
320
- if (got == last_got) # none left in segment
321
- @current = nil
322
- else # got some
323
- b = @base # adjust doc numbers
324
- (last_got...got).each {|i| docs[i] += b}
325
- if got == needed
326
- return got
327
- else
328
- last_got = got
329
- end
330
- end
331
- end
332
- end
333
-
334
- # As yet unoptimized implementation.
335
- def skip_to(target)
336
- begin
337
- return false if not next?
338
- end while target > doc()
339
- return true
340
- end
341
-
342
- def term_docs(i)
343
- return nil if (@term == nil)
344
- result = @reader_term_docs[i]
345
- if (result == nil)
346
- result = @reader_term_docs[i] = term_docs_from_reader(@readers[i])
347
- end
348
- result.seek(@term)
349
- return result
350
- end
351
-
352
- def term_docs_from_reader(reader)
353
- return reader.term_docs()
354
- end
355
-
356
- def close()
357
- @reader_term_docs.compact.each do |rtd|
358
- rtd.close()
359
- end
360
- end
361
- end
362
-
363
- class MultiTermDocPosEnum < MultiTermDocEnum
364
- def initialize(r, s)
365
- super(r,s)
366
- end
367
-
368
- def term_docs_from_reader(reader)
369
- return reader.term_positions()
370
- end
371
-
372
- def next_position()
373
- return @current.next_position()
374
- end
375
-
376
- end
377
- end
@@ -1,98 +0,0 @@
1
- module Ferret::Index
2
- # Describe class +MultipleTermPositions+ here.
3
- #
4
- # @author Anders Nielsen
5
- class MultipleTermDocPosEnum < TermDocEnum
6
-
7
- attr_accessor :doc, :freq
8
- class TermPositionsQueue < Ferret::Utils::PriorityQueue
9
- def initialize(term_positions)
10
- super(term_positions.size)
11
-
12
- term_positions.each do |tp|
13
- push(tp) if tp.next?
14
- end
15
- end
16
-
17
- def less_than(tp1, tp2)
18
- return tp1.doc < tp2.doc
19
- end
20
- end
21
-
22
- # Creates a new +MultipleTermPositions+ instance.
23
- #
24
- # @exception IOException
25
- def initialize(reader, terms)
26
- term_positions = []
27
-
28
- terms.each do |term|
29
- term_positions << reader.term_positions_for(term)
30
- end
31
-
32
- @tps_queue = TermPositionsQueue.new(term_positions)
33
- @pos_list = []
34
- end
35
-
36
- def next?
37
- return false if (@tps_queue.size == 0)
38
-
39
- @pos_list.clear()
40
- @doc = @tps_queue.top.doc
41
-
42
- tps = nil
43
- begin
44
- tps = @tps_queue.top()
45
-
46
- tps.freq.times do |i|
47
- @pos_list << tps.next_position()
48
- end
49
-
50
- if tps.next?
51
- @tps_queue.adjust_top()
52
- else
53
- @tps_queue.pop()
54
- tps.close()
55
- end
56
- end while (@tps_queue.size > 0 and @tps_queue.top.doc == @doc)
57
-
58
- @pos_list.sort!()
59
- @freq = @pos_list.size
60
-
61
- return true
62
- end
63
-
64
- def next_position()
65
- return @pos_list.shift()
66
- end
67
-
68
- def skip_to(target)
69
- while (@tps_queue.top != nil and target > @tps_queue.top.doc)
70
- tps = @tps_queue.pop()
71
- if (tps.skip_to(target))
72
- @tps_queue.push(tps)
73
- else
74
- tps.close()
75
- end
76
- end
77
- return next?
78
- end
79
-
80
- def close()
81
- while (tps = @tps_queue.pop())
82
- tps.close()
83
- end
84
- end
85
-
86
- # Not implemented.
87
- # raises:: NotImplementedError
88
- def seek(term)
89
- raise NotImplementedError
90
- end
91
-
92
- # Not implemented.
93
- # raises:: NotImplementedError
94
- def read(docs, freqs)
95
- raise NotImplementedError
96
- end
97
- end
98
- end