ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
data/lib/ferret/index/segment_reader.rb
@@ -0,0 +1,380 @@
+ module Ferret::Index
+
+   # A SegmentReader provides read access to a single segment of an index:
+   # its stored fields, term dictionary, postings, norms and term vectors.
+   #
+   class SegmentReader < IndexReader
+
+     attr_reader :freq_stream, :prox_stream, :deleted_docs,
+                 :term_infos, :field_infos, :segment
+
+     def SegmentReader.get(info, infos = nil, close = false)
+       return SegmentReader.new(info.directory, info, infos, close, infos != nil)
+     end
+
+     def initialize(dir, info, seg_infos, close, owner)
+       super(dir, seg_infos, close, owner)
+       @segment = info.name
+
+       @cfs_reader = nil
+       cfs = directory
+       if directory.exists?(@segment + '.cfs') then
+         @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
+         cfs = @cfs_reader
+       end
+
+       @field_infos = FieldInfos.new(cfs, @segment + '.fnm')
+       @fields_reader = FieldsReader.new(cfs, @segment, @field_infos)
+
+       @term_infos = TermInfosReader.new(cfs, @segment, @field_infos)
+       @deleted_docs = nil
+       @deleted_docs_dirty = false
+       if SegmentReader.has_deletions?(info) then
+         @deleted_docs =
+           Ferret::Utils::BitVector.read(directory, @segment + '.del')
+       end
+
+       @freq_stream = cfs.open_input(@segment + '.frq')
+       @prox_stream = cfs.open_input(@segment + '.prx')
+       @norms = {}
+       @norms.extend(MonitorMixin)
+       open_norms(cfs)
+
+       if @field_infos.has_vectors? then
+         @tv_reader_orig = TermVectorsReader.new(cfs, @segment, @field_infos)
+       end
+     end
+
+     def do_commit()
+       if (@deleted_docs_dirty) # re-write deleted
+         @deleted_docs.write(@directory, @segment + '.tmp')
+         @directory.rename(@segment + '.tmp', @segment + '.del')
+       end
+       if (@undelete_all and @directory.exists?(@segment + '.del'))
+         @directory.delete(@segment + '.del')
+       end
+       if (@norms_dirty) # re-write norms
+         @norms.each_value do |norm|
+           if norm.dirty?
+             norm.re_write(@directory, @segment, max_doc(), @cfs_reader)
+           end
+         end
+       end
+       @deleted_docs_dirty = false
+       @norms_dirty = false
+       @undelete_all = false
+     end
+
+     def do_close()
+       Thread.current["#{self.object_id}-tv_reader"] = nil # clear the cache
+       @fields_reader.close()
+       @term_infos.close()
+
+       @freq_stream.close() if @freq_stream
+       @prox_stream.close() if @prox_stream
+
+       close_norms()
+
+       @tv_reader_orig.close() if @tv_reader_orig
+       @cfs_reader.close() if @cfs_reader
+     end
+
+     def SegmentReader.has_deletions?(si)
+       return si.directory.exists?(si.name + ".del")
+     end
+
+     def has_deletions?()
+       return @deleted_docs != nil
+     end
+
+     def SegmentReader.uses_compound_file?(si)
+       return si.dir.exists?(si.name + ".cfs")
+     end
+
+     def SegmentReader.has_separate_norms?(si)
+       return (si.dir.list.select {|f| f =~ /^#{si.name}\.s/}).size > 0
+     end
+
+     def do_delete(doc_num)
+       if (@deleted_docs == nil)
+         @deleted_docs = Ferret::Utils::BitVector.new
+       end
+       @deleted_docs_dirty = true
+       @undelete_all = false
+       @deleted_docs.set(doc_num)
+     end
+
+     def do_undelete_all()
+       @deleted_docs = nil
+       @deleted_docs_dirty = false
+       @undelete_all = true
+     end
+
+     def file_names()
+       file_names = []
+
+       IndexFileNames::INDEX_EXTENSIONS.each do |ext|
+         name = @segment + "." + ext
+         if (@directory.exists?(name))
+           file_names << name
+         end
+       end
+
+       @field_infos.each_with_index do |fi, i|
+         if (fi.indexed?)
+           if @cfs_reader.nil?
+             name = @segment + ".f" + i.to_s
+           else
+             name = @segment + ".s" + i.to_s
+           end
+           if (@directory.exists?(name))
+             file_names << name
+           end
+         end
+       end
+       return file_names
+     end
+
+     def terms()
+       return @term_infos.terms()
+     end
+
+     def terms_from(t)
+       return @term_infos.terms_from(t)
+     end
+
+     def get_document(n)
+       synchronize do
+         if deleted?(n)
+           raise ArgumentError, "attempt to access a deleted document"
+         end
+         return @fields_reader.doc(n)
+       end
+     end
+
+     def deleted?(n)
+       synchronize do
+         return (@deleted_docs != nil and @deleted_docs.get(n))
+       end
+     end
+
+     def term_docs()
+       return SegmentTermDocEnum.new(self)
+     end
+
+     def term_positions()
+       return SegmentTermDocPosEnum.new(self)
+     end
+
+     def doc_freq(t)
+       ti = @term_infos.get_term_info(t)
+       if (ti != nil)
+         return ti.doc_freq
+       else
+         return 0
+       end
+     end
+
+     def num_docs()
+       n = max_doc()
+       if (@deleted_docs != nil)
+         n -= @deleted_docs.count()
+       end
+       return n
+     end
+
+     def max_doc()
+       return @fields_reader.size()
+     end
+
+     # See IndexReader#get_field_names
+     def get_field_names(field_option)
+       field_set = Set.new
+       @field_infos.each do |fi|
+         if (field_option == IndexReader::FieldOption::ALL)
+           field_set.add(fi.name)
+         elsif (!fi.indexed? and field_option == IndexReader::FieldOption::UNINDEXED)
+           field_set.add(fi.name)
+         elsif (fi.indexed? and field_option == IndexReader::FieldOption::INDEXED)
+           field_set.add(fi.name)
+         elsif (fi.indexed? and fi.store_term_vector? == false and
+                field_option == IndexReader::FieldOption::INDEXED_NO_TERM_VECTOR)
+           field_set.add(fi.name)
+         elsif (fi.store_term_vector? == true and
+                fi.store_positions? == false and
+                fi.store_offsets? == false and
+                field_option == IndexReader::FieldOption::TERM_VECTOR)
+           field_set.add(fi.name)
+         elsif (fi.indexed? and fi.store_term_vector? and
+                field_option == IndexReader::FieldOption::INDEXED_WITH_TERM_VECTOR)
+           field_set.add(fi.name)
+         elsif (fi.store_positions? and fi.store_offsets? == false and
+                field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION)
+           field_set.add(fi.name)
+         elsif (fi.store_offsets? and fi.store_positions? == false and
+                field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET)
+           field_set.add(fi.name)
+         elsif (fi.store_offsets? and fi.store_positions? and
+                field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET)
+           field_set.add(fi.name)
+         end
+       end
+       return field_set
+     end
+
+     def get_norms(field)
+       synchronize do
+         norm = @norms[field]
+         if (norm == nil) # not an indexed field
+           return nil
+         end
+         if (norm.bytes == nil) # value not yet read
+           bytes = " " * max_doc()
+           get_norms_into(field, bytes, 0)
+           norm.bytes = bytes # cache it
+         end
+         return norm.bytes
+       end
+     end
+
+     def do_set_norm(doc, field, value)
+       norm = @norms[field]
+       if (norm == nil) # not an indexed field
+         return
+       end
+       norm.dirty = true # mark it dirty
+       @norms_dirty = true
+
+       get_norms(field)[doc] = value # set the value
+     end
+
+     # Read norms into a pre-allocated array.
+     def get_norms_into(field, bytes, offset)
+       synchronize do
+         norm = @norms[field]
+         return if (norm == nil) # use zeros in array
+
+         if (norm.bytes != nil) # can copy from cache
+           bytes[offset, max_doc()] = norm.bytes[0, max_doc()]
+           return
+         end
+
+         norm_stream = norm.is.clone()
+         begin # read from disk
+           norm_stream.seek(0)
+           norm_stream.read_bytes(bytes, offset, max_doc())
+         ensure
+           norm_stream.close()
+         end
+       end
+     end
+
+     def open_norms(cfs_dir)
+       @field_infos.each do |fi|
+         if (fi.indexed?)
+           # look first if there are separate norms in compound format
+           file_name = @segment + ".s" + fi.number.to_s
+           d = @directory
+           if not d.exists?(file_name)
+             file_name = @segment + ".f" + fi.number.to_s
+             d = cfs_dir
+           end
+           @norms[fi.name] = Norm.new(d.open_input(file_name), fi.number)
+         end
+       end
+     end
+
+     def close_norms()
+       @norms.synchronize do
+         @norms.each_value {|norm| norm.is.close()}
+       end
+     end
+
+     # Create a clone of the initial TermVectorsReader and store it in the
+     # current thread's thread-local storage.
+     # returns:: TermVectorsReader
+     def get_term_vectors_reader()
+       #return @xtv_reader ||= @tv_reader_orig.clone()
+       tv_reader = Thread.current["#{self.object_id}-tv_reader"]
+       if (tv_reader == nil)
+         tv_reader = @tv_reader_orig.clone()
+         Thread.current["#{self.object_id}-tv_reader"] = tv_reader
+       end
+       return tv_reader
+     end
+
+     # Return a term frequency vector for the specified document and field.
+     # The vector returned contains term numbers and frequencies for all terms
+     # in the specified field of this document, if the field had its
+     # store_term_vector flag set. If the flag was not set, the method
+     # returns nil.
+     # raises:: IOException
+     def get_term_vector(doc_number, field)
+       # Check if this field is invalid or has no stored term vector
+       fi = @field_infos[field]
+       if fi.nil? or not fi.store_term_vector? or @tv_reader_orig.nil?
+         return nil
+       end
+
+       term_vectors_reader = get_term_vectors_reader()
+       if (term_vectors_reader == nil)
+         return nil
+       end
+       return term_vectors_reader.get_field_tv(doc_number, field)
+     end
+
+     # Return an array of term frequency vectors for the specified document.
+     # The array contains a vector for each vectorized field in the document.
+     # Each vector contains term numbers and frequencies for all terms in a
+     # given vectorized field. If no such fields exist, the method returns nil.
+     # raises:: IOException
+     def get_term_vectors(doc_number)
+       if @tv_reader_orig.nil?
+         return nil
+       end
+       term_vectors_reader = get_term_vectors_reader()
+       if (term_vectors_reader == nil)
+         return nil
+       end
+       return term_vectors_reader.get_tv(doc_number)
+     end
+
+     def dir()
+       return @directory
+     end
+
+     class Norm
+       attr_reader :is
+       attr_writer :dirty
+       attr_accessor :bytes
+
+       def dirty?
+         return @dirty
+       end
+
+       def initialize(is, number)
+         @is = is
+         @number = number
+       end
+
+       def re_write(directory, segment, count, cfs_reader)
+         # NOTE: norms are re-written in the regular directory, not the cfs
+         out = directory.create_output(segment + ".tmp")
+         begin
+           out.write_bytes(@bytes, count)
+         ensure
+           out.close()
+         end
+         if (cfs_reader == nil)
+           file_name = "#{segment}.f#{@number}"
+         else
+           # use a different file name if we have compound format
+           file_name = "#{segment}.s#{@number}"
+         end
+         directory.rename(segment + ".tmp", file_name)
+         @dirty = false
+       end
+     end
+   end
+ end
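
A rough usage sketch (not part of the package) of the reader defined above: SegmentReader.get builds a reader from a SegmentInfo, and the accessors in this file (num_docs, max_doc, deleted?, get_document, get_term_vector) drive it. The segment_info object and the "contents" field name are assumed to be in hand; they would normally come from the index's SegmentInfos table and schema, which are outside this file, and close is assumed to be exposed by the IndexReader base class.

    include Ferret::Index

    # segment_info: a SegmentInfo for one segment, assumed to be available
    # (it would normally come from the index's SegmentInfos table).
    reader = SegmentReader.get(segment_info)

    puts "#{reader.num_docs} live documents out of #{reader.max_doc}"
    reader.max_doc.times do |n|
      next if reader.deleted?(n)       # skip documents marked as deleted
      doc = reader.get_document(n)     # stored fields of document n
    end

    # nil unless the field was indexed with term vectors; the field name
    # "contents" is hypothetical.
    term_vector = reader.get_term_vector(0, "contents")

    reader.close   # assumed to be provided by IndexReader, invoking do_close above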
data/lib/ferret/index/segment_term_enum.rb
@@ -0,0 +1,178 @@
+ module Ferret::Index
+   class SegmentTermEnum < TermEnum
+
+     INT_MAX = (2**31)-1
+
+     attr_reader :field_infos, :size, :position, :index_pointer,
+                 :index_interval, :skip_interval
+
+     def initialize(input, field_infos, is_index)
+       @input = input
+       @field_infos = field_infos
+       @is_index = is_index
+       @position = -1
+
+       @term_buffer = TermBuffer.new()
+       @prev_buffer = TermBuffer.new()
+       @scratch = nil # used for scanning
+       @term_info = TermInfo.new()
+
+       @index_pointer = 0
+       @format_m1skip_interval = nil
+
+       first_int = @input.read_int()
+
+       if (first_int >= 0)
+         # original-format file, without explicit format version number
+         @format = 0
+         @size = first_int
+
+         # back-compatible settings
+         @index_interval = 128
+         @skip_interval = INT_MAX # switch off skip_to optimization
+       else
+         # we have a format version number
+         @format = first_int
+
+         # check that it is a format we can understand
+         if (@format < TermInfosWriter::FORMAT)
+           raise "Unknown format version:#{@format}"
+         end
+
+         @size = @input.read_long() # read the size
+
+         if (@format == -1)
+           if (!@is_index)
+             @index_interval = @input.read_int()
+             @format_m1skip_interval = @input.read_int()
+           end
+           # switch off skip_to optimization for file format prior to
+           # 1.4rc2 in order to avoid a bug in the skip_to implementation
+           # of these versions
+           @skip_interval = INT_MAX
+         else
+           @index_interval = @input.read_int()
+           @skip_interval = @input.read_int()
+         end
+       end
+     end
+
+     # attr_accessors for the clone method
+     attr_accessor :input, :term_buffer, :prev_buffer
+     protected :input, :input=, :term_buffer,
+               :term_buffer=, :prev_buffer, :prev_buffer=
+
+     def clone()
+       clone = super
+       clone.input = @input.clone
+       clone.term_info = @term_info.clone
+       clone.term_buffer = @term_buffer.clone
+       clone.prev_buffer = @prev_buffer.clone
+       return clone
+     end
+
+     def seek(pointer, position, term, term_info)
+       @input.seek(pointer)
+       @position = position
+       @term_buffer.term = term
+       @prev_buffer.reset()
+       @term_info.set!(term_info)
+     end
+
+     # Increments the enumeration to the next element. True if one exists.
+     def next?
+       @position += 1
+       if (@position > @size - 1)
+         @term_buffer.reset()
+         return false
+       end
+
+       @prev_buffer.set!(@term_buffer)
+
+       @term_buffer.read(@input, @field_infos)
+
+       @term_info.doc_freq = @input.read_vint()        # read doc freq
+       @term_info.freq_pointer += @input.read_vlong()  # read freq pointer
+       @term_info.prox_pointer += @input.read_vlong()  # read prox pointer
+
+       if (@format == -1)
+         # just read skip_offset in order to increment the file pointer;
+         # the value is never used since skip_to is switched off
+         if (!@is_index)
+           if (@term_info.doc_freq > @format_m1skip_interval)
+             @term_info.skip_offset = @input.read_vint()
+           end
+         end
+       else
+         if (@term_info.doc_freq >= @skip_interval)
+           @term_info.skip_offset = @input.read_vint()
+         end
+       end
+
+       if (@is_index)
+         @index_pointer += @input.read_vlong() # read index pointer
+       end
+
+       return true
+     end
+
+     # Optimized scan, without allocating new terms.
+     def scan_to(term)
+       if (@scratch == nil)
+         @scratch = TermBuffer.new()
+       end
+       @scratch.term = term
+       while (@scratch > @term_buffer and next?) do
+       end
+     end
+
+     # Returns the current Term in the enumeration.
+     # Initially invalid, valid after next? called for the first time.
+     def term
+       return @term_buffer.to_term()
+     end
+
+     # Returns the previous Term enumerated. Initially nil.
+     def prev
+       return @prev_buffer.to_term()
+     end
+
+     # Returns the current TermInfo in the enumeration.
+     # Initially invalid, valid after next? called for the first time.
+     def term_info
+       return @term_info.clone
+     end
+
+     # Sets the current TermInfo in the enumeration.
+     attr_writer :term_info
+     #def term_info=(ti)
+     #  return @term_info.set!(ti)
+     #end
+
+     # Returns the doc_freq from the current TermInfo in the enumeration.
+     # Initially invalid, valid after next? called for the first time.
+     def doc_freq
+       return term_info.doc_freq
+     end
+
+     # Returns the freq_pointer from the current TermInfo in the enumeration.
+     # Initially invalid, valid after next? called for the first time.
+     def freq_pointer
+       return term_info.freq_pointer
+     end
+
+     # Returns the prox_pointer from the current TermInfo in the enumeration.
+     # Initially invalid, valid after next? called for the first time.
+     def prox_pointer
+       return term_info.prox_pointer
+     end
+
+     # Closes the enumeration to further activity, freeing resources.
+     def close
+       @input.close()
+     end
+   end
+ end
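
A rough sketch (not part of the package) of walking the term dictionary with this enumerator: SegmentReader#terms from the previous file returns a SegmentTermEnum, and next? must return true before term, doc_freq and the other accessors hold valid values. The reader variable carries over from the earlier sketch, and Term is assumed to take (field, text) and expose field/text readers, as in the Lucene class it ports.

    # Enumerate every term in the segment together with its document frequency.
    term_enum = reader.terms
    while term_enum.next?
      t = term_enum.term
      puts "#{t.field}:#{t.text} appears in #{term_enum.doc_freq} docs"
    end
    term_enum.close

    # Or start part-way through the dictionary; the field and text given to
    # Term.new here are hypothetical.
    term_enum = reader.terms_from(Term.new("contents", "ferret"))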