ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,380 @@
1
module Ferret::Index

  # An IndexReader over a single index segment.
  #
  # On construction the reader opens the segment's field infos, stored
  # fields, term dictionary, frequency (.frq) and proximity (.prx) streams,
  # the norms of every indexed field and, when the segment stores term
  # vectors, a term vectors reader. If a compound file (<segment>.cfs)
  # exists in the directory, these files are read through it; deletions
  # (<segment>.del) and separate norms (<segment>.sN) are always looked up in
  # the plain directory.
  class SegmentReader < IndexReader

    attr_reader :freq_stream, :prox_stream, :deleted_docs,
                :term_infos, :field_infos, :segment

    # Opens a SegmentReader for the segment described by +info+.
    #
    # info::  segment info; its +directory+ and +name+ are used
    # infos:: optional segment infos passed on to the reader; when given,
    #         the reader is created as owner
    # close:: passed through to IndexReader#initialize
    def SegmentReader.get(info, infos = nil, close = false)
      return SegmentReader.new(info.directory, info, infos, close, infos != nil)
    end

    # dir::       directory holding the segment files
    # info::      segment info; only +name+ is read here
    # seg_infos:: passed through to IndexReader#initialize
    # close::     passed through to IndexReader#initialize
    # owner::     passed through to IndexReader#initialize
    def initialize(dir, info, seg_infos, close, owner)
      super(dir, seg_infos, close, owner)
      @segment = info.name

      # Pending-change flags consulted by do_commit. They are initialised
      # here so do_commit never reads an unset instance variable.
      @deleted_docs = nil
      @deleted_docs_dirty = false
      @norms_dirty = false
      @undelete_all = false

      # Read through the compound file when the segment has one.
      @cfs_reader = nil
      cfs = directory
      if directory.exists?(@segment + '.cfs')
        @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
        cfs = @cfs_reader
      end

      @field_infos = FieldInfos.new(cfs, @segment + '.fnm')
      @fields_reader = FieldsReader.new(cfs, @segment, @field_infos)

      @term_infos = TermInfosReader.new(cfs, @segment, @field_infos)
      if SegmentReader.has_deletions?(info)
        # The deletions file lives in the plain directory, not in the cfs.
        @deleted_docs =
          Ferret::Utils::BitVector.read(directory, @segment + '.del')
      end

      @freq_stream = cfs.open_input(@segment + '.frq')
      @prox_stream = cfs.open_input(@segment + '.prx')
      @norms = {}
      @norms.extend(MonitorMixin)
      open_norms(cfs)

      if @field_infos.has_vectors?
        @tv_reader_orig = TermVectorsReader.new(cfs, @segment, @field_infos)
      end
    end

    # Writes pending deletions and dirty norms to the directory and clears
    # the pending-change flags.
    def do_commit()
      if @deleted_docs_dirty
        # Write to a temp file first, then rename over the .del file.
        @deleted_docs.write(@directory, @segment + '.tmp')
        @directory.rename(@segment + '.tmp', @segment + '.del')
      end
      if @undelete_all && @directory.exists?(@segment + '.del')
        @directory.delete(@segment + '.del')
      end
      if @norms_dirty
        @norms.each_value do |norm|
          if norm.dirty?
            norm.re_write(@directory, @segment, max_doc(), @cfs_reader)
          end
        end
      end
      @deleted_docs_dirty = false
      @norms_dirty = false
      @undelete_all = false
    end

    # Closes every stream and sub-reader opened by this reader.
    def do_close()
      # Drop the term vectors reader cached for the current thread.
      Thread.current["#{self.object_id}-tv_reader"] = nil
      @fields_reader.close()
      @term_infos.close()

      @freq_stream.close() if @freq_stream
      @prox_stream.close() if @prox_stream

      close_norms()

      @tv_reader_orig.close() if @tv_reader_orig
      @cfs_reader.close() if @cfs_reader
    end

    # True if a deletions file exists for the segment +si+.
    def SegmentReader.has_deletions?(si)
      return si.directory.exists?(si.name + ".del")
    end

    # True if this reader currently holds a deleted-docs bit vector.
    def has_deletions?()
      return @deleted_docs != nil
    end

    # True if a compound file exists for the segment +si+.
    def SegmentReader.uses_compound_file?(si)
      # Segment infos expose their directory as +directory+ (see
      # SegmentReader.get and SegmentReader.has_deletions?).
      return si.directory.exists?(si.name + ".cfs")
    end

    # True if the directory of segment +si+ lists any file whose name
    # starts with "<segment name>.s".
    def SegmentReader.has_separate_norms?(si)
      pattern = /^#{Regexp.escape(si.name)}\.s/
      return si.directory.list.any? { |f| f =~ pattern }
    end

    # Marks document +doc_num+ as deleted; written out by do_commit.
    def do_delete(doc_num)
      @deleted_docs ||= Ferret::Utils::BitVector.new
      @deleted_docs_dirty = true
      @undelete_all = false
      @deleted_docs.set(doc_num)
    end

    # Forgets all deletions; do_commit then removes the .del file.
    def do_undelete_all()
      @deleted_docs = nil
      @deleted_docs_dirty = false
      @undelete_all = true
    end

    # Returns the names of this segment's files that exist in the
    # directory: one candidate per known index extension, plus one norm
    # file per indexed field.
    def file_names()
      names = []

      IndexFileNames::INDEX_EXTENSIONS.each do |ext|
        name = @segment + "." + ext
        names << name if @directory.exists?(name)
      end

      # Norm files are named ".fN" without a compound file and ".sN"
      # (separate norms) with one.
      norm_ext = @cfs_reader.nil? ? ".f" : ".s"
      @field_infos.each_with_index do |fi, i|
        next unless fi.indexed?
        name = @segment + norm_ext + i.to_s
        names << name if @directory.exists?(name)
      end
      return names
    end

    # Returns the term enumeration of the underlying term infos reader.
    def terms()
      return @term_infos.terms()
    end

    # Returns the term enumeration of the underlying term infos reader,
    # positioned by +t+.
    def terms_from(t)
      return @term_infos.terms_from(t)
    end

    # Returns stored document +n+.
    # raises:: ArgumentError if document +n+ is deleted
    def get_document(n)
      synchronize do
        if deleted?(n)
          raise ArgumentError, "attempt to access a deleted document"
        end
        return @fields_reader.doc(n)
      end
    end

    # True if document +n+ is marked deleted in this reader.
    def deleted?(n)
      synchronize do
        return (@deleted_docs != nil && @deleted_docs.get(n))
      end
    end

    # Returns a new SegmentTermDocEnum over this reader.
    def term_docs()
      return SegmentTermDocEnum.new(self)
    end

    # Returns a new SegmentTermDocPosEnum over this reader.
    def term_positions()
      return SegmentTermDocPosEnum.new(self)
    end

    # Returns the document frequency recorded for term +t+, or 0 when the
    # term has no term info.
    def doc_freq(t)
      ti = @term_infos.get_term_info(t)
      return ti.nil? ? 0 : ti.doc_freq
    end

    # Returns max_doc minus the count of deleted documents.
    def num_docs()
      n = max_doc()
      n -= @deleted_docs.count() if @deleted_docs != nil
      return n
    end

    # Returns the size reported by the fields reader.
    def max_doc()
      return @fields_reader.size()
    end

    # See IndexReader#get_field_names
    #
    # Returns a Set with the name of every field that matches
    # +field_option+. The conditions are tested in order and the first one
    # that holds adds the field.
    def get_field_names(field_option)
      field_set = Set.new
      @field_infos.each do |fi|
        wanted =
          (field_option == IndexReader::FieldOption::ALL) ||
          (!fi.indexed? &&
           field_option == IndexReader::FieldOption::UNINDEXED) ||
          (fi.indexed? &&
           field_option == IndexReader::FieldOption::INDEXED) ||
          (fi.indexed? && fi.store_term_vector? == false &&
           field_option == IndexReader::FieldOption::INDEXED_NO_TERM_VECTOR) ||
          (fi.store_term_vector? == true &&
           fi.store_positions? == false &&
           fi.store_offsets? == false &&
           field_option == IndexReader::FieldOption::TERM_VECTOR) ||
          (fi.indexed? && fi.store_term_vector? &&
           field_option == IndexReader::FieldOption::INDEXED_WITH_TERM_VECTOR) ||
          (fi.store_positions? && fi.store_offsets? == false &&
           field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION) ||
          (fi.store_offsets? && fi.store_positions? == false &&
           field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET) ||
          (fi.store_offsets? && fi.store_positions? &&
           field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET)
        field_set.add(fi.name) if wanted
      end
      return field_set
    end

    # Returns the norms of +field+ as a string of max_doc bytes, or nil if
    # the field has no norms (it is not indexed). The bytes are read on the
    # first request and cached on the Norm afterwards.
    def get_norms(field)
      synchronize do
        norm = @norms[field]
        return nil if norm == nil         # not an indexed field

        if norm.bytes == nil              # value not yet read
          bytes = " " * max_doc()
          get_norms_into(field, bytes, 0)
          norm.bytes = bytes              # cache it
        end
        return norm.bytes
      end
    end

    # Sets the norm of document +doc+ in +field+ to +value+ and marks the
    # norm dirty so that do_commit re-writes it. Does nothing for a field
    # without norms.
    def do_set_norm(doc, field, value)
      norm = @norms[field]
      return if norm == nil               # not an indexed field

      norm.dirty = true
      @norms_dirty = true

      get_norms(field)[doc] = value
    end

    # Read norms into a pre-allocated array.
    #
    # Copies max_doc norm bytes of +field+ into +bytes+ starting at
    # +offset+, from the cache when available and from the norm stream
    # otherwise. Leaves +bytes+ untouched for a field without norms.
    def get_norms_into(field, bytes, offset)
      synchronize do
        norm = @norms[field]
        return if norm == nil

        if norm.bytes != nil              # can copy from cache
          bytes[offset, max_doc()] = norm.bytes[0, max_doc()]
          return
        end

        # Read from a clone so the shared stream's position is not moved.
        norm_stream = norm.is.clone()
        begin
          norm_stream.seek(0)
          norm_stream.read_bytes(bytes, offset, max_doc())
        ensure
          norm_stream.close()
        end
      end
    end

    # Opens a norm stream for every indexed field. A separate norms file
    # (<segment>.sN) in the plain directory takes precedence; otherwise
    # <segment>.fN is opened from +cfs_dir+.
    def open_norms(cfs_dir)
      @field_infos.each do |fi|
        next unless fi.indexed?

        file_name = @segment + ".s" + fi.number.to_s
        d = @directory
        unless d.exists?(file_name)
          file_name = @segment + ".f" + fi.number.to_s
          d = cfs_dir
        end
        @norms[fi.name] = Norm.new(d.open_input(file_name), fi.number)
      end
    end

    # Closes the input stream of every norm.
    def close_norms()
      @norms.synchronize do
        @norms.each_value { |norm| norm.is.close() }
      end
    end

    # Create a clone from the initial TermVectorsReader and store it
    # in the Thread
    # returns:: TermVectorsReader, or nil when this segment has no term
    #           vectors reader
    def get_term_vectors_reader()
      # Without an original reader there is nothing to clone.
      return nil if @tv_reader_orig.nil?

      key = "#{self.object_id}-tv_reader"
      tv_reader = Thread.current[key]
      if tv_reader == nil
        tv_reader = @tv_reader_orig.clone()
        Thread.current[key] = tv_reader
      end
      return tv_reader
    end

    # Return a term frequency vector for the specified document and field. The
    # vector returned contains term numbers and frequencies for all terms in
    # the specified field of this document, if the field had storeTermVector
    # flag set. If the flag was not set, the method returns nil.
    # raises:: IOException
    def get_term_vector(doc_number, field)
      # Check if this field is invalid or has no stored term vector
      fi = @field_infos[field]
      if fi.nil? || !fi.store_term_vector? || @tv_reader_orig.nil?
        return nil
      end

      term_vectors_reader = get_term_vectors_reader()
      return nil if term_vectors_reader == nil
      return term_vectors_reader.get_field_tv(doc_number, field)
    end

    # Return an array of term frequency vectors for the specified document.
    # The array contains a vector for each vectorized field in the document.
    # Each vector contains term numbers and frequencies for all terms
    # in a given vectorized field.
    # If no such fields existed, the method returns nil.
    # raises:: IOException
    def get_term_vectors(doc_number)
      return nil if @tv_reader_orig.nil?

      term_vectors_reader = get_term_vectors_reader()
      return nil if term_vectors_reader == nil
      return term_vectors_reader.get_tv(doc_number)
    end

    # Returns the directory this reader works on.
    def dir()
      return @directory
    end

    # The norms of one field: the input stream they are read from, the
    # field number, the cached bytes (nil until first read) and a dirty
    # flag that is set when a norm value has been changed.
    class Norm
      attr_reader :is
      attr_writer :dirty
      attr_accessor :bytes

      # True if the norms were modified since they were last written.
      def dirty?
        return @dirty
      end

      # is::     input stream positioned on the norms file
      # number:: field number, used to build the norms file name
      def initialize(is, number)
        @is = is
        @number = number
        @bytes = nil
        @dirty = false
      end

      # Writes the first +count+ cached bytes to a temp file and renames it
      # to the norms file of this field, then clears the dirty flag.
      def re_write(directory, segment, count, cfs_reader)
        # NOTE: norms are re-written in regular directory, not cfs
        out = directory.create_output(segment + ".tmp")
        begin
          out.write_bytes(@bytes, count)
        ensure
          out.close()
        end
        if cfs_reader == nil
          file_name = "#{segment}.f#{@number}"
        else
          # use a different file name if we have compound format
          file_name = "#{segment}.s#{@number}"
        end
        directory.rename(segment + ".tmp", file_name)
        @dirty = false
      end
    end
  end
end
@@ -0,0 +1,178 @@
1
module Ferret::Index

  # Enumerates the terms stored in a term infos stream, together with the
  # TermInfo (doc_freq, freq/prox pointers, skip offset) read for each term.
  # The same class reads both the term file and its index; for the index
  # (+is_index+ true) an index pointer is also accumulated per entry.
  class SegmentTermEnum < TermEnum

    INT_MAX = (2**31)-1

    attr_reader :field_infos, :size, :position, :index_pointer,
                :index_interval, :skip_interval

    # Reads the header of +input+ and leaves the enumeration positioned
    # before the first term (position -1).
    #
    # input::       stream to read from
    # field_infos:: passed to the term buffer when reading terms
    # is_index::    true when +input+ is the index of the term file
    # raises::      RuntimeError if the format version is lower than
    #               TermInfosWriter::FORMAT
    def initialize(input, field_infos, is_index)
      @input = input
      @field_infos = field_infos
      @is_index = is_index
      @position = -1

      @term_buffer = TermBuffer.new()
      @prev_buffer = TermBuffer.new()
      @scratch = nil # used for scanning
      @term_info = TermInfo.new()

      @index_pointer = 0
      @format_m1skip_interval = nil

      first_int = @input.read_int()

      if first_int >= 0
        # original-format file, without explicit format version number
        @format = 0
        @size = first_int

        # back-compatible settings
        @index_interval = 128
        @skip_interval = INT_MAX # switch off skip_to optimization
      else
        # we have a format version number
        @format = first_int

        # check that it is a format we can understand
        if @format < TermInfosWriter::FORMAT
          raise "Unknown format version:#{@format}"
        end

        @size = @input.read_long() # read the size

        if @format == -1
          unless @is_index
            @index_interval = @input.read_int()
            @format_m1skip_interval = @input.read_int()
          end
          # switch off skip_to optimization for file format prior to
          # 1.4rc2 in order to avoid a bug in skip_to implementation
          # of these versions
          @skip_interval = INT_MAX
        else
          @index_interval = @input.read_int()
          @skip_interval = @input.read_int()
        end
      end
    end

    # Accessors used only by #clone to give the copy its own state.
    attr_accessor :input, :term_buffer, :prev_buffer
    attr_writer :scratch
    protected :input, :input=, :term_buffer,
              :term_buffer=, :prev_buffer, :prev_buffer=, :scratch=

    # Returns a copy with its own input stream, term info and term buffers,
    # so the copy can be advanced independently of this enumeration.
    def clone()
      copy = super
      copy.input = @input.clone
      copy.term_info = @term_info.clone
      copy.term_buffer = @term_buffer.clone
      copy.prev_buffer = @prev_buffer.clone
      # The scan buffer must not be shared with the original; the copy
      # allocates its own on its first scan_to.
      copy.scratch = nil
      return copy
    end

    # Repositions the enumeration: seeks the input to +pointer+ and makes
    # +term+ / +term_info+ the current entry at +position+. The previous
    # term buffer is reset.
    def seek(pointer, position, term, term_info)
      @input.seek(pointer)
      @position = position
      @term_buffer.term = term
      @prev_buffer.reset()
      @term_info.set!(term_info)
    end

    # Increments the enumeration to the next element. True if one exists.
    def next?
      @position += 1
      if @position > @size - 1
        @term_buffer.reset()
        return false
      end

      @prev_buffer.set!(@term_buffer)

      @term_buffer.read(@input, @field_infos)

      @term_info.doc_freq = @input.read_vint()      # read doc freq
      @term_info.freq_pointer += @input.read_vlong() # read freq pointer
      @term_info.prox_pointer += @input.read_vlong() # read prox pointer

      if @format == -1
        # just read skip_offset in order to increment file pointer
        # value is never used since skip_to is switched off
        if !@is_index && @term_info.doc_freq > @format_m1skip_interval
          @term_info.skip_offset = @input.read_vint()
        end
      elsif @term_info.doc_freq >= @skip_interval
        @term_info.skip_offset = @input.read_vint()
      end

      if @is_index
        @index_pointer += @input.read_vlong() # read index pointer
      end

      return true
    end

    # Optimized scan, without allocating new terms.
    #
    # Advances while +term+ compares greater than the current term and
    # further entries exist.
    def scan_to(term)
      @scratch = TermBuffer.new() if @scratch == nil
      @scratch.term = term
      while (@scratch > @term_buffer && next?) do
      end
    end

    # Returns the current Term in the enumeration.
    # Initially invalid, valid after next() called for the first time.
    def term
      return @term_buffer.to_term()
    end

    # Returns the previous Term enumerated. Initially nil.
    def prev
      return @prev_buffer.to_term()
    end

    # Returns a copy of the current TermInfo in the enumeration.
    # Initially invalid, valid after next() called for the first time.
    def term_info
      return @term_info.clone
    end

    # Replaces the TermInfo object held by the enumeration with the given
    # one (the object itself is stored, not a copy).
    attr_writer :term_info

    # Returns the doc_freq from the current TermInfo in the enumeration.
    # Initially invalid, valid after next() called for the first time.
    def doc_freq
      # Read the field directly; there is no need to clone the TermInfo.
      return @term_info.doc_freq
    end

    # Returns the freq_pointer from the current TermInfo in the enumeration.
    # Initially invalid, valid after next() called for the first time.
    def freq_pointer
      return @term_info.freq_pointer
    end

    # Returns the prox_pointer from the current TermInfo in the enumeration.
    # Initially invalid, valid after next() called for the first time.
    def prox_pointer
      return @term_info.prox_pointer
    end

    # Closes the enumeration to further activity, freeing resources.
    def close
      @input.close()
    end
  end
end