ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,488 @@
+ require 'ferret/search/similarity'
+
+ module Ferret
+   module Index
+     #module Ferret::Index
+
+     require "monitor"
+
+     # An IndexWriter creates and maintains an index.
+     #
+     # The :create option to new determines whether a new index is created,
+     # or whether an existing index is opened for the addition of new
+     # documents.
+     #
+     # In either case, documents are added with the add_document method. When
+     # finished adding documents, close should be called.
+     #
+     # If an index will not have more documents added for a while and optimal
+     # search performance is desired, then the optimize method should be
+     # called before the index is closed.
+     #
+     # Opening an IndexWriter creates a lock file for the directory in use.
+     # Trying to open another IndexWriter on the same directory will lead to
+     # an IOError. The IOError is also raised if an IndexReader on the same
+     # directory is used to delete documents from the index.
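+     #
+     # A minimal lifecycle sketch (illustrative only; the path and +docs+
+     # are assumptions, not part of this file):
+     #
+     #   writer = IndexWriter.new("/tmp/index", :create => true)
+     #   docs.each { |doc| writer.add_document(doc) }
+     #   writer.optimize() # optional, for optimal search performance
+     #   writer.close()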
+     class IndexWriter
+       include MonitorMixin
+       include ObjectSpace
+
+       WRITE_LOCK_TIMEOUT = 1
+       COMMIT_LOCK_TIMEOUT = 10
+       WRITE_LOCK_NAME = "write.lock"
+       COMMIT_LOCK_NAME = "commit.lock"
+       DEFAULT_MERGE_FACTOR = 10
+       DEFAULT_MIN_MERGE_DOCS = 10
+       DEFAULT_MAX_MERGE_DOCS = 0x7fffffff
+       DEFAULT_MAX_FIELD_LENGTH = 10000
+       DEFAULT_TERM_INDEX_INTERVAL = 128
+
+       attr_accessor :use_compound_file, :similarity, :term_index_interval,
+                     :max_merge_docs, :max_field_length, :min_merge_docs, :info_stream
+       attr_reader :analyzer, :directory, :merge_factor, :segment_infos
+       alias :max_buffered_docs :min_merge_docs
+       alias :max_buffered_docs= :min_merge_docs=
+
+       def merge_factor=(mf)
+         raise ArgumentError, "merge factor cannot be less than 2" if (mf < 2)
+         @merge_factor = mf
+       end
+
+       # Constructs an IndexWriter for the index in +dir+.
+       # Text will be analyzed with +analyzer+. If +create+
+       # is true, then a new, empty index will be created in
+       # +dir+, replacing the index already there, if any.
+       # NOTE:: all options are passed in a hash.
+       #
+       # dir::                the index directory
+       # analyzer::           the analyzer to use. Defaults to StandardAnalyzer.
+       # create::             +true+ to create the index or overwrite the
+       #                      existing one; +false+ to append to the existing
+       #                      index
+       # create_if_missing::  +true+ to create the index if it's missing;
+       #                      +false+ to raise an IOError if it's missing
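+       #
+       # For example (an illustrative sketch; the path is an assumption):
+       #
+       #   writer = IndexWriter.new("/path/to/index",
+       #                            :create_if_missing => true,
+       #                            :analyzer => Ferret::Analysis::StandardAnalyzer.new)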
+       def initialize(dir, options = {})
+         super()
+         create = options[:create] || false
+         create_if_missing = options[:create_if_missing] || false
+
+         if dir.instance_of?(String)
+           @directory = FSDirectory.get_directory(dir, create || create_if_missing)
+         else
+           @directory = dir
+         end
+         @close_dir = options[:close_dir] || false
+         @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
+         @merge_factor = DEFAULT_MERGE_FACTOR
+         @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
+         @max_merge_docs = DEFAULT_MAX_MERGE_DOCS
+         @max_field_length = DEFAULT_MAX_FIELD_LENGTH
+         @term_index_interval = DEFAULT_TERM_INDEX_INTERVAL
+
+         @similarity = Search::Similarity.default
+         @segment_infos = SegmentInfos.new()
+         @ram_directory = Ferret::Store::RAMDirectory.new()
+
+         # Make sure that the lock is released when this object is destroyed.
+         define_finalizer(self, proc { |id| @write_lock.release() if @write_lock })
+
+         @write_lock = @directory.make_lock(WRITE_LOCK_NAME)
+         @write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock
+
+         @directory.synchronize() do # in- & inter-process sync
+           @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+             if (create)
+               @segment_infos.write(@directory)
+             else
+               begin
+                 @segment_infos.read(@directory)
+               rescue IOError => ioe
+                 if create_if_missing
+                   @segment_infos.write(@directory)
+                 else
+                   raise ioe
+                 end
+               end
+             end
+           end
+         end
+       end
+
+       # Flushes all changes to an index and closes all associated files.
+       def close()
+         synchronize() do
+           flush_ram_segments()
+           @ram_directory.close()
+           @write_lock.release() if @write_lock # release write lock
+           @write_lock = nil
+           if (@close_dir)
+             @directory.close()
+           end
+         end
+       end
+
+       # Returns the number of documents currently in this index.
+       def doc_count()
+         count = 0
+         synchronize() do
+           @segment_infos.each { |si| count += si.doc_count() }
+         end
+         return count
+       end
+
+       # Adds a document to this index, using the provided analyzer instead
+       # of the writer's default analyzer if one is given. If the document
+       # contains more than #max_field_length terms for a given field, the
+       # remainder are discarded.
+       def add_document(doc, analyzer = @analyzer)
+         dw = DocumentWriter.new(@ram_directory,
+                                 analyzer,
+                                 @similarity,
+                                 @max_field_length,
+                                 @term_index_interval)
+         dw.info_stream = @info_stream
+         segment_name = new_segment_name()
+         dw.add_document(segment_name, doc)
+         synchronize() do
+           @segment_infos << SegmentInfo.new(segment_name, 1, @ram_directory)
+           maybe_merge_segments()
+         end
+       end
+       alias :<< :add_document
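+
+       # For example (illustrative; +doc+ is assumed to be a
+       # Ferret::Document::Document built elsewhere):
+       #
+       #   writer.add_document(doc)
+       #   writer << another_doc # alias form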
+
+       def segments_counter()
+         return segment_infos.counter
+       end
+
+       # Merges all segments together into a single segment, optimizing an
+       # index for search.
+       def optimize()
+         synchronize() do
+           flush_ram_segments()
+           while (@segment_infos.size() > 1 ||
+                  (@segment_infos.size() == 1 &&
+                   (SegmentReader.has_deletions?(@segment_infos[0]) ||
+                    (@segment_infos[0].directory != @directory) ||
+                    (@use_compound_file &&
+                     (!SegmentReader.uses_compound_file?(@segment_infos[0]) ||
+                      SegmentReader.has_separate_norms?(@segment_infos[0]))))))
+             min_segment = @segment_infos.size() - @merge_factor
+             merge_segments(min_segment < 0 ? 0 : min_segment)
+           end
+         end
+       end
+
+       # Merges all segments from an array of indexes into this index.
+       #
+       # This may be used to parallelize batch indexing. A large document
+       # collection can be broken into sub-collections. Each sub-collection
+       # can be indexed in parallel, on a different thread, process or
+       # machine. The complete index can then be created by merging the
+       # sub-collection indexes with this method (see the sketch after this
+       # method).
+       #
+       # After this completes, the index is optimized.
+       def add_indexes(dirs)
+         synchronize() do
+           optimize() # start with zero or 1 seg
+
+           start = @segment_infos.size
+
+           dirs.each do |dir|
+             sis = SegmentInfos.new() # read infos from dir
+             sis.read(dir)
+             sis.each do |si|
+               @segment_infos << si
+             end
+           end
+
+           # merge newly added segments in log(n) passes
+           while (@segment_infos.size > start + @merge_factor)
+             (start + 1 ... @segment_infos.size).each do |base|
+               last = [@segment_infos.size(), (base + @merge_factor)].min
+               if (last - base > 1)
+                 merge_segments(base, last)
+               end
+             end
+           end
+
+           optimize() # final cleanup
+         end
+       end
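+
+       # Sketch of parallel batch indexing (illustrative; the paths are
+       # assumptions and each sub-index is assumed to have been built by a
+       # separate worker):
+       #
+       #   sub_dirs = ["/tmp/idx_a", "/tmp/idx_b"].collect do |path|
+       #     Ferret::Store::FSDirectory.get_directory(path, false)
+       #   end
+       #   writer.add_indexes(sub_dirs) # merges, then optimizes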
+
+       # Merges the provided indexes into this index.
+       # After this completes, the index is optimized.
+       # The provided IndexReaders are not closed.
+       def add_indexes_readers(readers)
+         synchronize() do
+           segments_to_delete = []
+           optimize() # start with zero or 1 seg
+
+           merged_name = new_segment_name()
+           merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)
+
+           if (@segment_infos.size() == 1) # add existing index, if any
+             s_reader = SegmentReader.new(@segment_infos[0])
+             merger << s_reader
+             segments_to_delete << s_reader
+           end
+
+           readers.each do |reader|
+             merger << reader
+           end
+
+           doc_count = merger.merge!() # merge 'em
+
+           @segment_infos.clear() # pop old infos & add new
+           @segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)
+
+           @directory.synchronize() do
+             @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+               @segment_infos.write(@directory) # commit changes
+               delete_segments(segments_to_delete)
+               return nil
+             end
+           end
+         end
+       end
+
+
+       private
+
+       # Use compound file setting. Defaults to true, minimizing the number
+       # of files used. Setting this to false may improve indexing
+       # performance, but may also cause file handle problems.
+       @use_compound_file = true
+
+       # The maximum number of terms that will be indexed for a single field
+       # in a document. This limits the amount of memory required for
+       # indexing, so that collections with very large files will not crash
+       # the indexing process by running out of memory.
+       #
+       # Note that this effectively truncates large documents, excluding
+       # from the index terms that occur further in the document. If you
+       # know your source documents are large, be sure to set this value
+       # high enough to accommodate the expected size. If you set it to a
+       # really big number, then the only limit is your memory, and you
+       # should anticipate a NoMemoryError.
+       #
+       # By default, no more than 10,000 terms will be indexed for a field.
+       @max_field_length = DEFAULT_MAX_FIELD_LENGTH
+
+       def new_segment_name()
+         # The name is "_" + seg_counter, with seg_counter rendered in base
+         # 36 (the equivalent of Character::MAX_RADIX in Java).
+         synchronize() do
+           seg_name = "_" + @segment_infos.counter.to_s(36)
+           @segment_infos.counter += 1
+           return seg_name
+         end
+       end
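+
+       # For example (illustrative): with @segment_infos.counter == 35 the
+       # next name is "_z" (35.to_s(36) #=> "z"), and the name after that
+       # is "_10" (36.to_s(36) #=> "10").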
+
+       # Determines how often segment indices are merged by add_document().
+       # With smaller values, less RAM is used while indexing, and searches
+       # on unoptimized indices are faster, but indexing speed is slower.
+       # With larger values, more RAM is used during indexing, and while
+       # searches on unoptimized indices are slower, indexing is faster.
+       # Thus larger values (> 10) are best for batch index creation, and
+       # smaller values (< 10) for indices that are interactively maintained.
+       #
+       # This must never be less than 2. The default value is 10.
+       @merge_factor = DEFAULT_MERGE_FACTOR
+
+       # Determines the minimal number of documents required before the
+       # buffered in-memory documents are merged and a new segment is
+       # created. Since documents are merged in a RAMDirectory, larger
+       # values give faster indexing. At the same time, merge_factor limits
+       # the number of files open in an FSDirectory.
+       #
+       # The default value is 10.
+       @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
+
+
+       # Determines the largest number of documents ever merged by
+       # add_document(). Small values (e.g., less than 10,000) are best for
+       # interactive indexing, as this limits the length of pauses while
+       # indexing to a few seconds. Larger values are best for batched
+       # indexing and speedier searches.
+       @max_merge_docs = DEFAULT_MAX_MERGE_DOCS
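+
+       # A tuning sketch for batch indexing (illustrative values only, not
+       # the defaults shipped here; +writer+ is assumed to exist):
+       #
+       #   writer.merge_factor = 100        # fewer, larger merges
+       #   writer.max_buffered_docs = 1000  # alias for min_merge_docs
+       #   writer.max_field_length = 100_000
+       #   writer.use_compound_file = false # more file handles, faster indexing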
+
+       # Merges all RAM-resident segments.
+       def flush_ram_segments()
+         min_segment = @segment_infos.size() - 1
+         doc_count = 0
+         while (min_segment >= 0 &&
+                (@segment_infos[min_segment]).directory == @ram_directory)
+           doc_count += @segment_infos[min_segment].doc_count
+           min_segment -= 1
+         end
+         if (min_segment < 0 || # add one FS segment?
+             (doc_count + @segment_infos[min_segment].doc_count) > @merge_factor ||
+             !(@segment_infos[@segment_infos.size - 1].directory == @ram_directory))
+           min_segment += 1
+         end
+         return if (min_segment >= @segment_infos.size()) # none to merge
+         merge_segments(min_segment)
+       end
+
+       # Incremental segment merger.
+       def maybe_merge_segments()
+         target_merge_docs = @min_merge_docs
+         while (target_merge_docs <= @max_merge_docs)
+           # find segments smaller than current target size
+           min_segment = @segment_infos.size() - 1
+           merge_docs = 0
+           while (min_segment >= 0)
+             si = @segment_infos[min_segment]
+             if (si.doc_count >= target_merge_docs)
+               break
+             end
+             merge_docs += si.doc_count
+             min_segment -= 1
+           end
+
+           if (merge_docs >= target_merge_docs) # found a merge to do
+             merge_segments(min_segment + 1)
+           else
+             break
+           end
+
+           target_merge_docs *= @merge_factor # increase target size
+         end
+       end
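+
+       # Worked example (illustrative, with the defaults merge_factor = 10
+       # and min_merge_docs = 10): add_document() buffers single-document
+       # RAM segments; once ten accumulate they are merged into one 10-doc
+       # segment, ten 10-doc segments later become one 100-doc segment, and
+       # so on, with target_merge_docs growing by a factor of @merge_factor
+       # on each pass until it exceeds @max_merge_docs.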
+
+       # Pops segments off of the @segment_infos stack down to min_segment,
+       # merges them, and pushes the merged index onto the top of the
+       # @segment_infos stack.
+       def merge_segments(min_segment, max_segment = @segment_infos.size)
+         segments_to_delete = []
+         merged_name = new_segment_name()
+         if @info_stream != nil
+           @info_stream.print("merging segments from #{min_segment} to #{(max_segment - 1)}\n")
+         end
+         merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)
+
+         (min_segment ... max_segment).each do |i|
+           si = @segment_infos[i]
+           if (@info_stream != nil)
+             @info_stream.print(" #{si.name} (#{si.doc_count} docs)\n")
+           end
+           reader = SegmentReader.new(si.directory, si, nil, false, false)
+           merger.add(reader)
+           if ((reader.directory() == @directory) || # if we own the directory
+               (reader.directory() == @ram_directory))
+             segments_to_delete << reader # queue segment for deletion
+           end
+         end
+
+         merged_doc_count = merger.merge()
+
+         if (@info_stream != nil)
+           @info_stream.print(" into #{merged_name} (#{merged_doc_count} docs)\n")
+         end
+
+         (max_segment - 1).downto(min_segment) { |i| @segment_infos.delete_at(i) }
+         #@segment_infos = @segment_infos[0,min_segment] + @segment_infos[max_segment...-1]
+
+         @segment_infos << SegmentInfo.new(merged_name, merged_doc_count, @directory)
+
+         # close readers before we attempt to delete now-obsolete segments
+         merger.close_readers()
+
+         @directory.synchronize() do
+           @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
+             @segment_infos.write(@directory) # commit before deleting
+             delete_segments(segments_to_delete) # delete now-unused segments
+             return nil
+           end
+         end
+         segments_to_delete.size.times { |i| segments_to_delete[i] = nil }
+       end
+
+       # Some operating systems (e.g. Windows) don't permit a file to be
+       # deleted while it is opened for read (e.g. by another process or
+       # thread). So we assume that when a delete fails it is because the
+       # file is open in another process, and queue the file for subsequent
+       # deletion.
+       def delete_segments(segment_readers)
+         deletable = []
+
+         try_to_delete_files(read_deleteable_files(), deletable)
+         segment_readers.each do |segment_reader|
+           if (segment_reader.directory() == @directory)
+             try_to_delete_files(segment_reader.file_names(), deletable)
+           else
+             # delete other files
+             delete_files(segment_reader.file_names(), segment_reader.directory())
+           end
+         end
+
+         write_deleteable_files(deletable) # note files we can't delete
+         # This is a great time to start the garbage collector as all of our
+         # RAM files have just become free.
+         GC.start
+
+         ##########################################################################
+         # objs = {}
+         # ObjectSpace.each_object do |obj|
+         #   objs[obj.class] ||= 0
+         #   objs[obj.class] += 1
+         # end
+         # File.open('objects.out','a+') do |fh|
+         #   fh.puts("____________________")
+         #   fh.puts("____________________")
+         #   objs.each_pair do |obj, count|
+         #     fh.puts "#{count}\t#{obj}"
+         #   end
+         # end
+         ##########################################################################
+       end
+
+       def delete_files(file_names, dir)
+         file_names.each do |file_name|
+           dir.delete(file_name)
+         end
+       end
+
+       def try_to_delete_files(file_names, deletable)
+         file_names.each do |file_name|
+           begin
+             @directory.delete(file_name) # try to delete each file
+           rescue IOError => e
+             if (@directory.exists?(file_name))
+               if (@info_stream != nil)
+                 @info_stream.print(e.to_s + " Will re-try later.")
+               end
+               deletable << file_name # add to deletable
+             end
+           end
+         end
+       end
+
+       def read_deleteable_files()
+         file_names = []
+         if (!@directory.exists?("deletable")) then return file_names end
+
+         input = @directory.open_input("deletable")
+         begin
+           file_count = input.read_int()
+           file_count.times do
+             file_names << input.read_string()
+           end
+         ensure
+           input.close()
+         end
+         return file_names
+       end
+
+       def write_deleteable_files(file_names)
+         output = @directory.create_output("deleteable.new")
+         begin
+           output.write_int(file_names.size())
+           file_names.each do |file_name|
+             output.write_string(file_name)
+           end
+         ensure
+           output.close()
+         end
+         @directory.rename("deleteable.new", "deletable")
+       end
+     end
+   end
+ end