ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,534 +0,0 @@
1
- require 'ferret/search/similarity'
2
-
3
- module Ferret
4
- module Index
5
- #module Ferret::Index
6
-
7
- require "monitor"
8
-
9
- # An IndexWriter creates and maintains an index.
10
- #
11
- # The third argument to new determines whether a new index is created,
12
- # or whether an existing index is opened for the addition of new documents.
13
- #
14
- # In either case, documents are added with the add_document method. When
15
- # finished adding documents, close should be called.
16
- #
17
- # If an index will not have more documents added for a while and optimal search
18
- # performance is desired, then the optimize method should be called before the
19
- # index is closed.
20
- #
21
- # Opening an IndexWriter creates a lock file for the directory in use.
22
- # Trying to open another IndexWriter on the same directory will lead to
23
- # an IOError. The IOError is also thrown if an IndexReader on the same
24
- # directory is used to delete documents from the index.
25
- class IndexWriter
26
- include MonitorMixin
27
- include ObjectSpace
28
-
29
- WRITE_LOCK_TIMEOUT = 1
30
- COMMIT_LOCK_TIMEOUT = 10
31
- WRITE_LOCK_NAME = "write"
32
- COMMIT_LOCK_NAME = "commit"
33
- DEFAULT_MERGE_FACTOR = 10
34
- DEFAULT_MIN_MERGE_DOCS = 10
35
- DEFAULT_MAX_MERGE_DOCS = 0x7fffffff
36
- DEFAULT_MAX_FIELD_LENGTH = 10000
37
- DEFAULT_TERM_INDEX_INTERVAL = 128
38
-
39
- attr_accessor :use_compound_file, :similarity, :term_index_interval,
40
- :max_merge_docs, :max_field_length, :min_merge_docs, :info_stream
41
- attr_reader :analyzer, :directory, :merge_factor, :segment_infos
42
- alias :max_buffered_docs :min_merge_docs
43
- alias :max_buffered_docs= :min_merge_docs=
44
-
45
- def merge_factor=(mf)
46
- raise ArgumentError, "merge factor cannot be less than 2" if (mf < 2)
47
- @merge_factor = mf
48
- end
49
-
50
- # Constructs an IndexWriter for the index in +dir+.
51
- # Text will be analyzed with +analyzer+. If +create+
52
- # is true, then a new, empty index will be created in
53
- # +dir+, replacing the index already there, if any.
54
- # NOTE:: all options are passed in a hash.
55
- #
56
- # dir:: the index directory
57
- #
58
- # == Options
59
- #
60
- # analyzer:: the analyzer to use. Defaults to StandardAnalyzer.
61
- # create:: +true+ to create the index or overwrite the existing
62
- # one +false+ to append to the existing index
63
- # create_if_missing:: +true+ to create the index if it's missing
64
- # +false+ to throw an IOError if it's missing
65
 - # close_dir:: This specifies whether you would like this class to close
66
- # the index directory when this class is closed. The
67
- # default is false.
68
- # use_compound_file:: Use a compound file to store the index. This is
69
- # slower than using multiple files but it prevents the
70
- # too many files open error. This defaults to true.
71
- def initialize(dir = nil, options = {})
72
- super()
73
- create = options[:create] || false
74
- create_if_missing = options[:create_if_missing] || false
75
-
76
- if dir.nil?
77
- @directory = Ferret::Store::RAMDirectory.new
78
- elsif dir.is_a?(String)
79
- @directory = Ferret::Store::FSDirectory.new(dir, create)
80
- else
81
- @directory = dir
82
- end
83
- @close_dir = options[:close_dir] || false
84
- @use_compound_file = (options[:use_compound_file] != false) # ie default true
85
- @analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
86
- @merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
87
- @min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
88
- @max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
89
- @max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
90
- @term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL
91
-
92
- @similarity = Search::Similarity.default
93
- @segment_infos = SegmentInfos.new()
94
- @ram_directory = Ferret::Store::RAMDirectory.new()
95
-
96
- # Make sure that the lock is released when this object is destroyed
97
-
98
- @write_lock = @directory.make_lock(WRITE_LOCK_NAME)
99
- @write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock
100
- define_finalizer(@write_lock, proc { |id| @write_lock.release() if @write_lock})
101
-
102
- @directory.synchronize() do # in- & inter-process sync
103
- @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
104
- if (create)
105
- @segment_infos.write(@directory)
106
- else
107
- begin
108
- @segment_infos.read(@directory)
109
- rescue Exception => e
110
- if options[:create_if_missing]
111
- @segment_infos.write(@directory)
112
- else
113
 - @write_lock.release() # release write lock
114
- raise e
115
- end
116
- end
117
- end
118
- end
119
- end
120
-
121
- @info_stream = nil
122
- end
123
-
124
- # Flushes all changes to an index and closes all associated files.
125
- def close()
126
- synchronize() do
127
- flush_ram_segments()
128
- @ram_directory.close()
129
- @write_lock.release() if @write_lock # release write lock
130
- @write_lock = nil
131
- if(@close_dir)
132
- @directory.close()
133
- end
134
- end
135
- end
136
-
137
- # Returns the number of documents currently in this index.
138
- def doc_count()
139
- synchronize() do
140
- count = 0
141
- @segment_infos.each { |si| count += si.doc_count() }
142
- return count
143
- end
144
- end
145
-
146
- # Adds a document to this index, using the provided analyzer instead of the
147
- # local analyzer if provided. If the document contains more than
148
- # #max_field_length terms for a given field, the remainder are
149
- # discarded.
150
- def add_document(doc, analyzer=@analyzer)
151
- dw = DocumentWriter.new(@ram_directory,
152
- analyzer,
153
- @similarity,
154
- @max_field_length,
155
- @term_index_interval)
156
- dw.info_stream = @info_stream
157
- segment_name = new_segment_name()
158
- dw.add_document(segment_name, doc)
159
- synchronize() do
160
- @segment_infos << SegmentInfo.new(segment_name, 1, @ram_directory)
161
- maybe_merge_segments()
162
- end
163
- end
164
- alias :<< :add_document
165
-
166
- def segments_counter()
167
- return segment_infos.counter
168
- end
169
-
170
- # Merges all segments together into a single segment, optimizing an index
171
- # for search.
172
- def optimize()
173
- synchronize() do
174
- flush_ram_segments()
175
- while (@segment_infos.size() > 1 ||
176
- (@segment_infos.size() == 1 &&
177
- (SegmentReader.has_deletions?(@segment_infos[0]) ||
178
- (@segment_infos[0].directory != @directory) ||
179
- (@use_compound_file &&
180
- (!SegmentReader.uses_compound_file?(@segment_infos[0]) ||
181
- SegmentReader.has_separate_norms?(@segment_infos[0]))))))
182
- min_segment = @segment_infos.size() - @merge_factor
183
- merge_segments(min_segment < 0 ? 0 : min_segment)
184
- end
185
- end
186
- end
187
-
188
- # Merges all segments from an array of indexes into this index.
189
- #
190
- # This may be used to parallelize batch indexing. A large document
191
- # collection can be broken into sub-collections. Each sub-collection can be
192
- # indexed in parallel, on a different thread, process or machine. The
193
- # complete index can then be created by merging sub-collection indexes
194
- # with this method.
195
- #
196
- # After this completes, the index is optimized.
197
- def add_indexes(dirs)
198
- synchronize() do
199
- optimize() # start with zero or 1 seg
200
-
201
- start = @segment_infos.size
202
-
203
- dirs.each do |dir|
204
- sis = SegmentInfos.new() # read infos from dir
205
- sis.read(dir)
206
- sis.each do |si|
207
- @segment_infos << si
208
- end
209
- end
210
-
211
- # merge newly added segments in log(n) passes
212
- while (@segment_infos.size > start + @merge_factor)
213
- (start+1 ... @segment_infos.size).each do |base|
214
- last = [@segment_infos.size(), (base + @merge_factor)].min
215
- if (last - base > 1)
216
- merge_segments(base, last);
217
- end
218
- end
219
- end
220
-
221
- optimize() # final cleanup
222
- end
223
- end
224
-
225
- # Merges the provided indexes into this index.
226
- # After this completes, the index is optimized.
227
- # The provided IndexReaders are not closed.
228
- def add_indexes_readers(readers)
229
- synchronize() do
230
- segments_to_delete = []
231
- optimize() # start with zero or 1 seg
232
-
233
- merged_name = new_segment_name()
234
- merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)
235
-
236
- if (@segment_infos.size() == 1) # add existing index, if any
237
- s_reader = SegmentReader.get(@segment_infos[0])
238
- merger << s_reader
239
- segments_to_delete << s_reader
240
- end
241
-
242
- readers.each do |reader|
243
- merger << reader
244
- end
245
-
246
- doc_count = merger.merge() # merge 'em
247
-
248
- @segment_infos.clear() # pop old infos & add new
249
- @segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)
250
-
251
- @directory.synchronize() do
252
- @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
253
- @segment_infos.write(@directory) # commit changes
254
- delete_segments(segments_to_delete)
255
- end
256
- end
257
-
258
- if @use_compound_file
259
- files_to_delete = merger.create_compound_file(merged_name + ".tmp")
260
- @directory.synchronize() do # in- & inter-process sync
261
- @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
262
- # make compound file visible for SegmentReaders
263
- @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
264
- # delete now unused files of segment
265
- delete_files_and_write_undeletable(files_to_delete)
266
- end
267
- end
268
- end
269
-
270
- optimize()
271
- end
272
- end
273
-
274
-
275
-
276
- private
277
-
278
- # Use compound file setting. Defaults to true, minimizing the number of
279
- # files used. Setting this to false may improve indexing performance, but
280
- # may also cause file handle problems.
281
- @use_compound_file = true
282
-
283
- # The maximum number of terms that will be indexed for a single field in a
284
- # document. This limits the amount of memory required for indexing, so that
285
- # collections with very large files will not crash the indexing process by
286
- # running out of memory.
287
- #
288
- # Note that this effectively truncates large documents, excluding from the
289
- # index terms that occur further in the document. If you know your source
290
 - # documents are large, be sure to set this value high enough to accommodate
291
- # the expected size. If you set it to a really big number, then the only limit
292
- # is your memory, but you should anticipate an OutOfMemoryError.
293
- #
294
- # By default, no more than 10,000 terms will be indexed for a field.
295
- @max_field_length = DEFAULT_MAX_FIELD_LENGTH
296
-
297
- def new_segment_name()
298
- # The name will be "_" + seg_counter where seg_counter is stored in
299
- # radix of 36 which is equal to MAX_RADIX in Java
300
- synchronize() do
301
- seg_name = "_" + @segment_infos.counter.to_s(36)
302
- @segment_infos.counter+=1
303
- return seg_name
304
- end
305
- end
306
-
307
- # Determines how often segment indices are merged by add_document(). With
308
- # smaller values, less RAM is used while indexing, and searches on
309
- # unoptimized indices are faster, but indexing speed is slower. With larger
310
- # values, more RAM is used during indexing, and while searches on unoptimized
311
- # indices are slower, indexing is faster. Thus larger values (> 10) are best
312
- # for batch index creation, and smaller values (< 10) for indices that are
313
- # interactively maintained.
314
- #
315
- # This must never be less than 2. The default value is 10.*/
316
- @merge_factor = DEFAULT_MERGE_FACTOR
317
-
318
- # Determines the minimal number of documents required before the buffered
319
 - # in-memory documents are merged and a new Segment is created.
320
 - # Since Documents are merged in an org.apache.lucene.store.RAMDirectory, a
321
- # large value gives faster indexing. At the same time, merge_factor limits
322
- # the number of files open in a FSDirectory.
323
- #
324
- # The default value is 10.*/
325
- @min_merge_docs = DEFAULT_MIN_MERGE_DOCS
326
-
327
-
328
- # Determines the largest number of documents ever merged by add_document().
329
- # Small values (e.g., less than 10,000) are best for interactive indexing,
330
- # as this limits the length of pauses while indexing to a few seconds.
331
- # Larger values are best for batched indexing and speedier searches.
332
- @max_merge_docs = DEFAULT_MAX_MERGE_DOCS
333
-
334
- # Merges all RAM-resident segments.
335
- def flush_ram_segments()
336
- min_segment = @segment_infos.size()-1
337
- doc_count = 0
338
- while (min_segment >= 0 &&
339
- (@segment_infos[min_segment]).directory == @ram_directory)
340
- doc_count += @segment_infos[min_segment].doc_count
341
- min_segment -= 1
342
- end
343
- if (min_segment < 0 || # add one FS segment?
344
- (doc_count + @segment_infos[min_segment].doc_count) > @merge_factor ||
345
- !(@segment_infos[@segment_infos.size-1].directory == @ram_directory))
346
- min_segment += 1
347
- end
348
- if (min_segment >= @segment_infos.size()) then
349
- return
350
- end # none to merge
351
- merge_segments(min_segment)
352
- end
353
-
354
- # Incremental segment merger.
355
- def maybe_merge_segments()
356
- target_merge_docs = @min_merge_docs
357
- while (target_merge_docs <= @max_merge_docs)
358
- # find segments smaller than current target size
359
- min_segment = @segment_infos.size() - 1
360
- merge_docs = 0
361
- while (min_segment >= 0)
362
- si = @segment_infos[min_segment]
363
- if (si.doc_count >= target_merge_docs)
364
- break
365
- end
366
- merge_docs += si.doc_count
367
- min_segment -= 1
368
- end
369
-
370
- if (merge_docs >= target_merge_docs) # found a merge to do
371
- merge_segments(min_segment + 1)
372
- else
373
- break
374
- end
375
-
376
- target_merge_docs *= @merge_factor # increase target size
377
- end
378
- end
379
-
380
- # Pops segments off of @segment_infos stack down to min_segment, merges them,
381
- # and pushes the merged index onto the top of the @segment_infos stack.
382
- def merge_segments(min_segment, max_segment = @segment_infos.size)
383
- segments_to_delete = []
384
- merged_name = new_segment_name()
385
- if @info_stream != nil
386
- @info_stream.print("merging segments from #{min_segment} " +
387
- "to #{(max_segment - 1)}\n")
388
- end
389
- merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)
390
-
391
- (min_segment ... max_segment).each do |i|
392
- si = @segment_infos[i]
393
- if (@info_stream != nil)
394
- @info_stream.print(" #{si.name} (#{si.doc_count} docs)\n")
395
- end
396
- reader = SegmentReader.new(si.directory, si, nil, false, false)
397
- merger.add(reader)
398
- if ((reader.directory() == @directory) || # if we own the directory
399
- (reader.directory() == @ram_directory))
400
- segments_to_delete << reader # queue segment for deletion
401
- end
402
- end
403
-
404
- merged_doc_count = merger.merge()
405
-
406
- if (@info_stream != nil)
407
- @info_stream.print(" into #{merged_name} (#{merged_doc_count.to_s} docs)\n")
408
- end
409
-
410
- (max_segment-1).downto(min_segment) {|i| @segment_infos.delete_at(i) }
411
-
412
- @segment_infos << SegmentInfo.new(merged_name, merged_doc_count, @directory)
413
-
414
- # close readers before we attempt to delete now-obsolete segments
415
- merger.close_readers()
416
-
417
- @directory.synchronize() do
418
- @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
419
- @segment_infos.write(@directory) # commit before deleting
420
- delete_segments(segments_to_delete) # delete now-unused segments
421
- end
422
- end
423
-
424
- if @use_compound_file
425
- files_to_delete = merger.create_compound_file(merged_name + ".tmp")
426
- @directory.synchronize() do # in- & inter-process sync
427
- @directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
428
- # make compound file visible for SegmentReaders
429
- @directory.rename(merged_name + ".tmp", merged_name + ".cfs")
430
- # delete now unused files of segment
431
- delete_files_and_write_undeletable(files_to_delete)
432
- end
433
- end
434
- end
435
-
436
- end
437
-
438
- # Some operating systems (e.g. Windows) don't permit a file to be
439
- # deleted while it is opened for read (e.g. by another process or
440
- # thread). So we assume that when a delete fails it is because the
441
- # file is open in another process, and queue the file for subsequent
442
- # deletion.
443
- def delete_segments(segment_readers)
444
- deletable = []
445
-
446
- try_to_delete_files(read_deleteable_files(), deletable)
447
- segment_readers.each do |segment_reader|
448
- if (segment_reader.directory() == @directory)
449
- try_to_delete_files(segment_reader.file_names(), deletable)
450
- else
451
- # delete other files
452
- delete_files(segment_reader.file_names(), segment_reader.directory())
453
- end
454
- end
455
-
456
- write_deleteable_files(deletable) # note files we can't delete
457
- # This is a great time to start the garbage collector as all of our
458
- # ram files have just become free
459
- #GC.start
460
-
461
- ##############################################################################
462
- # objs = {}
463
- # ObjectSpace.each_object do |obj|
464
- # objs[obj.class] ||= 0
465
- # objs[obj.class] += 1
466
- # end
467
- # File.open('objects.out','a+') do |fh|
468
- # fh.puts("____________________")
469
- # fh.puts("____________________")
470
- # objs.each_pair do |obj, count|
471
- # fh.puts "#{count}\t#{obj}"
472
- # end
473
- # end
474
- ##############################################################################
475
-
476
- end
477
-
478
- def delete_files_and_write_undeletable(files)
479
- deletable = []
480
- try_to_delete_files(read_deleteable_files(), deletable) # try to delete deleteable
481
- try_to_delete_files(files, deletable) # try to delete our files
482
- write_deleteable_files(deletable) # note files we can't delete
483
- end
484
-
485
- def delete_files(file_names, dir)
486
- file_names.each do |file_name|
487
- dir.delete(file_name)
488
- end
489
- end
490
-
491
- def try_to_delete_files(file_names, deletable)
492
- file_names.each do |file_name|
493
- begin
494
- @directory.delete(file_name) # try to delete each file
495
- rescue IOError => e
496
- if (@directory.exists?(file_name))
497
- if (@info_stream != nil) then @info_stream.print(e.to_s + " Will re-try later.") end
498
- deletable << file_name # add to deletable
499
- end
500
- end
501
- end
502
- end
503
-
504
- def read_deleteable_files()
505
- file_names = []
506
- if (!@directory.exists?("deletable")) then return file_names end
507
-
508
- input = @directory.open_input("deletable")
509
- begin
510
- file_count = input.read_int()
511
- file_count.times do
512
- file_names << input.read_string()
513
- end
514
- ensure
515
- input.close()
516
- end
517
- return file_names
518
- end
519
-
520
- def write_deleteable_files(file_names)
521
- output = @directory.create_output("deleteable.new")
522
- begin
523
- output.write_int(file_names.size())
524
- file_names.each do |file_name|
525
- output.write_string(file_name)
526
- end
527
- ensure
528
- output.close()
529
- end
530
- @directory.rename("deleteable.new", "deletable")
531
- end
532
- end
533
- end
534
- end