ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,534 +0,0 @@
|
|
1
|
-
require 'ferret/search/similarity'
|
2
|
-
|
3
|
-
module Ferret
|
4
|
-
module Index
|
5
|
-
#module Ferret::Index
|
6
|
-
|
7
|
-
require "monitor"
|
8
|
-
|
9
|
-
# An IndexWriter creates and maintains an index.
|
10
|
-
#
|
11
|
-
# The +:create+ option passed to new determines whether a new index is created,
|
12
|
-
# or whether an existing index is opened for the addition of new documents.
|
13
|
-
#
|
14
|
-
# In either case, documents are added with the add_document method. When
|
15
|
-
# finished adding documents, close should be called.
|
16
|
-
#
|
17
|
-
# If an index will not have more documents added for a while and optimal search
|
18
|
-
# performance is desired, then the optimize method should be called before the
|
19
|
-
# index is closed.
|
20
|
-
#
|
21
|
-
# Opening an IndexWriter creates a lock file for the directory in use.
|
22
|
-
# Trying to open another IndexWriter on the same directory will lead to
|
23
|
-
# an IOError. The IOError is also thrown if an IndexReader on the same
|
24
|
-
# directory is used to delete documents from the index.
|
25
|
-
class IndexWriter
|
26
|
-
include MonitorMixin
|
27
|
-
include ObjectSpace
|
28
|
-
|
29
|
-
WRITE_LOCK_TIMEOUT = 1
|
30
|
-
COMMIT_LOCK_TIMEOUT = 10
|
31
|
-
WRITE_LOCK_NAME = "write"
|
32
|
-
COMMIT_LOCK_NAME = "commit"
|
33
|
-
DEFAULT_MERGE_FACTOR = 10
|
34
|
-
DEFAULT_MIN_MERGE_DOCS = 10
|
35
|
-
DEFAULT_MAX_MERGE_DOCS = 0x7fffffff
|
36
|
-
DEFAULT_MAX_FIELD_LENGTH = 10000
|
37
|
-
DEFAULT_TERM_INDEX_INTERVAL = 128
|
38
|
-
|
39
|
-
attr_accessor :use_compound_file, :similarity, :term_index_interval,
|
40
|
-
:max_merge_docs, :max_field_length, :min_merge_docs, :info_stream
|
41
|
-
attr_reader :analyzer, :directory, :merge_factor, :segment_infos
|
42
|
-
alias :max_buffered_docs :min_merge_docs
|
43
|
-
alias :max_buffered_docs= :min_merge_docs=
|
44
|
-
|
45
|
-
def merge_factor=(mf)
|
46
|
-
raise ArgumentError, "merge factor cannot be less than 2" if (mf < 2)
|
47
|
-
@merge_factor = mf
|
48
|
-
end
|
49
|
-
|
50
|
-
# Constructs an IndexWriter for the index in +dir+.
|
51
|
-
# Text will be analyzed with +analyzer+. If +create+
|
52
|
-
# is true, then a new, empty index will be created in
|
53
|
-
# +dir+, replacing the index already there, if any.
|
54
|
-
# NOTE:: all options are passed in a hash.
|
55
|
-
#
|
56
|
-
# dir:: the index directory
|
57
|
-
#
|
58
|
-
# == Options
|
59
|
-
#
|
60
|
-
# analyzer:: the analyzer to use. Defaults to StandardAnalyzer.
|
61
|
-
# create:: +true+ to create the index or overwrite the existing
|
62
|
-
# one; +false+ to append to the existing index
|
63
|
-
# create_if_missing:: +true+ to create the index if it's missing
|
64
|
-
# +false+ to throw an IOError if it's missing
|
65
|
-
# close_dir:: This specifies whether you would like this class to close
|
66
|
-
# the index directory when this class is closed. The
|
67
|
-
# default is false.
|
68
|
-
# use_compound_file:: Use a compound file to store the index. This is
|
69
|
-
# slower than using multiple files but it prevents the
|
70
|
-
# too many files open error. This defaults to true.
|
71
|
-
def initialize(dir = nil, options = {})
|
72
|
-
super()
|
73
|
-
create = options[:create] || false
|
74
|
-
create_if_missing = options[:create_if_missing] || false
|
75
|
-
|
76
|
-
if dir.nil?
|
77
|
-
@directory = Ferret::Store::RAMDirectory.new
|
78
|
-
elsif dir.is_a?(String)
|
79
|
-
@directory = Ferret::Store::FSDirectory.new(dir, create)
|
80
|
-
else
|
81
|
-
@directory = dir
|
82
|
-
end
|
83
|
-
@close_dir = options[:close_dir] || false
|
84
|
-
@use_compound_file = (options[:use_compound_file] != false) # ie default true
|
85
|
-
@analyzer = options[:analyzer] || Ferret::Analysis::StandardAnalyzer.new
|
86
|
-
@merge_factor = options[:merge_factor] || DEFAULT_MERGE_FACTOR
|
87
|
-
@min_merge_docs = options[:min_merge_docs] || DEFAULT_MIN_MERGE_DOCS
|
88
|
-
@max_merge_docs = options[:max_merge_docs] || DEFAULT_MAX_MERGE_DOCS
|
89
|
-
@max_field_length = options[:max_field_length] || DEFAULT_MAX_FIELD_LENGTH
|
90
|
-
@term_index_interval = options[:term_index_interval] || DEFAULT_TERM_INDEX_INTERVAL
|
91
|
-
|
92
|
-
@similarity = Search::Similarity.default
|
93
|
-
@segment_infos = SegmentInfos.new()
|
94
|
-
@ram_directory = Ferret::Store::RAMDirectory.new()
|
95
|
-
|
96
|
-
# Make sure that the lock is released when this object is destroyed
|
97
|
-
|
98
|
-
@write_lock = @directory.make_lock(WRITE_LOCK_NAME)
|
99
|
-
@write_lock.obtain(WRITE_LOCK_TIMEOUT) # obtain write lock
|
100
|
-
define_finalizer(@write_lock, proc { |id| @write_lock.release() if @write_lock})
|
101
|
-
|
102
|
-
@directory.synchronize() do # in- & inter-process sync
|
103
|
-
@directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
|
104
|
-
if (create)
|
105
|
-
@segment_infos.write(@directory)
|
106
|
-
else
|
107
|
-
begin
|
108
|
-
@segment_infos.read(@directory)
|
109
|
-
rescue Exception => e
|
110
|
-
if options[:create_if_missing]
|
111
|
-
@segment_infos.write(@directory)
|
112
|
-
else
|
113
|
-
@write_lock.release() # release write lock
|
114
|
-
raise e
|
115
|
-
end
|
116
|
-
end
|
117
|
-
end
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
@info_stream = nil
|
122
|
-
end
|
123
|
-
|
124
|
-
# Flushes all changes to an index and closes all associated files.
|
125
|
-
def close()
|
126
|
-
synchronize() do
|
127
|
-
flush_ram_segments()
|
128
|
-
@ram_directory.close()
|
129
|
-
@write_lock.release() if @write_lock # release write lock
|
130
|
-
@write_lock = nil
|
131
|
-
if(@close_dir)
|
132
|
-
@directory.close()
|
133
|
-
end
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
|
-
# Returns the number of documents currently in this index.
|
138
|
-
def doc_count()
|
139
|
-
synchronize() do
|
140
|
-
count = 0
|
141
|
-
@segment_infos.each { |si| count += si.doc_count() }
|
142
|
-
return count
|
143
|
-
end
|
144
|
-
end
|
145
|
-
|
146
|
-
# Adds a document to this index, using the provided analyzer instead of the
|
147
|
-
# local analyzer if provided. If the document contains more than
|
148
|
-
# #max_field_length terms for a given field, the remainder are
|
149
|
-
# discarded.
|
150
|
-
def add_document(doc, analyzer=@analyzer)
|
151
|
-
dw = DocumentWriter.new(@ram_directory,
|
152
|
-
analyzer,
|
153
|
-
@similarity,
|
154
|
-
@max_field_length,
|
155
|
-
@term_index_interval)
|
156
|
-
dw.info_stream = @info_stream
|
157
|
-
segment_name = new_segment_name()
|
158
|
-
dw.add_document(segment_name, doc)
|
159
|
-
synchronize() do
|
160
|
-
@segment_infos << SegmentInfo.new(segment_name, 1, @ram_directory)
|
161
|
-
maybe_merge_segments()
|
162
|
-
end
|
163
|
-
end
|
164
|
-
alias :<< :add_document
|
165
|
-
|
166
|
-
def segments_counter()
|
167
|
-
return segment_infos.counter
|
168
|
-
end
|
169
|
-
|
170
|
-
# Merges all segments together into a single segment, optimizing an index
|
171
|
-
# for search.
|
172
|
-
def optimize()
|
173
|
-
synchronize() do
|
174
|
-
flush_ram_segments()
|
175
|
-
while (@segment_infos.size() > 1 ||
|
176
|
-
(@segment_infos.size() == 1 &&
|
177
|
-
(SegmentReader.has_deletions?(@segment_infos[0]) ||
|
178
|
-
(@segment_infos[0].directory != @directory) ||
|
179
|
-
(@use_compound_file &&
|
180
|
-
(!SegmentReader.uses_compound_file?(@segment_infos[0]) ||
|
181
|
-
SegmentReader.has_separate_norms?(@segment_infos[0]))))))
|
182
|
-
min_segment = @segment_infos.size() - @merge_factor
|
183
|
-
merge_segments(min_segment < 0 ? 0 : min_segment)
|
184
|
-
end
|
185
|
-
end
|
186
|
-
end
|
187
|
-
|
188
|
-
# Merges all segments from an array of indexes into this index.
|
189
|
-
#
|
190
|
-
# This may be used to parallelize batch indexing. A large document
|
191
|
-
# collection can be broken into sub-collections. Each sub-collection can be
|
192
|
-
# indexed in parallel, on a different thread, process or machine. The
|
193
|
-
# complete index can then be created by merging sub-collection indexes
|
194
|
-
# with this method.
|
195
|
-
#
|
196
|
-
# After this completes, the index is optimized.
|
197
|
-
def add_indexes(dirs)
|
198
|
-
synchronize() do
|
199
|
-
optimize() # start with zero or 1 seg
|
200
|
-
|
201
|
-
start = @segment_infos.size
|
202
|
-
|
203
|
-
dirs.each do |dir|
|
204
|
-
sis = SegmentInfos.new() # read infos from dir
|
205
|
-
sis.read(dir)
|
206
|
-
sis.each do |si|
|
207
|
-
@segment_infos << si
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
# merge newly added segments in log(n) passes
|
212
|
-
while (@segment_infos.size > start + @merge_factor)
|
213
|
-
(start+1 ... @segment_infos.size).each do |base|
|
214
|
-
last = [@segment_infos.size(), (base + @merge_factor)].min
|
215
|
-
if (last - base > 1)
|
216
|
-
merge_segments(base, last);
|
217
|
-
end
|
218
|
-
end
|
219
|
-
end
|
220
|
-
|
221
|
-
optimize() # final cleanup
|
222
|
-
end
|
223
|
-
end
|
224
|
-
|
225
|
-
# Merges the provided indexes into this index.
|
226
|
-
# After this completes, the index is optimized.
|
227
|
-
# The provided IndexReaders are not closed.
|
228
|
-
def add_indexes_readers(readers)
|
229
|
-
synchronize() do
|
230
|
-
segments_to_delete = []
|
231
|
-
optimize() # start with zero or 1 seg
|
232
|
-
|
233
|
-
merged_name = new_segment_name()
|
234
|
-
merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)
|
235
|
-
|
236
|
-
if (@segment_infos.size() == 1) # add existing index, if any
|
237
|
-
s_reader = SegmentReader.get(@segment_infos[0])
|
238
|
-
merger << s_reader
|
239
|
-
segments_to_delete << s_reader
|
240
|
-
end
|
241
|
-
|
242
|
-
readers.each do |reader|
|
243
|
-
merger << reader
|
244
|
-
end
|
245
|
-
|
246
|
-
doc_count = merger.merge() # merge 'em
|
247
|
-
|
248
|
-
@segment_infos.clear() # pop old infos & add new
|
249
|
-
@segment_infos << SegmentInfo.new(merged_name, doc_count, @directory)
|
250
|
-
|
251
|
-
@directory.synchronize() do
|
252
|
-
@directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
|
253
|
-
@segment_infos.write(@directory) # commit changes
|
254
|
-
delete_segments(segments_to_delete)
|
255
|
-
end
|
256
|
-
end
|
257
|
-
|
258
|
-
if @use_compound_file
|
259
|
-
files_to_delete = merger.create_compound_file(merged_name + ".tmp")
|
260
|
-
@directory.synchronize() do # in- & inter-process sync
|
261
|
-
@directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
|
262
|
-
# make compound file visible for SegmentReaders
|
263
|
-
@directory.rename(merged_name + ".tmp", merged_name + ".cfs")
|
264
|
-
# delete now unused files of segment
|
265
|
-
delete_files_and_write_undeletable(files_to_delete)
|
266
|
-
end
|
267
|
-
end
|
268
|
-
end
|
269
|
-
|
270
|
-
optimize()
|
271
|
-
end
|
272
|
-
end
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
private
|
277
|
-
|
278
|
-
# Use compound file setting. Defaults to true, minimizing the number of
|
279
|
-
# files used. Setting this to false may improve indexing performance, but
|
280
|
-
# may also cause file handle problems.
|
281
|
-
@use_compound_file = true
|
282
|
-
|
283
|
-
# The maximum number of terms that will be indexed for a single field in a
|
284
|
-
# document. This limits the amount of memory required for indexing, so that
|
285
|
-
# collections with very large files will not crash the indexing process by
|
286
|
-
# running out of memory.
|
287
|
-
#
|
288
|
-
# Note that this effectively truncates large documents, excluding from the
|
289
|
-
# index terms that occur further in the document. If you know your source
|
290
|
-
# documents are large, be sure to set this value high enough to accommodate
|
291
|
-
# the expected size. If you set it to a really big number, then the only limit
|
292
|
-
# is your memory, but you should anticipate a NoMemoryError.
|
293
|
-
#
|
294
|
-
# By default, no more than 10,000 terms will be indexed for a field.
|
295
|
-
@max_field_length = DEFAULT_MAX_FIELD_LENGTH
|
296
|
-
|
297
|
-
def new_segment_name()
|
298
|
-
# The name will be "_" + seg_counter where seg_counter is stored in
|
299
|
-
# radix of 36 which is equal to MAX_RADIX in Java
|
300
|
-
synchronize() do
|
301
|
-
seg_name = "_" + @segment_infos.counter.to_s(36)
|
302
|
-
@segment_infos.counter+=1
|
303
|
-
return seg_name
|
304
|
-
end
|
305
|
-
end
|
306
|
-
|
307
|
-
# Determines how often segment indices are merged by add_document(). With
|
308
|
-
# smaller values, less RAM is used while indexing, and searches on
|
309
|
-
# unoptimized indices are faster, but indexing speed is slower. With larger
|
310
|
-
# values, more RAM is used during indexing, and while searches on unoptimized
|
311
|
-
# indices are slower, indexing is faster. Thus larger values (> 10) are best
|
312
|
-
# for batch index creation, and smaller values (< 10) for indices that are
|
313
|
-
# interactively maintained.
|
314
|
-
#
|
315
|
-
# This must never be less than 2. The default value is 10.
|
316
|
-
@merge_factor = DEFAULT_MERGE_FACTOR
|
317
|
-
|
318
|
-
# Determines the minimal number of documents required before the buffered
|
319
|
-
# in-memory documents are merged and a new Segment is created.
|
320
|
-
# Since Documents are merged in a Ferret::Store::RAMDirectory,
|
321
|
-
# a large value gives faster indexing. At the same time, merge_factor limits
|
322
|
-
# the number of files open in a FSDirectory.
|
323
|
-
#
|
324
|
-
# The default value is 10.
|
325
|
-
@min_merge_docs = DEFAULT_MIN_MERGE_DOCS
|
326
|
-
|
327
|
-
|
328
|
-
# Determines the largest number of documents ever merged by add_document().
|
329
|
-
# Small values (e.g., less than 10,000) are best for interactive indexing,
|
330
|
-
# as this limits the length of pauses while indexing to a few seconds.
|
331
|
-
# Larger values are best for batched indexing and speedier searches.
|
332
|
-
@max_merge_docs = DEFAULT_MAX_MERGE_DOCS
|
333
|
-
|
334
|
-
# Merges all RAM-resident segments.
|
335
|
-
def flush_ram_segments()
|
336
|
-
min_segment = @segment_infos.size()-1
|
337
|
-
doc_count = 0
|
338
|
-
while (min_segment >= 0 &&
|
339
|
-
(@segment_infos[min_segment]).directory == @ram_directory)
|
340
|
-
doc_count += @segment_infos[min_segment].doc_count
|
341
|
-
min_segment -= 1
|
342
|
-
end
|
343
|
-
if (min_segment < 0 || # add one FS segment?
|
344
|
-
(doc_count + @segment_infos[min_segment].doc_count) > @merge_factor ||
|
345
|
-
!(@segment_infos[@segment_infos.size-1].directory == @ram_directory))
|
346
|
-
min_segment += 1
|
347
|
-
end
|
348
|
-
if (min_segment >= @segment_infos.size()) then
|
349
|
-
return
|
350
|
-
end # none to merge
|
351
|
-
merge_segments(min_segment)
|
352
|
-
end
|
353
|
-
|
354
|
-
# Incremental segment merger.
|
355
|
-
def maybe_merge_segments()
|
356
|
-
target_merge_docs = @min_merge_docs
|
357
|
-
while (target_merge_docs <= @max_merge_docs)
|
358
|
-
# find segments smaller than current target size
|
359
|
-
min_segment = @segment_infos.size() - 1
|
360
|
-
merge_docs = 0
|
361
|
-
while (min_segment >= 0)
|
362
|
-
si = @segment_infos[min_segment]
|
363
|
-
if (si.doc_count >= target_merge_docs)
|
364
|
-
break
|
365
|
-
end
|
366
|
-
merge_docs += si.doc_count
|
367
|
-
min_segment -= 1
|
368
|
-
end
|
369
|
-
|
370
|
-
if (merge_docs >= target_merge_docs) # found a merge to do
|
371
|
-
merge_segments(min_segment + 1)
|
372
|
-
else
|
373
|
-
break
|
374
|
-
end
|
375
|
-
|
376
|
-
target_merge_docs *= @merge_factor # increase target size
|
377
|
-
end
|
378
|
-
end
|
379
|
-
|
380
|
-
# Pops segments off of @segment_infos stack down to min_segment, merges them,
|
381
|
-
# and pushes the merged index onto the top of the @segment_infos stack.
|
382
|
-
def merge_segments(min_segment, max_segment = @segment_infos.size)
|
383
|
-
segments_to_delete = []
|
384
|
-
merged_name = new_segment_name()
|
385
|
-
if @info_stream != nil
|
386
|
-
@info_stream.print("merging segments from #{min_segment} " +
|
387
|
-
"to #{(max_segment - 1)}\n")
|
388
|
-
end
|
389
|
-
merger = SegmentMerger.new(@directory, merged_name, @term_index_interval)
|
390
|
-
|
391
|
-
(min_segment ... max_segment).each do |i|
|
392
|
-
si = @segment_infos[i]
|
393
|
-
if (@info_stream != nil)
|
394
|
-
@info_stream.print(" #{si.name} (#{si.doc_count} docs)\n")
|
395
|
-
end
|
396
|
-
reader = SegmentReader.new(si.directory, si, nil, false, false)
|
397
|
-
merger.add(reader)
|
398
|
-
if ((reader.directory() == @directory) || # if we own the directory
|
399
|
-
(reader.directory() == @ram_directory))
|
400
|
-
segments_to_delete << reader # queue segment for deletion
|
401
|
-
end
|
402
|
-
end
|
403
|
-
|
404
|
-
merged_doc_count = merger.merge()
|
405
|
-
|
406
|
-
if (@info_stream != nil)
|
407
|
-
@info_stream.print(" into #{merged_name} (#{merged_doc_count.to_s} docs)\n")
|
408
|
-
end
|
409
|
-
|
410
|
-
(max_segment-1).downto(min_segment) {|i| @segment_infos.delete_at(i) }
|
411
|
-
|
412
|
-
@segment_infos << SegmentInfo.new(merged_name, merged_doc_count, @directory)
|
413
|
-
|
414
|
-
# close readers before we attempt to delete now-obsolete segments
|
415
|
-
merger.close_readers()
|
416
|
-
|
417
|
-
@directory.synchronize() do
|
418
|
-
@directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
|
419
|
-
@segment_infos.write(@directory) # commit before deleting
|
420
|
-
delete_segments(segments_to_delete) # delete now-unused segments
|
421
|
-
end
|
422
|
-
end
|
423
|
-
|
424
|
-
if @use_compound_file
|
425
|
-
files_to_delete = merger.create_compound_file(merged_name + ".tmp")
|
426
|
-
@directory.synchronize() do # in- & inter-process sync
|
427
|
-
@directory.make_lock(COMMIT_LOCK_NAME).while_locked(COMMIT_LOCK_TIMEOUT) do
|
428
|
-
# make compound file visible for SegmentReaders
|
429
|
-
@directory.rename(merged_name + ".tmp", merged_name + ".cfs")
|
430
|
-
# delete now unused files of segment
|
431
|
-
delete_files_and_write_undeletable(files_to_delete)
|
432
|
-
end
|
433
|
-
end
|
434
|
-
end
|
435
|
-
|
436
|
-
end
|
437
|
-
|
438
|
-
# Some operating systems (e.g. Windows) don't permit a file to be
|
439
|
-
# deleted while it is opened for read (e.g. by another process or
|
440
|
-
# thread). So we assume that when a delete fails it is because the
|
441
|
-
# file is open in another process, and queue the file for subsequent
|
442
|
-
# deletion.
|
443
|
-
def delete_segments(segment_readers)
|
444
|
-
deletable = []
|
445
|
-
|
446
|
-
try_to_delete_files(read_deleteable_files(), deletable)
|
447
|
-
segment_readers.each do |segment_reader|
|
448
|
-
if (segment_reader.directory() == @directory)
|
449
|
-
try_to_delete_files(segment_reader.file_names(), deletable)
|
450
|
-
else
|
451
|
-
# delete other files
|
452
|
-
delete_files(segment_reader.file_names(), segment_reader.directory())
|
453
|
-
end
|
454
|
-
end
|
455
|
-
|
456
|
-
write_deleteable_files(deletable) # note files we can't delete
|
457
|
-
# This is a great time to start the garbage collector as all of our
|
458
|
-
# ram files have just become free
|
459
|
-
#GC.start
|
460
|
-
|
461
|
-
##############################################################################
|
462
|
-
# objs = {}
|
463
|
-
# ObjectSpace.each_object do |obj|
|
464
|
-
# objs[obj.class] ||= 0
|
465
|
-
# objs[obj.class] += 1
|
466
|
-
# end
|
467
|
-
# File.open('objects.out','a+') do |fh|
|
468
|
-
# fh.puts("____________________")
|
469
|
-
# fh.puts("____________________")
|
470
|
-
# objs.each_pair do |obj, count|
|
471
|
-
# fh.puts "#{count}\t#{obj}"
|
472
|
-
# end
|
473
|
-
# end
|
474
|
-
##############################################################################
|
475
|
-
|
476
|
-
end
|
477
|
-
|
478
|
-
def delete_files_and_write_undeletable(files)
|
479
|
-
deletable = []
|
480
|
-
try_to_delete_files(read_deleteable_files(), deletable) # try to delete deleteable
|
481
|
-
try_to_delete_files(files, deletable) # try to delete our files
|
482
|
-
write_deleteable_files(deletable) # note files we can't delete
|
483
|
-
end
|
484
|
-
|
485
|
-
def delete_files(file_names, dir)
|
486
|
-
file_names.each do |file_name|
|
487
|
-
dir.delete(file_name)
|
488
|
-
end
|
489
|
-
end
|
490
|
-
|
491
|
-
def try_to_delete_files(file_names, deletable)
|
492
|
-
file_names.each do |file_name|
|
493
|
-
begin
|
494
|
-
@directory.delete(file_name) # try to delete each file
|
495
|
-
rescue IOError => e
|
496
|
-
if (@directory.exists?(file_name))
|
497
|
-
if (@info_stream != nil) then @info_stream.print(e.to_s + " Will re-try later.") end
|
498
|
-
deletable << file_name # add to deletable
|
499
|
-
end
|
500
|
-
end
|
501
|
-
end
|
502
|
-
end
|
503
|
-
|
504
|
-
def read_deleteable_files()
|
505
|
-
file_names = []
|
506
|
-
if (!@directory.exists?("deletable")) then return file_names end
|
507
|
-
|
508
|
-
input = @directory.open_input("deletable")
|
509
|
-
begin
|
510
|
-
file_count = input.read_int()
|
511
|
-
file_count.times do
|
512
|
-
file_names << input.read_string()
|
513
|
-
end
|
514
|
-
ensure
|
515
|
-
input.close()
|
516
|
-
end
|
517
|
-
return file_names
|
518
|
-
end
|
519
|
-
|
520
|
-
def write_deleteable_files(file_names)
|
521
|
-
output = @directory.create_output("deleteable.new")
|
522
|
-
begin
|
523
|
-
output.write_int(file_names.size())
|
524
|
-
file_names.each do |file_name|
|
525
|
-
output.write_string(file_name)
|
526
|
-
end
|
527
|
-
ensure
|
528
|
-
output.close()
|
529
|
-
end
|
530
|
-
@directory.rename("deleteable.new", "deletable")
|
531
|
-
end
|
532
|
-
end
|
533
|
-
end
|
534
|
-
end
|