ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,130 +0,0 @@
|
|
1
|
-
module Ferret
|
2
|
-
module Index
|
3
|
-
# Holds the info for one segment.
|
4
|
-
#
|
5
|
-
# ToDo: Does the dir really need to be stored here?
|
6
|
-
class SegmentInfo
  # name::      the segment's name
  # doc_count:: the document count recorded for the segment
  # directory:: the directory object that was handed to the constructor
  attr_accessor :name, :doc_count, :directory

  # name::      the segment's name
  # doc_count:: the document count recorded for the segment
  # dir::       stored as-is and exposed through #directory
  def initialize(name, doc_count, dir)
    @name = name
    @doc_count = doc_count
    @directory = dir
  end

  # Two infos are equal when both name and doc_count match; the directory
  # is deliberately not part of the comparison.
  #
  # Objects that do not expose name and doc_count (nil, Strings, ...) are
  # simply unequal instead of raising NoMethodError.
  def ==(o)
    return false unless o.respond_to?(:name) && o.respond_to?(:doc_count)
    o.name == @name && o.doc_count == @doc_count
  end
end
|
19
|
-
|
20
|
-
# An Array of SegmentInfo entries together with the version and counter
# values that are stored alongside them in the "segments" file.
class SegmentInfos < Array
  # for compatibility with Java Ferret files
  FORMAT = -1
  SEGMENT_FILENAME = "segments"
  TEMPORARY_SEGMENT_FILENAME = "segments.new"

  attr_reader :version   # counts how often the index has been modified
                         # by adding or deleting docs
  attr_accessor :counter # used to name new segments??

  # Current version number from segments file.
  #
  # directory:: must respond to exists? and open_input
  # returns::   0 when the segments file does not exist, otherwise the
  #             version stored in it
  # raises::    RuntimeError when the file declares a format older
  #             (more negative) than FORMAT
  def self.read_current_version(directory)
    return 0 if not directory.exists?(SEGMENT_FILENAME)

    # Locals, not class-level instance variables: this is scratch state
    # that must not outlive the call.
    format = 0
    version = 0
    input = directory.open_input(SEGMENT_FILENAME)
    begin
      format = input.read_int()
      if format < 0
        # Interpolate: String + Integer would raise TypeError instead of
        # reporting the offending format.
        raise "Unknown format version: #{format}" if format < FORMAT
        version = input.read_long() # read version
      end
    ensure
      input.close()
    end

    return version if format < 0

    # We cannot be sure about the format of the file.
    # Therefore we have to read the whole file and cannot simply
    # seek to the version entry.
    sis = SegmentInfos.new()
    sis.read(directory)
    return sis.version()
  end

  # Starts empty, with the version seeded from the current time
  # (seconds * 1000) and the counter at 0.
  def initialize()
    super()
    @version = Time.now.to_i * 1000
    @counter = 0
  end

  # Makes dup/clone copy the contained entries too, so the copy does not
  # share its elements with the original.
  def initialize_copy(o)
    super
    o.each_index {|i| self[i] = o[i].clone}
  end

  # Reads the segments file from +directory+, appending one SegmentInfo
  # per stored entry to this list and setting version and counter.
  #
  # raises:: RuntimeError when the file declares a format older
  #          (more negative) than FORMAT
  def read(directory)
    input = directory.open_input(SEGMENT_FILENAME)
    begin
      @format = input.read_int()
      if @format < 0 # file contains explicit format info
        # check that it is a format we can understand
        raise "Unknown format version: #{@format}" if @format < FORMAT
        @version = input.read_long()
        @counter = input.read_int()
      else # file is in old format without explicit format info
        @counter = @format
      end

      seg_count = input.read_int()
      seg_count.times do
        self << SegmentInfo.new(input.read_string(), input.read_int(), directory)
      end

      if @format >= 0
        # in old format the version number may be at the end of the file
        if input.pos() >= input.length()
          @version = 0 # old file format without version number
        else
          @version = input.read_long() # read version
        end
      end
    ensure
      input.close()
    end
  end

  # Writes this list to the temporary segments file and then renames it
  # over the real one. The version is incremented as part of the write.
  def write(directory)
    output = directory.create_output(TEMPORARY_SEGMENT_FILENAME)
    begin
      output.write_int(FORMAT)         # write FORMAT
      output.write_long(@version += 1) # every write changes the index
      output.write_int(@counter)       # write counter
      output.write_int(size())         # write infos
      each() do |si|
        output.write_string(si.name)
        output.write_int(si.doc_count)
      end
    ensure
      output.close()
    end

    # install new segment info
    directory.rename(TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME)
  end

  # Returns "\nSegmentInfos: <name:count,name:count>".
  # An empty list renders as "\nSegmentInfos: <>".
  def to_s()
    entries = map { |si| "#{si.name}:#{si.doc_count}" }
    "\nSegmentInfos: <" + entries.join(",") + ">"
  end
end
|
129
|
-
end
|
130
|
-
end
|
@@ -1,49 +0,0 @@
|
|
1
|
-
module Ferret
|
2
|
-
module Index
|
3
|
-
# Bundles a reader with its term enumerator and the base value supplied
# by the caller, plus lazily created helpers (postings, doc map).
class SegmentMergeInfo
  attr_reader :term_enum, :reader, :base, :term_buffer

  # base::      value exposed through #base
  # term_enum:: term enumerator; its term_buffer is captured here
  # reader::    the reader the postings and deletion info come from
  def initialize(base, term_enum, reader)
    @base = base
    @reader = reader
    @term_enum = term_enum
    @term_buffer = term_enum.term_buffer
  end

  # Returns the reader's term positions object, creating it on first use.
  def positions
    @postings ||= @reader.term_positions()
  end

  # Returns an array mapping each document number to its position among
  # the non-deleted documents (-1 for deleted ones), or nil when the
  # reader reports no deletions.
  def doc_map
    if @doc_map.nil? && @reader.has_deletions?()
      # build array which maps document numbers around deletions; it is
      # assigned only once complete, so a failure part-way through does
      # not leave a half-filled map behind
      live = 0
      @doc_map = Array.new(@reader.max_doc()) do |i|
        if @reader.deleted?(i)
          -1
        else
          live += 1
          live - 1
        end
      end
    end
    return @doc_map
  end

  # Advances the underlying term enumerator.
  def next?
    @term_enum.next?
  end

  # Closes the term enumerator and, if they were ever created, the
  # postings. The postings are closed and the reader reference dropped
  # even when closing the term enumerator raises.
  def close()
    begin
      @term_enum.close()
    ensure
      @postings.close() if @postings
      @reader = nil
    end
    nil
  end
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
@@ -1,16 +0,0 @@
|
|
1
|
-
module Ferret::Index
|
2
|
-
# Priority queue of merge entries ordered by term buffer, with ties
# broken by base.
class SegmentMergeQueue < Ferret::Utils::PriorityQueue
  # Returns true when sti_a sorts before sti_b: by term_buffer first,
  # and by base when the term buffers compare equal.
  def less_than(sti_a, sti_b)
    unless sti_a.term_buffer == sti_b.term_buffer
      return sti_a.term_buffer < sti_b.term_buffer
    end
    return sti_a.base < sti_b.base
  end

  # Closes every entry still held in the heap (skipping empty slots),
  # then empties the queue.
  def close()
    @heap.each do |entry|
      next unless entry
      entry.close
    end
    clear
  end
end
|
16
|
-
end
|
@@ -1,358 +0,0 @@
|
|
1
|
-
module Ferret::Index
|
2
|
-
|
3
|
-
# The SegmentMerger class combines two or more Segments, represented by
|
4
|
-
# an IndexReader (see #add), into a single Segment. After adding the
|
5
|
-
# appropriate readers, call the merge method to combine the segments.
|
6
|
-
#
|
7
|
-
# If the compoundFile flag is set, then the segments will be merged
|
8
|
-
# into a compound file.
|
9
|
-
class SegmentMerger

  # dir::  The Directory to merge the other segments into
  # name:: The name of the new segment
  # term_index_interval:: handed to the TermInfosWriter created by
  #                       #merge_terms
  def initialize(dir, name,
                 term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
    @directory = dir
    @segment = name
    @term_index_interval = term_index_interval
    @readers = []
    @field_infos = nil
    @freq_output = nil
    @prox_output = nil
    @term_infos_writer = nil
    @queue = nil
    @term_info = TermInfo.new()
    @skip_buffer = Ferret::Store::RAMDirectory::RAMIndexOutput.new(
                     Ferret::Store::RAMDirectory::RAMFile.new(""))
  end

  # Add an IndexReader to the collection of readers that are to be merged
  # reader:: the reader to add
  def add(reader)
    @readers << reader
  end
  alias :<< :add

  #
  # i:: The index of the reader to return
  # returns:: The ith reader to be merged
  def segment_reader(i)
    return @readers[i]
  end

  # Merges the readers specified by the #add method into the directory
  # passed to the constructor
  #
  # returns:: The number of documents that were merged
  # raises:: IOError
  def merge()
    value = merge_fields()
    merge_terms()
    merge_norms()
    merge_vectors() if @field_infos.has_vectors?
    return value
  end

  # close all IndexReaders that have been added. Should not be called
  # before merge().
  #
  # Every reader gets a close attempt even if an earlier one raises.
  #
  # raises:: IOError
  def close_readers()
    close_all(@readers)
    @readers
  end

  # Bundles the files of the merged segment into the compound file
  # +file_name+.
  #
  # returns:: the names of the files that were added
  def create_compound_file(file_name)
    cfs_writer = CompoundFileWriter.new(@directory, file_name)

    files = []

    # Basic files
    IndexFileNames::COMPOUND_EXTENSIONS.each do |ext|
      files << "#{@segment}.#{ext}"
    end

    # Field norm files
    @field_infos.each_with_index do |fi, i|
      if (fi.indexed? and not fi.omit_norms?)
        files << "#{@segment}.f#{i}"
      end
    end

    # Vector files
    if @field_infos.has_vectors?
      IndexFileNames::VECTOR_EXTENSIONS.each do |ext|
        files << "#{@segment}.#{ext}"
      end
    end

    # Now merge all added files
    files.each do |file|
      cfs_writer.add_file(file)
    end

    # Perform the merge
    cfs_writer.close

    return files
  end

  # Registers every name in +field_names+ as an indexed field with the
  # given term vector flags; norms are omitted for fields the reader has
  # no norms for.
  def add_indexed(reader, field_infos, field_names,
                  store_term_vectors,
                  store_position_with_term_vector,
                  store_offset_with_term_vector)
    field_names.each do |field|
      field_infos.add(field, true,
                      store_term_vectors,
                      store_position_with_term_vector,
                      store_offset_with_term_vector,
                      !reader.has_norms?(field))
    end
  end
  private :add_indexed


  #
  # returns:: The number of documents in all of the readers
  # raises:: IOError
  def merge_fields()
    @field_infos = FieldInfos.new() # merge field names
    doc_count = 0
    @readers.each do |reader|
      add_indexed(reader, @field_infos,
                  reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET),
                  true, true, true)
      add_indexed(reader, @field_infos,
                  reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION),
                  true, true, false)
      add_indexed(reader, @field_infos,
                  reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET),
                  true, false, true)
      add_indexed(reader, @field_infos,
                  reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR),
                  true, false, false)
      add_indexed(reader, @field_infos,
                  reader.get_field_names(IndexReader::FieldOption::INDEXED),
                  false, false, false)
      @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::UNINDEXED), false)
    end
    @field_infos.write_to_dir(@directory, @segment + ".fnm")

    # merge field values
    fields_writer = FieldsWriter.new(@directory, @segment, @field_infos)

    begin
      @readers.each do |reader|
        max_doc = reader.max_doc()
        max_doc.times do |j|
          if not reader.deleted?(j) # skip deleted docs
            fields_writer.add_document(reader.get_document(j))
            doc_count += 1
          end
        end
      end
    ensure
      fields_writer.close()
    end
    return doc_count
  end

  # Merge the TermVectors from each of the segments into the new one.
  # raises:: IOError
  def merge_vectors()
    term_vectors_writer = TermVectorsWriter.new(@directory, @segment, @field_infos)

    begin
      @readers.each do |reader|
        max_doc = reader.max_doc()
        max_doc.times do |doc_num|
          # skip deleted docs
          next if (reader.deleted?(doc_num))
          term_vectors_writer.add_all_doc_vectors(reader.get_term_vectors(doc_num))
        end
      end
    ensure
      term_vectors_writer.close()
    end
  end

  # Creates the .frq and .prx outputs, the term infos writer and the merge
  # queue, runs #merge_term_infos, and closes whatever was created.
  #
  # If creating one of them fails, only the ones that exist are closed, so
  # the original error reaches the caller instead of a NoMethodError on nil.
  def merge_terms()
    # Forget anything left over from an earlier call so the cleanup below
    # only ever sees objects created by this call.
    @freq_output = @prox_output = @term_infos_writer = @queue = nil
    begin
      @freq_output = @directory.create_output(@segment + ".frq")
      @prox_output = @directory.create_output(@segment + ".prx")
      @term_infos_writer =
        TermInfosWriter.new(@directory, @segment, @field_infos,
                            @term_index_interval)
      @skip_interval = @term_infos_writer.skip_interval
      @queue = SegmentMergeQueue.new(@readers.size())

      merge_term_infos()

    ensure
      close_all([@freq_output, @prox_output, @term_infos_writer, @queue])
    end
  end

  # Seeds the queue with one SegmentMergeInfo per reader, then repeatedly
  # pops all entries positioned on the same term, merges them, and pushes
  # back the entries that still have terms left.
  def merge_term_infos()
    base = 0
    @readers.each do |reader|
      term_enum = reader.terms()
      smi = SegmentMergeInfo.new(base, term_enum, reader)
      base += reader.num_docs()
      if (smi.next?)
        @queue.push(smi) # initialize @queue
      else
        smi.close()
      end
    end

    match = Array.new(@readers.size)

    while (@queue.size > 0)
      match_size = 0 # pop matching terms
      match[match_size] = @queue.pop
      match_size += 1
      term_buffer = match[0].term_buffer
      top = @queue.top

      while top and term_buffer == top.term_buffer
        match[match_size] = @queue.pop
        match_size += 1
        top = @queue.top
      end

      merge_term_info(match, match_size) # add new TermInfo

      while (match_size > 0)
        match_size -= 1
        smi = match[match_size]
        if (smi.next?)
          @queue.push(smi) # restore queue
        else
          smi.close() # done with a segment
        end
      end
    end
  end

  # Merge one term found in one or more segments. The array <code>smis</code>
  # contains segments that are positioned at the same term. <code>n</code>
  # is the number of cells in the array actually occupied.
  #
  # smis:: array of segments
  # n:: number of cells in the array actually occupied
  def merge_term_info(smis, n)

    freq_pointer = @freq_output.pos
    prox_pointer = @prox_output.pos

    df = append_postings(smis, n) # append posting data

    skip_pointer = write_skip()

    if (df > 0)
      # add an entry to the dictionary with pointers to prox and freq files
      @term_info.set_values!(df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer))
      @term_infos_writer.add(smis[0].term_buffer.term, @term_info)
    end
  end

  # Process postings from multiple segments all positioned on the
  # same term. Writes out merged entries into @freq_output and
  # the @prox_output streams.
  #
  # smis:: array of segments
  # n:: number of cells in the array actually occupied
  # returns:: number of documents across all segments where this term was found
  def append_postings(smis, n)
    last_doc = 0
    df = 0 # number of docs w/ term
    reset_skip()
    n.times do |i|
      smi = smis[i]
      postings = smi.positions
      base = smi.base
      doc_map = smi.doc_map

      postings.seek(smi.term_enum)
      while (postings.next?)
        doc = postings.doc()
        doc = doc_map[doc] if (doc_map != nil) # work around deletions
        doc += base                            # convert to merged space

        if (doc < last_doc)
          raise "docs out of order curent doc = " + doc.to_s +
            " and previous doc = " + last_doc.to_s
        end

        df += 1

        if ((df % @skip_interval) == 0)
          buffer_skip(last_doc)
        end

        doc_code = (doc - last_doc) << 1 # use low bit to flag freq=1
        last_doc = doc

        freq = postings.freq
        if (freq == 1)
          @freq_output.write_vint(doc_code | 1) # write doc & freq=1
        else
          @freq_output.write_vint(doc_code) # write doc
          @freq_output.write_vint(freq)     # write frequency in doc
        end

        last_position = 0 # write position deltas
        freq.times do |j|
          position = postings.next_position()
          @prox_output.write_vint(position - last_position)
          last_position = position
        end
      end
    end
    return df
  end

  # Clears the skip buffer and restarts delta tracking from the current
  # positions of the freq and prox outputs.
  def reset_skip()
    @skip_buffer.reset()
    @last_skip_doc = 0
    @last_skip_freq_pointer = @freq_output.pos
    @last_skip_prox_pointer = @prox_output.pos
  end

  # Buffers one skip entry: the deltas of +doc+ and of the freq and prox
  # output positions since the previous skip entry.
  def buffer_skip(doc)
    freq_pointer = @freq_output.pos
    prox_pointer = @prox_output.pos

    @skip_buffer.write_vint(doc - @last_skip_doc)
    @skip_buffer.write_vint(freq_pointer - @last_skip_freq_pointer)
    @skip_buffer.write_vint(prox_pointer - @last_skip_prox_pointer)

    @last_skip_doc = doc
    @last_skip_freq_pointer = freq_pointer
    @last_skip_prox_pointer = prox_pointer
  end

  # Copies the buffered skip entries into the freq output.
  #
  # returns:: the freq output position at which the skip data starts
  def write_skip()
    skip_pointer = @freq_output.pos
    @skip_buffer.write_to(@freq_output)
    return skip_pointer
  end

  # Writes one ".f<field number>" file per indexed field that keeps norms,
  # containing the norm of every non-deleted document of every reader.
  def merge_norms()
    @field_infos.each_with_index do |fi, i|
      if (fi.indexed? and not fi.omit_norms?)
        output = @directory.create_output(@segment + ".f" + i.to_s)
        begin
          @readers.each do |reader|
            max_doc = reader.max_doc()
            input = "0" * max_doc
            reader.get_norms_into(fi.name, input, 0)
            max_doc.times do |k|
              if not reader.deleted?(k)
                # NOTE(review): input[k] is an Integer byte only under
                # Ruby 1.8 String#[] semantics; on 1.9+ it is a
                # one-character String -- confirm what write_byte accepts.
                output.write_byte(input[k])
              end
            end
          end
        ensure
          output.close()
        end
      end
    end
  end

  # Closes every non-nil object in +closeables+, in order. The nested
  # ensure guarantees that a failing close does not stop the remaining
  # objects from being closed; the exception still propagates afterwards.
  def close_all(closeables)
    pending = closeables.compact
    return if pending.empty?
    begin
      pending.first.close()
    ensure
      close_all(pending[1..-1])
    end
  end
  private :close_all
end
|
358
|
-
end
|