ferret 0.9.6 → 0.10.0
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
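The listing above shows the pure-Ruby index, search, and store internals under data/lib/ferret/ being removed while the C extension under data/ext/ grows substantially (index.c, r_index.c, r_search.c and friends), so the 0.10.0 release appears to move the core engine into C. The public entry point in data/lib/ferret/index.rb remains. For orientation only, a minimal sketch of that high-level API, assuming the documented Index.new / << / search_each interface; the path and field names below are invented for illustration and are not taken from this diff:

  require 'ferret'

  # Open (or create) an index on disk and add a document as a simple hash.
  index = Ferret::Index::Index.new(:path => '/tmp/ferret_example')   # hypothetical path
  index << {:title => 'Programming Ruby', :content => 'the pickaxe book'}

  # Query and iterate over matching document ids with their scores.
  index.search_each('content:"pickaxe"') do |doc_id, score|
    puts "doc #{doc_id} scored #{score}"
  end

The deleted internals (SegmentReader, SegmentTermEnum, and the rest of lib/ferret/index/) sat below this public interface, which is why they could be replaced without changing the listing for data/lib/ferret/index.rb itself.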
data/lib/ferret/index/segment_reader.rb
@@ -1,412 +0,0 @@
-module Ferret::Index
-
-  # FIXME: Describe class +SegmentReader+ here.
-  #
-  class SegmentReader < IndexReader
-
-    attr_reader :freq_stream, :prox_stream, :deleted_docs,
-                :term_infos, :field_infos, :segment
-
-    def SegmentReader.get(info, infos = nil, close = false)
-      return SegmentReader.new(info.directory, info, infos, close, infos!=nil)
-    end
-
-    def initialize(dir, info, seg_infos, close, owner)
-      super(dir, seg_infos, close, owner)
-      @segment = info.name
-
-      @cfs_reader = nil
-      dir = directory
-      #if directory.exists?(@segment + '.cfs') then
-      if SegmentReader.uses_compound_file?(info)
-        @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
-        dir = @cfs_reader
-      end
-
-      @field_infos = FieldInfos.new(dir, @segment + '.fnm')
-      @fields_reader = FieldsReader.new(dir, @segment, @field_infos)
-
-      @term_infos = TermInfosReader.new(dir, @segment, @field_infos)
-      @deleted_docs = nil
-      @deleted_docs_dirty = false
-      if SegmentReader.has_deletions?(info) then
-        @deleted_docs =
-          Ferret::Utils::BitVector.read(directory, @segment + '.del')
-      end
-
-      @freq_stream = dir.open_input(@segment + '.frq')
-      @prox_stream = dir.open_input(@segment + '.prx')
-      @norms = {}
-      @norms.extend(MonitorMixin)
-      @norms_dirty = false
-      open_norms(dir)
-
-      @tv_reader_orig = nil
-      if @field_infos.has_vectors? then
-        @tv_reader_orig = TermVectorsReader.new(dir, @segment, @field_infos)
-      end
-    end
-
-    def do_commit()
-      if (@deleted_docs_dirty) # re-write deleted
-        @deleted_docs.write(@directory, @segment + '.tmp')
-        @directory.rename(@segment + '.tmp', @segment + '.del')
-      end
-      if(@undelete_all and @directory.exists?(@segment + '.del'))
-        @directory.delete(@segment + '.del')
-      end
-      if (@norms_dirty) # re-write norms
-        @norms.each_value do |norm|
-          if norm.dirty?
-            norm.re_write(@directory, @segment, max_doc(), @cfs_reader)
-          end
-        end
-      end
-      @deleted_docs_dirty = false
-      @norms_dirty = false
-      @undelete_all = false
-    end
-
-    def do_close()
-      # clear the cache
-      Thread.current["#{self.object_id}-#{@segment}-tv_reader"] = nil
-
-      @fields_reader.close()
-      @term_infos.close()
-
-      @freq_stream.close() if @freq_stream
-      @prox_stream.close() if @prox_stream
-
-      close_norms()
-
-      @tv_reader_orig.close() if @tv_reader_orig
-      @cfs_reader.close() if @cfs_reader
-    end
-
-    def SegmentReader.has_deletions?(si)
-      return si.directory.exists?(si.name + ".del")
-    end
-
-    def has_deletions?()
-      return @deleted_docs != nil
-    end
-
-
-    def SegmentReader.uses_compound_file?(si)
-      return si.directory.exists?(si.name + ".cfs")
-    end
-
-    def SegmentReader.has_separate_norms?(si)
-      si.directory.each {|f| return true if f =~ /^#{si.name}\.s/}
-      return false
-    end
-
-    def do_delete(doc_num)
-      if (@deleted_docs == nil)
-        @deleted_docs = Ferret::Utils::BitVector.new
-      end
-      @deleted_docs_dirty = true
-      @undelete_all = false
-      @deleted_docs.set(doc_num)
-    end
-
-    def do_undelete_all()
-      @deleted_docs = nil
-      @deleted_docs_dirty = false
-      @undelete_all = true
-    end
-
-    def file_names()
-      file_names = []
-
-      IndexFileNames::INDEX_EXTENSIONS.each do |ext|
-        name = @segment + "." + ext
-        if (@directory.exists?(name))
-          file_names << name
-        end
-      end
-
-      @field_infos.each_with_index do |fi, i|
-        if (fi.indexed? and not fi.omit_norms?)
-          if @cfs_reader.nil?
-            name = "#{@segment}.f#{i}"
-          else
-            name = "#{@segment}.s#{i}"
-          end
-          if (@directory.exists?(name))
-            file_names << name
-          end
-        end
-      end
-      return file_names
-    end
-
-    def terms()
-      return @term_infos.terms()
-    end
-
-    def terms_from(t)
-      return @term_infos.terms_from(t)
-    end
-
-    def get_document(n)
-      synchronize do
-        if deleted?(n)
-          raise ArgumentError, "attempt to access a deleted document"
-        end
-        return @fields_reader.doc(n)
-      end
-    end
-
-    def deleted?(n)
-      synchronize do
-        return (@deleted_docs != nil and @deleted_docs.get(n))
-      end
-    end
-
-    def term_docs()
-      return SegmentTermDocEnum.new(self)
-    end
-
-    def term_positions()
-      return SegmentTermDocPosEnum.new(self)
-    end
-
-    def doc_freq(t)
-      ti = @term_infos.get_term_info(t)
-      if (ti != nil)
-        return ti.doc_freq
-      else
-        return 0
-      end
-    end
-
-    def num_docs()
-      n = max_doc()
-      if (@deleted_docs != nil)
-        n -= @deleted_docs.count()
-      end
-      return n
-    end
-
-    def max_doc()
-      return @fields_reader.size()
-    end
-
-    # See IndexReader#get_field_names
-    def get_field_names(field_option = IndexReader::FieldOption::ALL)
-      field_set = Set.new
-      @field_infos.each do |fi|
-        if (field_option == IndexReader::FieldOption::ALL)
-          field_set.add(fi.name)
-        elsif (!fi.indexed? and field_option == IndexReader::FieldOption::UNINDEXED)
-          field_set.add(fi.name)
-        elsif (fi.indexed? and field_option == IndexReader::FieldOption::INDEXED)
-          field_set.add(fi.name)
-        elsif (fi.indexed? and fi.store_term_vector? == false and
-               field_option == IndexReader::FieldOption::INDEXED_NO_TERM_VECTOR)
-          field_set.add(fi.name)
-        elsif (fi.store_term_vector? == true and
-               fi.store_positions? == false and
-               fi.store_offsets? == false and
-               field_option == IndexReader::FieldOption::TERM_VECTOR)
-          field_set.add(fi.name)
-        elsif (fi.indexed? and fi.store_term_vector? and
-               field_option == IndexReader::FieldOption::INDEXED_WITH_TERM_VECTOR)
-          field_set.add(fi.name)
-        elsif (fi.store_positions? and fi.store_offsets? == false and
-               field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION)
-          field_set.add(fi.name)
-        elsif (fi.store_offsets? and fi.store_positions? == false and
-               field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET)
-          field_set.add(fi.name)
-        elsif (fi.store_offsets? and fi.store_positions? and
-               field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET)
-          field_set.add(fi.name)
-        end
-      end
-      return field_set
-    end
-
-    def has_norms?(field)
-      return @norms.has_key?(field)
-    end
-
-    def SegmentReader.create_fake_norms(size)
-      Array.new(size, 1).pack("C*")
-    end
-
-    def fake_norms()
-      return @ones ||= SegmentReader.create_fake_norms(max_doc())
-    end
-
-    def get_norms(field)
-      synchronize do
-        norm = @norms[field]
-        if (norm == nil) # not an indexed field or omit norms
-          return nil
-        end
-        if (norm.bytes == nil) # value not yet read
-          bytes = " " * max_doc()
-          get_norms_into(field, bytes, 0)
-          norm.bytes = bytes # cache it
-        end
-        return norm.bytes
-      end
-    end
-
-    def do_set_norm(doc, field, value)
-      norm = @norms[field]
-      if (norm == nil) # not an indexed field
-        return
-      end
-      norm.dirty = true # mark it dirty
-      @norms_dirty = true
-
-      get_norms(field)[doc] = value # set the value
-    end
-
-    # Read norms into a pre-allocated array.
-    def get_norms_into(field, bytes, offset)
-      synchronize do
-        norm = @norms[field]
-        if (norm.nil?)
-          bytes[offset, max_doc()] = fake_norms[0, max_doc()]
-          return
-        end
-
-        if (norm.bytes != nil) # can copy from cache
-          bytes[offset, max_doc()] = norm.bytes[0, max_doc()]
-          return
-        end
-
-        norm_stream = norm.is.clone()
-        begin # read from disk
-          norm_stream.seek(0)
-          norm_stream.read_bytes(bytes, offset, max_doc())
-        ensure
-          norm_stream.close()
-        end
-      end
-    end
-
-    def open_norms(cfs_dir)
-      @field_infos.each do |fi|
-        if (fi.indexed? and not fi.omit_norms?)
-          # look first if there are separate norms in compound format
-          file_name = @segment + ".s" + fi.number.to_s
-          d = @directory
-          if not d.exists?(file_name)
-            file_name = @segment + ".f" + fi.number.to_s
-            d = cfs_dir
-          end
-          @norms[fi.name] = Norm.new(d.open_input(file_name), fi.number)
-        end
-      end
-    end
-
-    def close_norms()
-      @norms.synchronize do
-        @norms.each_value {|norm| norm.is.close()}
-      end
-    end
-
-    # Create a clone from the initial TermVectorsReader and store it
-    # in the Thread
-    # returns:: TermVectorsReader
-    def get_term_vectors_reader()
-      #tvr_cache = Thread.current["tv_reader"]
-      #if (tvr_cache == nil)
-      #  tvr_cache = Thread.current["tv_reader"] = Ferret::Utils::WeakKeyHash.new
-      #end
-      #tvr_cache.synchronize do
-      #  tv_reader = tvr_cache[self]
-      #  if tv_reader == nil
-      #    tv_reader = @tv_reader_orig.clone()
-      #    tvr_cache[self] = tv_reader
-      #  end
-      #  return tv_reader
-      #end
-      tv_reader = Thread.current.get_local(self)
-      if tv_reader.nil?
-        tv_reader = @tv_reader_orig.clone()
-        Thread.current.set_local(self, tv_reader)
-      end
-      return tv_reader
-    end
-
-    # Return a term frequency vector for the specified document and field. The
-    # vector returned contains term numbers and frequencies for all terms in
-    # the specified field of this document, if the field had storeTermVector
-    # flag set. If the flag was not set, the method returns nil.
-    # raises:: IOException
-    def get_term_vector(doc_number, field)
-      # Check if this field is invalid or has no stored term vector
-      fi = @field_infos[field]
-      if fi.nil? or not fi.store_term_vector? or @tv_reader_orig.nil?
-        return nil
-      end
-
-      term_vectors_reader = get_term_vectors_reader()
-      if (term_vectors_reader == nil)
-        return nil
-      end
-      return term_vectors_reader.get_field_tv(doc_number, field)
-    end
-
-
-    # Return an array of term frequency vectors for the specified document.
-    # The array contains a vector for each vectorized field in the document.
-    # Each vector contains term numbers and frequencies for all terms
-    # in a given vectorized field.
-    # If no such fields existed, the method returns nil.
-    # raises:: IOException
-    def get_term_vectors(doc_number)
-      if @tv_reader_orig.nil?
-        return nil
-      end
-      term_vectors_reader = get_term_vectors_reader()
-      if (term_vectors_reader == nil)
-        return nil
-      end
-      return term_vectors_reader.get_tv(doc_number)
-    end
-
-    def dir()
-      return @directory
-    end
-
-    class Norm
-      attr_reader :is
-      attr_writer :dirty
-      attr_accessor :bytes
-
-      def dirty?
-        return @dirty
-      end
-
-      def initialize(is, number)
-        @is = is
-        @number = number
-      end
-
-      def re_write(directory, segment, count, cfs_reader)
-        # NOTE: norms are re-written in regular directory, not cfs
-        out = directory.create_output(segment + ".tmp")
-        begin
-          out.write_bytes(@bytes, count)
-        ensure
-          out.close()
-        end
-        if(cfs_reader == nil)
-          file_name = "#{segment}.f#{@number}"
-        else
-          # use a different file name if we have compound format
-          file_name = "#{segment}.s#{@number}"
-        end
-        directory.rename(segment + ".tmp", file_name)
-        @dirty = false
-      end
-    end
-  end
-end
data/lib/ferret/index/segment_term_enum.rb
@@ -1,169 +0,0 @@
-module Ferret::Index
-  class SegmentTermEnum < TermEnum
-
-    INT_MAX = (2**31)-1
-
-    attr_reader :field_infos, :size, :position, :index_pointer,
-                :index_interval, :skip_interval
-
-    def initialize(input, field_infos, is_index)
-
-      @input = input
-      @field_infos = field_infos
-      @is_index = is_index
-      @position = -1
-
-      @term_buffer = TermBuffer.new()
-      @prev_buffer = TermBuffer.new()
-      @term_info = TermInfo.new()
-
-      @index_pointer = 0
-
-      first_int = @input.read_int()
-
-      if (first_int >= 0)
-        # original-format file, without explicit format version number
-        @format = 0
-        @size = first_int
-
-        # back-compatible settings
-        @index_interval = 128
-        @skip_interval = INT_MAX # switch off skip_to optimization
-
-      else
-        # we have a format version number
-        @format = first_int
-
-        # check that it is a format we can understand
-        if (@format < TermInfosWriter::FORMAT)
-          raise "Unknown format version:#{@format}"
-        end
-
-        @size = @input.read_long() # read the size
-
-        if (@format == -1)
-          if (!@is_index)
-            @index_interval = @input.read_int()
-            @format_m1skip_interval = @input.read_int()
-          end
-          # switch off skip_to optimization for file format prior to
-          # 1.4rc2 in order to avoid a bug in skip_to implementation
-          # of these versions
-          @skip_interval = INT_MAX
-        else
-          @index_interval = @input.read_int()
-          @skip_interval = @input.read_int()
-        end
-      end
-    end
-
-    #attr_accessors for the clone method
-    attr_accessor :input, :term_buffer, :prev_buffer
-    protected :input, :input=, :prev_buffer, :prev_buffer=
-
-    def initialize_copy(o)
-      super
-      @input = o.input.clone
-      @term_info = o.term_info.clone
-      @term_buffer = o.term_buffer.clone
-      @prev_buffer = o.prev_buffer.clone
-    end
-
-    def seek(pointer, position, term, term_info)
-      @input.seek(pointer)
-      @position = position
-      @term_buffer.term = term
-      @prev_buffer.reset()
-      @term_info.set!(term_info)
-    end
-
-    # Increments the enumeration to the next element. True if one exists.
-    def next?
-      @position += 1
-      if (@position >= @size)
-        @term_buffer.reset()
-        return false
-      end
-
-      @prev_buffer.set!(@term_buffer)
-
-      @term_buffer.read(@input, @field_infos)
-
-      @term_info.doc_freq = @input.read_vint()       # read doc freq
-      @term_info.freq_pointer += @input.read_vlong() # read freq pointer
-      @term_info.prox_pointer += @input.read_vlong() # read prox pointer
-
-      if (@format == -1)
-        # just read skip_offset in order to increment file pointer
-        # value is never used since skip_to is switched off
-        if (!@is_index)
-          if (@term_info.doc_freq > @format_m1skip_interval)
-            @term_info.skip_offset = @input.read_vint()
-          end
-        end
-      else
-        if (@term_info.doc_freq >= @skip_interval)
-          @term_info.skip_offset = @input.read_vint()
-        end
-      end
-
-      if (@is_index)
-        @index_pointer += @input.read_vlong() # read index pointer
-      end
-
-      return true
-    end
-
-    def scan_to(term)
-      while (term > @term_buffer and next?) do
-      end
-    end
-
-    # Returns the current Term in the enumeration.
-    # Initially invalid, valid after next() called for the first time.
-    def term
-      return @term_buffer.to_term()
-    end
-
-    # Returns the previous Term enumerated. Initially nil.
-    def prev
-      return @prev_buffer.to_term()
-    end
-
-    # Returns the current TermInfo in the enumeration.
-    # Initially invalid, valid after next() called for the first time.
-    def term_info
-      return @term_info.clone
-    end
-
-    # Sets the argument to the current TermInfo in the enumeration.
-    # Initially invalid, valid after next() called for the first time.
-    attr_writer :term_info
-    #def term_info=(ti)
-    #  return @term_info.set!(ti)
-    #end
-
-    # Returns the doc_freq from the current TermInfo in the enumeration.
-    # Initially invalid, valid after next() called for the first time.
-    def doc_freq
-      return term_info.doc_freq
-    end
-
-    # Returns the freq_pointer from the current TermInfo in the enumeration.
-    # Initially invalid, valid after next() called for the first time.
-    def freq_pointer
-      return term_info.freq_pointer
-    end
-
-    # Returns the prox_pointer from the current TermInfo in the enumeration.
-    # Initially invalid, valid after next() called for the first time.
-    def prox_pointer
-      return term_info.prox_pointer
-    end
-
-    # Closes the enumeration to further activity, freeing resources.
-    def close
-      @input.close()
-    end
-  end
-end
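A side note on the header logic in the deleted SegmentTermEnum#initialize above: the first int of the term-infos file does double duty, encoding either the legacy term count (non-negative) or a negative format version. A minimal standalone sketch of that sniffing pattern; the method name read_term_infos_header and the input parameter are illustrative, not names from the gem:

  # Illustrative only: mirrors the branch on first_int in SegmentTermEnum#initialize.
  # `input` is assumed to expose read_int and read_long, like the gem's index input streams.
  def read_term_infos_header(input)
    first_int = input.read_int
    if first_int >= 0
      {:format => 0, :size => first_int}                # pre-versioned file: the int is the size
    else
      {:format => first_int, :size => input.read_long}  # versioned file: the size follows the format
    end
  end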