ferret 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
module Ferret
  module Index
    # Abstract base class for enumerating terms.
    #
    # Term enumerations are always ordered by Term.<=>. Each term in
    # the enumeration is greater than all that precede it.
    class TermEnum
      # Increments the enumeration to the next element. Returns true if one
      # exists. Subclasses must implement this.
      def next?
        raise NotImplementedError
      end

      # Returns the current Term in the enumeration. Subclasses must
      # implement this.
      def term
        raise NotImplementedError
      end

      # Returns the doc_freq of the current Term in the enumeration.
      # Subclasses must implement this.
      def doc_freq
        raise NotImplementedError
      end

      # Closes the enumeration to further activity, freeing resources.
      # Subclasses must implement this.
      def close
        raise NotImplementedError
      end

      # Term Vector support.
      # Skips terms to the first beyond the current whose value is
      # greater or equal to +target+.
      #
      # Returns true iff there is such a term.
      #
      # This default implementation is a linear scan; some subclasses are
      # considerably more efficient.
      def skip_to(target)
        # BUG FIX: the parameter used to be named +term+, which shadowed the
        # +term+ method, leaving +target+ in the loop condition undefined
        # (NameError on every call). The parameter is now +target+ so the
        # comparison reads the current term via the method, as the
        # pseudocode above intends.
        while target > term
          return false unless next?
        end
        true
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Ferret
  module Index
    # A TermInfo is the record of information stored for a term: its
    # document frequency plus the file pointers/offsets needed to locate
    # the term's postings data.
    class TermInfo
      attr_accessor :doc_freq, :freq_pointer, :prox_pointer, :skip_offset

      # Creates a TermInfo; every field defaults to zero.
      def initialize(df = 0, fp = 0, pp = 0, so = 0)
        set_values!(df, fp, pp, so)
      end

      # Copies all four values from another TermInfo into this one.
      def set!(other)
        @doc_freq, @freq_pointer, @prox_pointer, @skip_offset =
          other.doc_freq, other.freq_pointer, other.prox_pointer, other.skip_offset
      end

      # Assigns all four values at once; omitted values reset to zero.
      def set_values!(df = 0, fp = 0, pp = 0, so = 0)
        @doc_freq = df
        @freq_pointer = fp
        @prox_pointer = pp
        @skip_offset = so
      end

      # Returns a new TermInfo holding the same values as this one.
      def copy_of
        TermInfo.new(@doc_freq, @freq_pointer, @prox_pointer, @skip_offset)
      end

      # Two TermInfos are equal when the other object is exactly a TermInfo
      # and all four fields match.
      def ==(other)
        other.instance_of?(TermInfo) &&
          @doc_freq == other.doc_freq &&
          @freq_pointer == other.freq_pointer &&
          @prox_pointer == other.prox_pointer &&
          @skip_offset == other.skip_offset
      end
      alias eql? ==

      # Human-readable representation, mainly for debugging.
      def to_s
        "TermInfo:df=#{@doc_freq}:fp=#{@freq_pointer}:pp=#{@prox_pointer}:so=#{@skip_offset}"
      end
    end
  end
end
|
@@ -0,0 +1,312 @@
|
|
1
|
+
require 'monitor'
|
2
|
+
module Ferret::Index
|
3
|
+
|
4
|
+
  # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
  # Directory. A TermInfos can be written once, in order.
  #
  # A TermInfosWriter actually manages a pair of files: the main ".tis"
  # dictionary and a sparser ".tii" index into it. The ".tii" half is a
  # second TermInfosWriter (@other) constructed with is_index = true; the
  # two hold back-references to each other via the +other=+ writer.
  class TermInfosWriter
    attr_reader :index_interval, :skip_interval, :out
    # Back-reference to the paired writer (main <-> index); assigned during
    # construction so each half can reach its counterpart.
    attr_writer :other
    # The file format version, a negative number.
    FORMAT = -2


    # TODO: the default values for these two parameters should be settable
    # from IndexWriter. However, once that's done, folks will start setting
    # them to ridiculous values and complaining that things don't work well,
    # as with mergeFactor. So, let's wait until a number of folks find that
    # alternate values work better. Note that both of these values are
    # stored in the segment, so that it's safe to change these w/o
    # rebuilding all indexes.

    # Expert: The fraction of terms in the "dictionary" which should be
    # stored in RAM. Smaller values use more memory, but make searching
    # slightly faster, while larger values use less memory and make
    # searching slightly slower. Searching is typically not dominated by
    # dictionary lookup, so tweaking this is rarely useful.
    #
    # Expert: The fraction of TermDocEnum entries stored in skip
    # tables, used to accelerate TermDocEnum#skipTo(int). Larger
    # values result in smaller indexes, greater acceleration, but fewer
    # accelerable cases, while smaller values result in bigger indexes, less
    # acceleration and more accelerable cases. More detailed experiments
    # would be useful here.

    # Opens (creates) the output file and writes the file header:
    # format, a placeholder for size (back-filled by #close), and the two
    # intervals. When this is the main (non-index) writer it also creates
    # its ".tii" index counterpart.
    #
    # dir::      the Directory to create the output file in
    # segment::  segment name; the extension (.tis/.tii) is appended
    # fis::      the FieldInfos used to map field names to numbers
    # interval:: the index interval (every interval-th term goes to .tii)
    # is_index:: true when this writer produces the ".tii" index file
    def initialize(dir, segment, fis, interval, is_index = false)
      @index_interval = interval
      @skip_interval = 16
      @last_index_pointer = 0
      @last_term = Term.new("", "")
      @last_term_info = TermInfo.new()
      @size = 0
      @is_index = is_index
      @field_infos = fis
      @out = dir.create_output(segment + (@is_index ? ".tii" : ".tis"))
      @out.write_int(FORMAT)           # write format
      @out.write_long(0)               # leave space for size (seek(4) in #close)
      @out.write_int(@index_interval)  # write @index_interval
      @out.write_int(@skip_interval)   # write @skip_interval
      unless is_index
        @other = TermInfosWriter.new(dir, segment, fis, interval, true)
        @other.other = self
      end
    end

    # Adds a new <Term, TermInfo> pair to the set.
    # Term must be lexicographically greater than all previous Terms added.
    # TermInfo pointers must be positive and greater than all previous.
    #
    # Pointers are delta-encoded against the previous entry, which is why
    # out-of-order input is rejected up front.
    def add(term, term_info)
      if (not @is_index and @last_term > term)
        raise IOError, "term out of order #{term.text} < #{@last_term.text}"
      end
      if (term_info.freq_pointer < @last_term_info.freq_pointer)
        raise IOError, "freq pointer out of order"
      end
      if (term_info.prox_pointer < @last_term_info.prox_pointer)
        raise IOError, "prox pointer out of order"
      end

      # Every @index_interval-th term (counting from 0) is mirrored into
      # the ".tii" index via the paired writer.
      if (not @is_index and @size % @index_interval == 0)
        @other.add(@last_term, @last_term_info) # add an index term
      end

      write_term(term)                    # write term
      @out.write_vint(term_info.doc_freq) # write doc freq
      # deltas against the previously written TermInfo
      @out.write_vlong(term_info.freq_pointer - @last_term_info.freq_pointer)
      @out.write_vlong(term_info.prox_pointer - @last_term_info.prox_pointer)
      # skip data exists only for terms frequent enough to have a skip table
      @out.write_vint(term_info.skip_offset) if (term_info.doc_freq >= @skip_interval)

      if (@is_index)
        # index entries also record a delta-encoded pointer into the
        # main (.tis) writer's output stream
        @out.write_vlong(@other.out.pos() - @last_index_pointer)
        @last_index_pointer = @other.out.pos() # write pointer
      end

      @last_term_info.set!(term_info)
      @size += 1
    end

    # Called to complete TermInfos creation: back-fills the size slot
    # reserved in the header, closes the stream, and closes the paired
    # ".tii" writer when this is the main writer.
    def close()
      @out.seek(4) # write @size after format
      @out.write_long(@size)
      @out.close()

      @other.close() unless @is_index
    end

    private
      # Writes +term+ prefix-compressed against the previously written
      # term: shared-prefix length, delta length, delta chars, then the
      # numeric field id. Also records +term+ as the new @last_term.
      def write_term(term)
        start = Ferret::Utils::StringHelper.string_difference(@last_term.text, term.text)
        length = term.text.length() - start

        @out.write_vint(start)                     # write shared prefix length
        @out.write_vint(length)                    # write delta length
        @out.write_chars(term.text, start, length) # write delta chars
        @out.write_vint(@field_infos.field_number(term.field)) # write field num
        @last_term = term
      end
  end
|
107
|
+
|
108
|
+
|
109
|
+
  # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
  # Directory. Pairs are accessed either by Term or by ordinal position in the
  # set.
  #
  # Reads the ".tis"/".tii" pair written by TermInfosWriter. The ".tii"
  # index is loaded into three parallel arrays (@index_terms, @index_infos,
  # @index_pointers) on first use; lookups binary-search the index and then
  # scan forward within the indexed block. Each thread gets its own clone of
  # the main enumerator, cached in a thread-local keyed by this object's id.
  class TermInfosReader
    include MonitorMixin

    # dir:: the Directory holding the segment files
    # seg:: the segment name (extensions .tis/.tii are appended)
    # fis:: the FieldInfos for decoding field numbers
    def initialize(dir, seg, fis)
      super()

      # reset this thread's cached enumerator for this reader instance
      Thread.current["#{self.object_id}-term_enum"] = nil

      @directory = dir
      @segment = seg
      @field_infos = fis

      @orig_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tis"),
                                       @field_infos, false)
      @size = @orig_enum.size
      @skip_interval = @orig_enum.skip_interval
      @index_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tii"),
                                        @field_infos, true)
      # populated lazily by ensure_index_is_read
      @index_terms = nil
      @index_infos = nil
      @index_pointers = nil
    end

    # Closes the underlying enumerators and drops this thread's cached
    # enumerator. (Note: only the calling thread's cache entry is cleared.)
    def close()
      # clear this threads cache
      Thread.current["#{self.object_id}-term_enum"] = nil

      @orig_enum.close() if (@orig_enum != nil)
      @index_enum.close() if (@index_enum != nil)
    end

    # Returns the number of term/value pairs in the set.
    attr_reader :size
    # The skip interval for the original enumerator
    attr_reader :skip_interval


    # Returns the TermInfo for a Term in the set, or nil.
    def get_term_info(term)
      return nil if (@size == 0)

      ensure_index_is_read()

      # optimize sequential access: first try scanning cached enum w/o seeking
      e = enum()
      if e.term and term >= e.term
        enum_offset = (e.position / e.index_interval).to_i + 1
        if (@index_terms.length == enum_offset or
            term < @index_terms[enum_offset]) # but before end of block
          return scan_for_term_info(term) # no need to seek
        end
      end

      # random-access: must seek
      seek_enum(get_index_offset(term))
      return scan_for_term_info(term)
    end
    alias :[] :get_term_info

    # Returns the nth term in the set, or nil if the set is empty or the
    # position lies beyond the last term.
    def get_term(position)
      return nil if (@size == 0)

      e = enum()
      if (e != nil and
          e.term != nil and
          position >= e.position and
          position < (e.position + e.index_interval))
        return scan_for_term(position) # can avoid seek
      end

      seek_enum((position / e.index_interval).to_i) # must seek
      return scan_for_term(position)
    end

    # Returns the ordinal position of +term+, -1 when absent, or nil when
    # the set is empty.
    def get_terms_position(term)
      return nil if (@size == 0)
      ensure_index_is_read
      seek_enum(get_index_offset(term))

      e = enum()

      # advance until we reach or pass +term+, or run out of terms
      while term > e.term and e.next?
      end

      return term == e.term ? e.position : -1
    end

    # Returns an enumeration of all the Terms and TermInfos in the set.
    def terms()
      return @orig_enum.clone()
    end

    # Returns an enumeration of terms starting at or after the named term.
    def terms_from(term)
      # get_term_info positions the cached enum at (or just past) +term+
      get_term_info(term)
      return enum().clone()
    end

    private

      # Returns this thread's cached enumerator, creating (and caching) a
      # fresh clone of the original enumerator on first use.
      def enum()
        term_enum = Thread.current["#{self.object_id}-term_enum"]
        if (term_enum == nil)
          term_enum = terms()
          # NOTE(review): @xterm_enum appears to exist only to keep a live
          # reference to the most recently created enum on the instance —
          # it is never read here. Presumably a GC/debugging aid; confirm.
          @xterm_enum = Thread.current["#{self.object_id}-term_enum"] = term_enum
        end
        return term_enum
      end

      # Lazily loads the ".tii" index into the three parallel arrays.
      # Synchronized so concurrent readers load it once; the index enum is
      # closed and discarded afterwards (even on failure, via ensure).
      def ensure_index_is_read()
        synchronize() do
          return if @index_terms
          begin
            index_size = @index_enum.size

            @index_terms = Array.new(index_size)
            @index_infos = Array.new(index_size)
            @index_pointers = Array.new(index_size)

            i = 0
            while @index_enum.next?
              @index_terms[i] = @index_enum.term
              @index_infos[i] = @index_enum.term_info
              @index_pointers[i] = @index_enum.index_pointer
              i += 1
            end
          ensure
            @index_enum.close()
            @index_enum = nil
          end
        end
      end

      # Returns the offset of the greatest index entry which is less than or
      # equal to term. (Returns -1 when term sorts before every entry.)
      def get_index_offset(term)
        lo = 0 # binary search @index_terms[]
        hi = @index_terms.length - 1

        while (hi >= lo)
          mid = (lo + hi) >> 1
          delta = term <=> @index_terms[mid]
          if (delta < 0)
            hi = mid - 1
          elsif (delta > 0)
            lo = mid + 1
          else
            return mid
          end
        end
        return hi
      end

      # Positions this thread's enumerator at index entry +ind_offset+,
      # restoring file pointer, ordinal position, term and term info.
      def seek_enum(ind_offset)
        enum().seek(@index_pointers[ind_offset],
                    (ind_offset * enum().index_interval) - 1,
                    @index_terms[ind_offset],
                    @index_infos[ind_offset])
      end

      # Scans within block for matching term; returns its TermInfo or nil.
      def scan_for_term_info(term)
        e = enum()
        e.scan_to(term)
        if e.term != nil and term == e.term
          return e.term_info()
        else
          return nil
        end
      end

      # Advances the enumerator to ordinal +position+ and returns the term
      # there, or nil if the enumeration is exhausted first.
      def scan_for_term(position)
        e = enum()
        while (e.position < position)
          return nil if not e.next?
        end

        return e.term
      end

      # Returns the position of a Term in the set or -1.
      def get_position(term)
        return -1 if (@size == 0)

        ind_offset = get_index_offset(term)
        seek_enum(ind_offset)

        e = enum()
        while (term > e.term and e.next?)
        end

        if (term == e.term())
          return e.position
        else
          return -1
        end
      end

  end
|
312
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Ferret
  module Index
    # Records the start and end character offsets of a term occurrence,
    # as stored alongside term vectors.
    class TermVectorOffsetInfo
      attr_accessor :start_offset, :end_offset

      def initialize(start_offset, end_offset)
        @start_offset = start_offset
        @end_offset = end_offset
      end

      # Equality: the other object is exactly a TermVectorOffsetInfo and
      # both offsets match.
      def eql?(other)
        other.instance_of?(TermVectorOffsetInfo) &&
          @start_offset == other.start_offset &&
          @end_offset == other.end_offset
      end
      alias :== :eql?

      # Hash code consistent with #eql?, so instances can act as Hash keys.
      def hash
        29 * @start_offset + @end_offset
      end
    end
  end
end
|
@@ -0,0 +1,552 @@
|
|
1
|
+
module Ferret::Index
|
2
|
+
# Writer works by opening a document and then opening the fields within
|
3
|
+
# the document and then writing out the vectors for each field.
|
4
|
+
#
|
5
|
+
# Rough usage:
|
6
|
+
#
|
7
|
+
# for each document
|
8
|
+
#
|
9
|
+
# writer.open_document()
|
10
|
+
# for each field on the document
|
11
|
+
#
|
12
|
+
# writer.open_field(field)
|
13
|
+
# for all of the @terms
|
14
|
+
#
|
15
|
+
# writer.add_term(...)
|
16
|
+
# end
|
17
|
+
# writer.close_field
|
18
|
+
# end
|
19
|
+
# writer.close_document()
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
#
|
23
|
+
class TermVectorsWriter
|
24
|
+
STORE_POSITIONS_WITH_TERMVECTOR = 0x1
|
25
|
+
STORE_OFFSET_WITH_TERMVECTOR = 0x2
|
26
|
+
|
27
|
+
FORMAT_VERSION = 2
|
28
|
+
|
29
|
+
# The size in bytes that the FORMAT_VERSION will take up at the beginning
|
30
|
+
# of each file
|
31
|
+
FORMAT_SIZE = 4
|
32
|
+
|
33
|
+
TVX_EXTENSION = ".tvx"
|
34
|
+
TVD_EXTENSION = ".tvd"
|
35
|
+
TVF_EXTENSION = ".tvf"
|
36
|
+
|
37
|
+
def initialize(directory, segment, field_infos)
|
38
|
+
@current_field = nil
|
39
|
+
@current_doc_pointer = -1
|
40
|
+
|
41
|
+
# Open files for TermVector storage
|
42
|
+
@tvx = directory.create_output(segment + TVX_EXTENSION)
|
43
|
+
@tvx.write_int(FORMAT_VERSION)
|
44
|
+
@tvd = directory.create_output(segment + TVD_EXTENSION)
|
45
|
+
@tvd.write_int(FORMAT_VERSION)
|
46
|
+
@tvf = directory.create_output(segment + TVF_EXTENSION)
|
47
|
+
@tvf.write_int(FORMAT_VERSION)
|
48
|
+
|
49
|
+
@field_infos = field_infos
|
50
|
+
@fields = []
|
51
|
+
@terms = []
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
def open_document()
|
56
|
+
close_document()
|
57
|
+
@current_doc_pointer = @tvd.pos()
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
def close_document()
|
62
|
+
|
63
|
+
if (document_open?())
|
64
|
+
close_field()
|
65
|
+
write_doc()
|
66
|
+
@fields.clear()
|
67
|
+
@current_doc_pointer = -1
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
def document_open?()
|
73
|
+
return @current_doc_pointer != -1
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
# Start processing a field. This can be followed by a number of calls to
|
78
|
+
# add_term, and a final call to close_field to indicate the end of
|
79
|
+
# processing of this field. If a field was previously open, it is closed
|
80
|
+
# automatically.
|
81
|
+
def open_field(field)
|
82
|
+
field_info = @field_infos[field]
|
83
|
+
create_field(field_info.number,
|
84
|
+
field_info.store_positions?,
|
85
|
+
field_info.store_offsets?)
|
86
|
+
end
|
87
|
+
|
88
|
+
# Finished processing current field. This should be followed by a call
|
89
|
+
# to open_field before future calls to add_term.
|
90
|
+
def close_field()
|
91
|
+
if field_open?
|
92
|
+
#puts("close_field()")
|
93
|
+
|
94
|
+
# save field and @terms
|
95
|
+
write_field()
|
96
|
+
@fields << @current_field
|
97
|
+
@terms.clear()
|
98
|
+
@current_field = nil
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
# Return true if a field is currently open.
|
103
|
+
def field_open?()
|
104
|
+
return @current_field != nil
|
105
|
+
end
|
106
|
+
|
107
|
+
# Add term to the field's term vector. Field must already be open.
|
108
|
+
#
|
109
|
+
# Terms should be added in increasing order of @terms, one call per
|
110
|
+
# unique termNum. ProxPointer is a pointer into the TermPosition file
|
111
|
+
# (prx). Freq is the number of times this term appears in this field, in
|
112
|
+
# this document. raises:: IllegalStateException if document or field is
|
113
|
+
# not open
|
114
|
+
def add_term(term_text, freq, positions = nil, offsets = nil)
|
115
|
+
if not document_open?
|
116
|
+
raise IllegalStateError, "Cannot add terms when document is not open"
|
117
|
+
end
|
118
|
+
if not field_open?
|
119
|
+
raise IllegalStateError, "Cannot add terms when field is not open"
|
120
|
+
end
|
121
|
+
|
122
|
+
add_term_internal(term_text, freq, positions, offsets)
|
123
|
+
end
|
124
|
+
|
125
|
+
def add_term_internal(term_text, freq, positions, offsets)
|
126
|
+
@terms << TVTerm.new(term_text, freq, positions, offsets)
|
127
|
+
end
|
128
|
+
|
129
|
+
# Add a complete document specified by all its term vectors. If document has no
|
130
|
+
# term vectors, add value for @tvx.
|
131
|
+
#
|
132
|
+
# vectors:: The documents to have their term vectors added
|
133
|
+
# raises:: IOException
|
134
|
+
def add_all_doc_vectors(vectors)
|
135
|
+
|
136
|
+
open_document()
|
137
|
+
|
138
|
+
if vectors != nil
|
139
|
+
vectors.each do |vector|
|
140
|
+
store_positions = (vector.size > 0 and vector.positions != nil)
|
141
|
+
store_offsets = (vector.size > 0 and vector.offsets != nil)
|
142
|
+
|
143
|
+
create_field(@field_infos.field_number(vector.field),
|
144
|
+
store_positions, store_offsets)
|
145
|
+
|
146
|
+
vector.size.times do |j|
|
147
|
+
add_term_internal(vector.terms[j],
|
148
|
+
vector.term_frequencies[j],
|
149
|
+
store_positions ? vector.positions[j] : nil,
|
150
|
+
store_offsets ? vector.offsets[j] : nil)
|
151
|
+
end
|
152
|
+
close_field()
|
153
|
+
end
|
154
|
+
end
|
155
|
+
close_document()
|
156
|
+
end
|
157
|
+
|
158
|
+
# Close all streams.
|
159
|
+
def close()
|
160
|
+
begin
|
161
|
+
close_document()
|
162
|
+
ensure
|
163
|
+
# make an effort to close all streams we can but remember and re-raise
|
164
|
+
# the last exception encountered in this process
|
165
|
+
keep = nil
|
166
|
+
[@tvx, @tvd, @tvf].compact.each do |os|
|
167
|
+
begin
|
168
|
+
os.close()
|
169
|
+
rescue IOError => e
|
170
|
+
keep = e
|
171
|
+
end
|
172
|
+
end
|
173
|
+
raise keep if (keep != nil)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
class TVField
|
178
|
+
attr_accessor :number, :tvf_pointer, :store_positions, :store_offsets
|
179
|
+
def initialize(number, store_pos, store_off)
|
180
|
+
@tvf_pointer = 0
|
181
|
+
@number = number
|
182
|
+
@store_positions = store_pos
|
183
|
+
@store_offsets = store_off
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
class TVTerm
|
188
|
+
attr_accessor :term_text, :freq, :positions, :offsets
|
189
|
+
|
190
|
+
def initialize(term_text=nil, freq=nil, positions=nil, offsets=nil)
|
191
|
+
@term_text = term_text
|
192
|
+
@freq = freq
|
193
|
+
@positions = positions
|
194
|
+
@offsets = offsets
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
private

# Flushes the buffered term vector of the current field to the tvf
# stream.  Records where the field's data starts (tvf_pointer), then
# writes the term count, a flags byte saying whether positions/offsets
# follow, and each term front-coded against the previous term's text
# (shared-prefix length, suffix length, suffix chars) followed by its
# frequency and, when enabled, delta-encoded positions and offsets.
def write_field()
  # remember where this field's data begins in the tvf file
  @current_field.tvf_pointer = @tvf.pos

  @tvf.write_vint(@terms.size)

  has_positions = @current_field.store_positions
  has_offsets = @current_field.store_offsets
  flags = 0x0
  flags |= STORE_POSITIONS_WITH_TERMVECTOR if has_positions
  flags |= STORE_OFFSET_WITH_TERMVECTOR if has_offsets
  @tvf.write_byte(flags)

  prev_text = ""
  @terms.each do |term|
    text = term.term_text
    prefix_len = Ferret::Utils::StringHelper.string_difference(prev_text, text)
    suffix_len = text.length() - prefix_len
    @tvf.write_vint(prefix_len)                   # shared prefix length
    @tvf.write_vint(suffix_len)                   # delta length
    @tvf.write_chars(text, prefix_len, suffix_len) # delta chars
    @tvf.write_vint(term.freq)
    prev_text = text

    if has_positions
      if term.positions == nil
        raise IllegalStateError, "Trying to write positions that are nil!"
      end

      # positions are stored as deltas from the previous position
      last_pos = 0
      term.freq.times do |j|
        @tvf.write_vint(term.positions[j] - last_pos)
        last_pos = term.positions[j]
      end
    end

    if has_offsets
      if term.offsets == nil
        raise IllegalStateError, "Trying to write offsets that are nil!"
      end

      # offsets are stored as deltas: the start relative to the previous
      # end_offset, then the token length (end - start)
      last_end = 0
      term.freq.times do |j|
        off = term.offsets[j]
        @tvf.write_vint(off.start_offset - last_end)
        @tvf.write_vint(off.end_offset - off.start_offset)
        last_end = off.end_offset()
      end
    end
  end
end
# Writes the index entry (tvx) and the document record (tvd) for the
# document whose fields have been accumulated in @fields.  The record
# holds the field count, the field numbers, and delta-encoded pointers
# into the tvf file.  All fields must have been closed before this is
# called.
def write_doc()
  if field_open?
    raise IllegalStateError, "Field is still open while writing document"
  end

  # document index record: where this document's data starts in tvd
  @tvx.write_long(@current_doc_pointer)

  # document data record: the number of fields, ...
  @tvd.write_vint(@fields.size)

  # ... their field numbers, ...
  @fields.each { |f| @tvd.write_vint(f.number) }

  # ... and their tvf pointers, delta-encoded against the previous one
  prev_pointer = 0
  @fields.each do |f|
    @tvd.write_vlong(f.tvf_pointer - prev_pointer)
    prev_pointer = f.tvf_pointer
  end
end
# Starts buffering a new field's term vector.  Any field still open is
# flushed first; the new TVField records whether positions and offsets
# are to be stored alongside the terms.  A document must be open.
def create_field(field_number, store_position, store_offset)
  unless document_open?
    raise IllegalStateError, "Cannot open field when no document is open."
  end

  close_field()
  @current_field = TVField.new(field_number, store_position, store_offset)
end
end
|
294
|
+
|
295
|
+
# Reads the term vectors written by TermVectorsWriter for one segment.
# Three streams are involved:
#
# tvx:: the index file; one long per document pointing into tvd
# tvd:: per-document records holding the field numbers and pointers
#       into tvf for each vectorized field
# tvf:: the per-field term vector data (terms, freqs and, format
#       permitting, positions and offsets)
class TermVectorsReader
  attr_reader :size

  # accessors for clone method
  attr_accessor :tvx, :tvd, :tvf
  protected :tvx, :tvx=, :tvd, :tvd=, :tvf, :tvf=


  # Opens the term vector streams for +segment+ in directory +d+.  If
  # the segment stores no term vectors (no tvx file), all three streams
  # are left nil and the get_* methods return nil.
  #
  # d::           the directory to read from
  # segment::     the segment name the files are prefixed with
  # field_infos:: field name/number lookup used to resolve fields
  # raises::      IOError when a stream's format is too new
  def initialize(d, segment, field_infos)

    if (d.exists?(segment + TermVectorsWriter::TVX_EXTENSION))
      @tvx = d.open_input(segment + TermVectorsWriter::TVX_EXTENSION)
      check_valid_format(@tvx)
      @tvd = d.open_input(segment + TermVectorsWriter::TVD_EXTENSION)
      @tvd_format = check_valid_format(@tvd)
      @tvf = d.open_input(segment + TermVectorsWriter::TVF_EXTENSION)
      @tvf_format = check_valid_format(@tvf)
      # one long (document pointer) per document in the tvx file
      @size = @tvx.length / 8
    else
      @tvx = nil
      @tvd = nil
      @tvf = nil
    end

    @field_infos = field_infos
  end

  # Closes all open streams.  Makes an effort to close every stream even
  # when one raises, re-raising the last IOError encountered afterwards.
  def close()
    keep = nil
    [@tvx, @tvd, @tvf].compact.each do |os|
      begin
        os.close()
      rescue IOError => e
        keep = e
      end
    end
    raise keep if (keep != nil)
  end

  # Retrieve the term vector for the given document and field
  # doc_num:: The document number to retrieve the vector for
  # field:: The field within the document to retrieve
  # returns:: The TermFreqVector for the document and field or nil if there
  # is no termVector for this field.
  # raises:: IOException if there is an error reading the term vector files
  def get_field_tv(doc_num, field)
    # Check if no term vectors are available for this segment at all
    field_number = @field_infos.field_number(field)
    result = nil
    if (@tvx != nil)
      # We need to account for the FORMAT_SIZE when seeking in the tvx
      # file; the pointers stored in the other files already include it.
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()
      # There are only a few fields per document. We opt for a full scan
      # rather than requiring that they be ordered. We need to read through
      # all of the fields anyway to get to the tvf pointers.
      number = 0
      found = -1
      field_count.times do |i|
        if @tvd_format == TermVectorsWriter::FORMAT_VERSION
          number = @tvd.read_vint()
        else
          # older formats delta-encoded the field numbers
          number += @tvd.read_vint()
        end
        if (number == field_number)
          found = i
        end
      end

      # This field, although valid in the segment, was not found in this
      # document
      if (found != -1)
        # Compute position in the tvf file by summing the pointer deltas
        # up to and including the found field
        position = 0
        (found + 1).times do
          position += @tvd.read_vlong()
        end

        result = read_term_vector(field, position)
      end
    end
    return result
  end

  # Return all term vectors stored for this document or nil if it could
  # not be read in.
  #
  # doc_num:: The document number to retrieve the vector for
  # returns:: All term frequency vectors
  # raises:: IOException if there is an error reading the term vector files
  def get_tv(doc_num)
    result = nil
    # Check if no term vectors are available for this segment at all
    if (@tvx != nil)
      # account for the FORMAT_SIZE header when seeking in the tvx file
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()

      # No fields are vectorized for this document
      if (field_count != 0)
        number = 0
        fields = Array.new(field_count)

        field_count.times do |i|
          if @tvd_format == TermVectorsWriter::FORMAT_VERSION
            number = @tvd.read_vint()
          else
            # older formats delta-encoded the field numbers
            number += @tvd.read_vint()
          end

          fields[i] = @field_infos[number].name
        end

        # Compute position in the tvf file for each field; pointers are
        # delta-encoded against the previous field's pointer
        position = 0
        tvf_pointers = Array.new(field_count)
        field_count.times do |i|
          position += @tvd.read_vlong()
          tvf_pointers[i] = position
        end

        result = read_term_vectors(fields, tvf_pointers)
      end
    end
    return result
  end

  # Returns a copy of this reader with its own clones of the underlying
  # streams so the copy can seek independently, or nil if the streams
  # are not open.
  #
  # FIX: the previous implementation did `clone = self` and then mutated
  # this reader's own streams, returning self instead of a copy.  Using
  # +super+ (Object#clone) yields a genuinely separate instance whose
  # streams are then replaced with clones.
  def clone()

    if (@tvx == nil or @tvd == nil or @tvf == nil)
      return nil
    end

    copy = super
    copy.tvx = @tvx.clone()
    copy.tvd = @tvd.clone()
    copy.tvf = @tvf.clone()

    return copy
  end

  private

  # Reads one term vector per entry in +fields+, starting each read at
  # the corresponding pointer in +tvf_pointers+.
  def read_term_vectors(fields, tvf_pointers)

    res = Array.new(fields.length)
    fields.length.times do |i|
      res[i] = read_term_vector(fields[i], tvf_pointers[i])
    end
    return res
  end

  # field:: The field to read in
  # tvf_pointer:: The pointer within the @tvf file where we should start reading
  # returns:: The TermVector located at that position
  # raises:: IOException
  def read_term_vector(field, tvf_pointer)
    # Now read the data from specified position
    # We don't need to offset by the FORMAT here since the pointer
    # already includes the offset
    @tvf.seek(tvf_pointer)

    num_terms = @tvf.read_vint()
    # If no terms - return a constant empty termvector. However, this should
    # never occur!
    if (num_terms == 0)
      return SegmentTermVector.new(field, nil, nil)
    end

    if (@tvf_format == TermVectorsWriter::FORMAT_VERSION)
      bits = @tvf.read_byte()
      store_positions = (bits & TermVectorsWriter::STORE_POSITIONS_WITH_TERMVECTOR) != 0
      store_offsets = (bits & TermVectorsWriter::STORE_OFFSET_WITH_TERMVECTOR) != 0
    else
      # older formats wrote a vint here instead of a flags byte and never
      # stored positions or offsets
      @tvf.read_vint()
      store_positions = false
      store_offsets = false
    end

    terms = Array.new(num_terms)
    term_freqs = Array.new(num_terms)

    # only allocated when the format says the data is present
    positions = store_positions ? Array.new(num_terms) : nil
    offsets = store_offsets ? Array.new(num_terms) : nil

    # terms are front-coded against the previous term, sharing +buffer+
    buffer = ""
    num_terms.times do |i|
      start = @tvf.read_vint()        # shared prefix length
      delta_length = @tvf.read_vint() # suffix length
      total_length = start + delta_length
      @tvf.read_chars(buffer, start, delta_length)
      terms[i] = buffer[0, total_length].to_s
      freq = @tvf.read_vint()
      term_freqs[i] = freq

      if (store_positions) # read in the positions, delta-decoded
        pos = Array.new(freq)
        positions[i] = pos
        prev_position = 0
        freq.times do |j|
          pos[j] = prev_position + @tvf.read_vint()
          prev_position = pos[j]
        end
      end

      if (store_offsets) # offsets: delta-coded start, then token length
        offs = Array.new(freq)
        offsets[i] = offs
        prev_offset = 0
        freq.times do |j|
          start_offset = prev_offset + @tvf.read_vint()
          end_offset = start_offset + @tvf.read_vint()
          offs[j] = TermVectorOffsetInfo.new(start_offset, end_offset)
          prev_offset = end_offset
        end
      end
    end

    SegmentTermVector.new(field, terms, term_freqs, positions, offsets)
  end

  # Reads and returns the stream's leading format int, raising IOError
  # when the on-disk format is newer than this code understands.
  def check_valid_format(istream)
    format = istream.read_int()
    if (format > TermVectorsWriter::FORMAT_VERSION)
      raise IOError, "Incompatible format version: #{format} expected #{TermVectorsWriter::FORMAT_VERSION} or less"
    end
    return format
  end

end
|
552
|
+
end
|