ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
@@ -0,0 +1,288 @@
|
|
1
|
+
require 'ferret/search/similarity'
|
2
|
+
|
3
|
+
module Ferret
  module Index

    # DocumentWriter inverts a single document into on-disk segment files:
    # it writes the field names (<segment>.fnm), the stored field values,
    # the postings (<segment>.frq / <segment>.prx), optional term vectors
    # and the per-field norms (<segment>.fN).
    class DocumentWriter
      # If non-nil, a message will be printed to this if max_field_length is
      # reached.
      attr_writer :info_stream

      # directory::           The directory to write the document information to
      # analyzer::            The analyzer to use for the document
      # similarity::          The Similarity function (writer.similarity)
      # max_field_length::    The maximum number of tokens a field may have
      #                       (writer.max_field_length)
      # term_index_interval:: The interval of terms in the index
      #                       (writer.term_index_interval)
      def initialize(directory,
                     analyzer,
                     similarity,
                     max_field_length,
                     term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
        @directory = directory
        @analyzer = analyzer
        @similarity = similarity
        @max_field_length = max_field_length
        @term_index_interval = term_index_interval

        # Keys are Terms, values are Postings.
        # Used to buffer a document before it is written to the index.
        @posting_table = {}

        # Reusable scratch term for hash lookups in add_position, so no Term
        # is allocated for tokens that have already been seen.
        @term_buffer = Term.new("", "")
      end

      # Invert +doc+ and write all of its index files for +segment+.
      def add_document(segment, doc)
        # write field names
        @field_infos = FieldInfos.new()
        @field_infos << doc
        @field_infos.write_to_dir(@directory, segment + ".fnm")

        # write field values
        fields_writer = FieldsWriter.new(@directory, segment, @field_infos)
        begin
          fields_writer.add_document(doc)
        ensure
          fields_writer.close()
        end

        # invert doc into posting_table
        @posting_table.clear()                          # clear posting_table
        arr_size = @field_infos.size
        @field_lengths = Array.new(arr_size, 0)         # init field_lengths
        @field_positions = Array.new(arr_size, 0)       # init field_positions
        @field_offsets = Array.new(arr_size, 0)         # init field_offsets
        @field_boosts = Array.new(arr_size, doc.boost)  # init field_boosts

        invert_document(doc)

        # sort posting_table into an array
        postings = sort_posting_table()

        # write postings
        write_postings(postings, segment)

        # write norms of indexed fields
        write_norms(segment)
      end

      private

      # Tokenizes the fields of a document into Postings.
      def invert_document(doc)
        fields = doc.all_fields
        fields.each do |field|
          field_name = field.name
          field_info = @field_infos[field_name]
          field_number = field_info.number

          length = @field_lengths[field_number]     # length of field
          position = @field_positions[field_number] # position in field
          offset = @field_offsets[field_number]     # offset field

          if field_info.indexed?
            if not field.tokenized? # un-tokenized field: index the raw value
              string_value = field.string_value
              if field_info.store_offsets?
                add_position(field_name,
                             string_value,
                             position,
                             TermVectorOffsetInfo.new(offset,
                                                      offset + string_value.length))
                position += 1
              else
                add_position(field_name, string_value, position, nil)
                position += 1
              end
              offset += string_value.length()
              length += 1
            else
              reader = field.reader_value()

              # Tokenize field and add to posting_table
              stream = @analyzer.token_stream(field_name, reader)
              begin
                last_token = nil
                while token = stream.next
                  # position_increment > 1 leaves gaps (e.g. removed stop
                  # words); the +1 below restores the normal advance
                  position += (token.position_increment - 1)

                  if (field_info.store_offsets?())
                    add_position(field_name,
                                 token.term_text(),
                                 position,
                                 TermVectorOffsetInfo.new(
                                   offset + token.start_offset(),
                                   offset + token.end_offset()))
                    position += 1
                  else
                    add_position(field_name, token.term_text(), position, nil)
                    position += 1
                  end

                  last_token = token
                  length += 1
                  if (length > @max_field_length)
                    if @info_stream
                      @info_stream.puts("max_field_length " + @max_field_length.to_s + " reached, ignoring following tokens")
                    end
                    break
                  end
                end

                if (last_token != nil)
                  offset += last_token.end_offset() + 1
                end
              ensure
                stream.close()
              end
            end

            @field_lengths[field_number] = length     # save field length
            @field_positions[field_number] = position # save field position
            @field_boosts[field_number] *= field.boost
            @field_offsets[field_number] = offset
          end
        end
      end

      # Record one occurrence of +text+ in +field+ at +position+.
      # tv_offset_info may be nil when offsets are not stored for the field.
      def add_position(field, text, position, tv_offset_info)
        @term_buffer.set!(field, text)
        posting = @posting_table[@term_buffer]
        if (posting != nil)                  # word seen before
          freq = posting.freq
          posting.positions[freq] = position # add new position
          # add the new offset (nil when offsets are not stored); the
          # original code assigned this twice, once unconditionally and once
          # behind a nil guard - a single assignment is equivalent
          posting.offsets[freq] = tv_offset_info
          posting.freq = freq + 1            # update frequency
        else                                 # word not seen before
          term = Term.new(field, text)
          @posting_table[term] = Posting.new(term, position, tv_offset_info)
        end
      end

      # Copy @posting_table into an array sorted by term.
      def sort_posting_table()
        return @posting_table.values.sort { |x, y| x.term <=> y.term }
      end

      # Write the sorted postings to the .frq/.prx files (and term vectors
      # when a field requests them).
      def write_postings(postings, segment)
        freq = nil
        prox = nil
        tis_writer = nil
        tv_writer = nil
        begin
          # open files for inverse index storage
          freq = @directory.create_output(segment + ".frq")
          prox = @directory.create_output(segment + ".prx")
          tis_writer = TermInfosWriter.new(@directory, segment, @field_infos,
                                           @term_index_interval)
          ti = TermInfo.new()
          current_field = nil

          postings.each do |posting|
            # add an entry to the dictionary with pointers to prox and freq files
            ti.set_values!(1, freq.pos(), prox.pos(), -1)
            tis_writer.add(posting.term, ti)

            # add an entry to the freq file
            posting_freq = posting.freq
            if (posting_freq == 1)          # optimize freq=1
              freq.write_vint(1)            # set low bit of doc num.
            else
              freq.write_vint(0)            # the document number
              freq.write_vint(posting_freq) # frequency in doc
            end

            last_position = 0               # write positions as deltas
            posting.positions.each do |position|
              prox.write_vint(position - last_position)
              last_position = position
            end

            # check to see if we switched to a new field
            term_field = posting.term.field
            if (current_field != term_field)
              # changing field - see if there is something to save
              current_field = term_field
              fi = @field_infos[current_field]
              if (fi.store_term_vector?)
                if tv_writer.nil?
                  # lazily created: only documents with vectored fields pay
                  tv_writer = TermVectorsWriter.new(@directory, segment, @field_infos)
                  tv_writer.open_document()
                end
                tv_writer.open_field(current_field)
              elsif not tv_writer.nil?
                tv_writer.close_field()
              end
            end
            if not tv_writer.nil? and tv_writer.field_open?
              tv_writer.add_term(posting.term.text, posting_freq, posting.positions, posting.offsets)
            end
          end
          if not tv_writer.nil?
            tv_writer.close_document()
          end
        ensure
          # make an effort to close all streams we can but remember and re-raise
          # the last exception encountered in this process
          keep = nil
          [freq, prox, tis_writer, tv_writer].compact.each do |obj|
            begin
              obj.close
            rescue IOError => e
              keep = e
            end
          end
          raise keep if not keep.nil?
        end
      end

      # Write one norm byte per indexed field: boost * length_norm, encoded
      # to a single byte by Similarity.
      def write_norms(segment)
        @field_infos.each_with_index do |fi, i|
          if fi.indexed?
            norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
            norms = @directory.create_output(segment + ".f" + i.to_s)
            begin
              norms.write_byte(Ferret::Search::Similarity.encode_norm(norm))
            ensure
              norms.close()
            end
          end
        end
      end
    end

    # Info about a Term in a doc: its positions and (optional) term-vector
    # offsets, accumulated while the document is inverted.
    class Posting
      attr_accessor :term, :freq, :positions, :offsets

      def initialize(t, position, offset)
        @term = t
        @freq = 1
        @positions = [position]
        @offsets = [offset]
      end
    end
  end
end
@@ -0,0 +1,259 @@
|
|
1
|
+
module Ferret
  module Index
    # Access to the Field Info file that describes document fields and whether or
    # not they are indexed. Each segment has a separate Field Info file. Objects
    # of this class are thread-safe for multiple readers, but only one thread can
    # be adding documents at a time, with no other reader or writer threads
    # accessing this object.
    class FieldInfos

      NOT_A_FIELD = 0xffffffff # -1 in java int

      # Construct a FieldInfos object using the directory and the name of the file
      # InputStream
      #
      # dir:: The directory to open the InputStream from
      # name:: The name of the file to open the InputStream from in the Directory
      def initialize(dir = nil, name = nil)
        @fi_array = []
        @fi_hash = {}
        if dir and dir.exists?(name)
          input = dir.open_input(name)
          begin
            read(input)
          ensure
            input.close()
          end
        end
      end

      # Returns the number of fields that have been added to this field infos
      # object.
      #
      # NOTE: There is a default empty field always added at the start. This
      # may later be used to set the default values for a field.
      #
      # (This method was defined twice in the original source with identical
      # behavior; the dead duplicate has been removed.)
      def size
        return @fi_array.size
      end

      # Automatically adds all of the fields from the document if they haven't
      # been added already. Or it will update the values.
      def add_doc_fields(doc)
        doc.all_fields.each do |field|
          add(field.name,
              field.indexed?,
              field.store_term_vector?,
              field.store_positions?,
              field.store_offsets?)
        end
      end
      alias :<< :add_doc_fields

      # Calls the 5 param add method to add all the names in the collection
      def add_fields(names,
                     indexed = true,
                     store_term_vector = false,
                     store_position = false,
                     store_offset = false)
        names.each do |name|
          add(name, indexed, store_term_vector, store_position, store_offset)
        end
      end

      # If the field is not yet known, adds it. If it is known, checks to make
      # sure that the indexed flag is the same as was given previously for this
      # field. If not - marks it as being indexed. Same goes for the TermVector
      # parameters.
      #
      # name:: The name of the field
      # indexed:: true if the field is indexed
      # store_term_vector:: true if the term vector should be stored
      # store_position:: true if the positions should be stored
      # store_offset:: true if the offsets should be stored
      def add(name,
              indexed = true,
              store_term_vector = false,
              store_position = false,
              store_offset = false)
        fi = @fi_hash[name]
        if (fi == nil)
          fi = add_internal(name, indexed, store_term_vector, store_position, store_offset)
        else
          if (fi.indexed? != indexed)
            fi.indexed = true # once indexed, always index
          end
          if (fi.store_term_vector? != store_term_vector)
            fi.store_term_vector = true # once vector, always vector
          end
          if (fi.store_positions? != store_position)
            fi.store_position = true # once stored, always stored
          end
          if (fi.store_offsets? != store_offset)
            fi.store_offset = true # once stored, always stored
          end
        end
        return fi
      end

      # Returns the number of the field that goes by the field name that is
      # passed. If there is no field of this name then NOT_A_FIELD is returned
      def field_number(name)
        fi = @fi_hash[name]
        return fi ? fi.number : NOT_A_FIELD
      end

      # Retrieve the field_info object by either field number or field name.
      def [](index)
        if index.is_a? Integer
          if index == NOT_A_FIELD || index < 0 # < 0 is for C extensions
            return FieldInfo.new("", false, NOT_A_FIELD, false)
          end
          return @fi_array[index]
        else
          return @fi_hash[index]
        end
      end

      # Return the name of the field with number +index+, or "" for the
      # NOT_A_FIELD sentinel.
      def name(index)
        if index == NOT_A_FIELD || index < 0 # < 0 is for C extensions
          return ""
        end
        return self[index].name
      end

      # Iterate through the field_info objects
      def each()
        @fi_array.each() { |fi| yield(fi) }
      end

      # Iterate through the field_info objects including the index
      def each_with_index()
        @fi_array.each_with_index() { |fi, i| yield(fi, i) }
      end

      # Return true if any of the fields have store_term_vector? set to true
      def has_vectors?()
        @fi_array.each() { |fi| return true if fi.store_term_vector? }
        return false
      end

      # Write the field_infos to a file specified by name in dir.
      #
      # dir:: the directory to write the fieldinfos to
      # name:: the name of the file to write to.
      def write_to_dir(dir, name)
        output = dir.create_output(name)
        begin
          write(output)
        ensure
          output.close()
        end
      end

      protected

      # Write the field_infos to the output file
      #
      # output:: the file to write to
      def write(output)
        output.write_vint(size())
        @fi_array.each() do |fi|
          output.write_string(fi.name)
          output.write_byte(get_field_info_byte(fi))
        end
      end

      # Read the field_infos object from the input file
      #
      # input:: the input file to read from
      def read(input)
        size = input.read_vint() # read in the size
        size.times do |i|
          name = input.read_string()
          bits = input.read_byte()
          indexed = (bits & IS_INDEXED) != 0
          store_term_vector = (bits & STORE_TERM_VECTOR) != 0
          store_position = (bits & STORE_POSITION) != 0
          store_offset = (bits & STORE_OFFSET) != 0
          add_internal(name, indexed, store_term_vector, store_position, store_offset)
        end
      end

      private

      # Bit flags packed into the single byte stored per field on disk.
      IS_INDEXED = 0x1
      STORE_TERM_VECTOR = 0x2
      STORE_POSITION = 0x4
      STORE_OFFSET = 0x8

      # Append a brand-new FieldInfo; its number is its position in @fi_array.
      def add_internal(name, indexed, store_term_vector,
                       store_position = false,
                       store_offset = false)
        fi = FieldInfo.new(name, indexed,
                           @fi_array.size(),
                           store_term_vector,
                           store_position,
                           store_offset)
        @fi_array << fi
        @fi_hash[name] = fi
        return fi
      end

      # Pack a FieldInfo's flags into the on-disk byte (see the bit
      # constants above).
      def get_field_info_byte(fi)
        bits = 0x0
        if (fi.indexed?)
          bits |= IS_INDEXED
        end
        if (fi.store_term_vector?)
          bits |= STORE_TERM_VECTOR
        end
        if (fi.store_positions?)
          bits |= STORE_POSITION
        end
        if (fi.store_offsets?)
          bits |= STORE_OFFSET
        end
        return bits
      end
    end

    # Per-field record: name, number (position in the segment's field list)
    # and the four indexing flags.
    class FieldInfo
      attr_accessor :name, :number
      attr_writer :indexed, :store_term_vector, :store_offset, :store_position

      def indexed?()
        return @indexed
      end

      def store_term_vector?()
        return @store_term_vector
      end

      def store_offsets?()
        return @store_offset
      end

      def store_positions?()
        return @store_position
      end

      # Reset all four flags at once.
      def set!(indexed, store_term_vector, store_position, store_offset)
        @indexed = indexed
        @store_term_vector = store_term_vector
        @store_position = store_position
        @store_offset = store_offset
      end

      def initialize(name, indexed, number, store_term_vector,
                     store_position = false,
                     store_offset = false)
        @name = name
        @number = number
        set!(indexed, store_term_vector, store_position, store_offset)
      end
    end
  end
end