ferret 0.1.0
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
data/lib/ferret/document/field.rb
ADDED
@@ -0,0 +1,304 @@
+module Ferret::Document
+  # A field is a section of a Document. Each field has two parts, a name
+  # and a value. Values may be free text, provided as a String or as a
+  # Reader, or they may be atomic keywords, which are not further processed.
+  # Such keywords may be used to represent dates, URLs, etc. Fields are
+  # optionally stored in the index, so that they may be returned with hits
+  # on the document.
+  class Field
+
+    # This value will be
+    # multiplied into the score of all hits on this field of this
+    # document.
+    #
+    # The boost is multiplied by Document#boost of the document
+    # containing this field. If a document has multiple fields with the same
+    # name, all such values are multiplied together. This product is then
+    # multiplied by the value Similarity#length_norm(String,int), and
+    # rounded by Similarity#encode_norm(float) before it is stored in the
+    # index. One should attempt to ensure that this product does not overflow
+    # the range of that encoding.
+    #
+    # See Document#set_boost(float)
+    # See Similarity#length_norm(String, int)
+    # See Similarity#encode_norm(float)
+    #
+    # Note: this value is not stored directly with the document in the index.
+    # Documents returned from IndexReader#document(int) and
+    # Hits#doc(int) may thus not have the same value present as when this field
+    # was indexed.
+    attr_accessor :boost, :data
+
+    attr_reader :name
+
+    # True iff the value of the field is to be stored in the index for
+    # return with search hits. It is an error for this to be true if a
+    # field is Reader-valued.
+    def stored?() return @stored end
+
+    # True iff the value of the field is to be indexed, so that it may be
+    # searched on.
+    def indexed?() return @indexed end
+
+    # True iff the value of the field should be tokenized as text prior to
+    # indexing. Un-tokenized fields are indexed as a single word and may
+    # not be Reader-valued.
+    def tokenized?() return @tokenized end
+
+    # True if the field is to be stored as a binary value. This can be used
+    # to store images or other binary data in the index if you wish.
+    def binary?() return @binary end
+
+    # True if you want to compress the data that you store. This is a good
+    # idea for really large text fields. The Ruby Zlib library is used to do
+    # the compression.
+    def compressed?() return @compressed end
+
+    # True iff the term or terms used to index this field are stored as a
+    # term vector, available from IndexReader#term_freq_vector(). These
+    # methods do not provide access to the original content of the field,
+    # only to terms used to index it. If the original content must be
+    # preserved, use the _stored_ attribute instead.
+    #
+    # See IndexReader#term_freq_vector()
+    def store_term_vector?() return @store_term_vector end
+
+    # True if the positions of the indexed terms in this field are stored.
+    def store_positions?() return @store_position end
+
+    # True if the offsets of this field are stored. The offsets are the
+    # positions of the start and end characters of the token in the whole
+    # field string.
+    def store_offsets?() return @store_offset end
+
+    class Store < Ferret::Utils::Parameter
+      # Store the original field value in the index in a compressed form.
+      # This is useful for long documents and for binary valued fields.
+      COMPRESS = Store.new("COMPRESS")
+
+      # Store the original field value in the index. This is useful for
+      # short texts like a document's title which should be displayed with
+      # the results. The value is stored in its original form, i.e. no
+      # analyzer is used before it is stored.
+      YES = Store.new("YES")
+
+      # Do not store the field value in the index.
+      NO = Store.new("NO")
+    end
+
+    class Index < Ferret::Utils::Parameter
+      # Do not index the field value. Such a field cannot be searched,
+      # but its contents can still be accessed as long as the field is
+      # stored (see Store).
+      NO = Index.new("NO")
+
+      # Index the field's value so it can be searched. An Analyzer will be
+      # used to tokenize and possibly further normalize the text before its
+      # terms will be stored in the index. This is useful for common text.
+      TOKENIZED = Index.new("TOKENIZED")
+
+      # Index the field's value without using an Analyzer, so it can be
+      # searched. As no analyzer is used the value will be stored as a
+      # single term. This is useful for unique IDs like product numbers.
+      UNTOKENIZED = Index.new("UNTOKENIZED")
+    end
+
+    class TermVector < Ferret::Utils::Parameter
+      # Do not store term vectors.
+      NO = TermVector.new("NO")
+
+      # Store the term vectors of each document. A term vector is a list of
+      # the document's terms and their number of occurrences in that
+      # document.
+      YES = TermVector.new("YES")
+
+      # Store the term vector + token position information
+      #
+      # See #YES
+      WITH_POSITIONS = TermVector.new("WITH_POSITIONS")
+
+      # Store the term vector + token offset information
+      #
+      # See #YES
+      WITH_OFFSETS = TermVector.new("WITH_OFFSETS")
+
+      # Store the term vector + token position and offset information
+      #
+      # See #YES, #WITH_POSITIONS and #WITH_OFFSETS
+      WITH_POSITIONS_OFFSETS = TermVector.new("WITH_POSITIONS_OFFSETS")
+    end
+
+    # Create a field by specifying its name, value and how it will
+    # be saved in the index.
+    #
+    # name::  The name of the field
+    # value:: The string to process
+    # store:: Whether _value_ should be stored in the index
+    # index:: Whether the field should be indexed, and if so, if it should
+    #         be tokenized before indexing
+    # store_term_vector:: Whether a term vector should be stored.
+    #                     An ArgumentError is raised if:
+    # * the field is neither stored nor indexed
+    # * the field is not indexed but term_vector is _TermVector::YES_
+    #
+    # binary:: Whether you want to store binary data in this field. Default is
+    #          false
+    # boost::  The boost for this field. Default is 1.0. A larger number makes
+    #          this field more important.
+    def initialize(name,
+                   value,
+                   stored = Store::YES,
+                   index = Index::UNTOKENIZED,
+                   store_term_vector = TermVector::NO,
+                   binary = false,
+                   boost = 1.0)
+      if (index == Index::NO and stored == Store::NO)
+        raise ArgumentError, "it doesn't make sense to have a field that " +
+          "is neither indexed nor stored"
+      end
+      if (index == Index::NO && store_term_vector != TermVector::NO)
+        raise ArgumentError, "cannot store term vector information for a " +
+          "field that is not indexed"
+      end
+
+      # The name of the field (e.g., "date", "subject", "title", or "body")
+      @name = name
+
+      # the one and only data object for all different kinds of field values
+      @data = value
+      self.stored = stored
+      self.index = index
+      self.store_term_vector = store_term_vector
+      @binary = binary
+      @boost = boost
+    end
+
+    def stored=(stored)
+      if (stored == Store::YES)
+        @stored = true
+        @compressed = false
+      elsif (stored == Store::COMPRESS)
+        @stored = true
+        @compressed = true
+      elsif (stored == Store::NO)
+        @stored = false
+        @compressed = false
+      else
+        raise "unknown stored parameter " + stored.to_s
+      end
+    end
+
+    def index=(index)
+      if (index == Index::NO)
+        @indexed = false
+        @tokenized = false
+      elsif (index == Index::TOKENIZED)
+        @indexed = true
+        @tokenized = true
+      elsif (index == Index::UNTOKENIZED)
+        @indexed = true
+        @tokenized = false
+      else
+        raise "unknown index parameter " + index.to_s
+      end
+    end
+
+    def store_term_vector=(store_term_vector)
+      if (store_term_vector == TermVector::NO)
+        @store_term_vector = false
+        @store_position = false
+        @store_offset = false
+      elsif (store_term_vector == TermVector::YES)
+        @store_term_vector = true
+        @store_position = false
+        @store_offset = false
+      elsif (store_term_vector == TermVector::WITH_POSITIONS)
+        @store_term_vector = true
+        @store_position = true
+        @store_offset = false
+      elsif (store_term_vector == TermVector::WITH_OFFSETS)
+        @store_term_vector = true
+        @store_position = false
+        @store_offset = true
+      elsif (store_term_vector == TermVector::WITH_POSITIONS_OFFSETS)
+        @store_term_vector = true
+        @store_position = true
+        @store_offset = true
+      else
+        raise "unknown term_vector parameter " + store_term_vector.to_s
+      end
+    end
+
+    # Returns the string value of the data that is stored in this field
+    def string_value
+      if @data.instance_of? String
+        return @data
+      elsif @data.respond_to? :read
+        return @data.read()
+      else
+        # if it is a binary object try to return a string representation
+        return @data.to_s
+      end
+    end
+
+    # If the data is stored as binary, just return it.
+    def binary_value
+      return @data
+    end
+
+    # Returns the data that is stored in this field as a Reader
+    def reader_value
+      if @data.respond_to? :read
+        return @data
+      elsif @data.instance_of? String
+        return Ferret::Utils::StringHelper::StringReader.new(@data)
+      else
+        # if it is a binary object try to return a string representation
+        return Ferret::Utils::StringHelper::StringReader.new(@data.to_s)
+      end
+    end
+
+    # Create a stored field with a binary value. Optionally the value
+    # may be compressed. But it obviously won't be tokenized or
+    # term vectored or anything like that.
+    #
+    # name::   The name of the field
+    # value::  The binary value
+    # stored:: How _value_ should be stored (compressed or not)
+    def Field.new_binary_field(name, value, stored)
+      if (stored == Store::NO)
+        raise ArgumentError, "binary values can't be unstored"
+      end
+      Field.new(name, value, stored, Index::NO, TermVector::NO, true)
+    end
+
+    # Prints a Field for human consumption.
+    def to_s()
+      str = ""
+      if (@stored)
+        str << "stored"
+        str << (@compressed ? "/compressed," : "/uncompressed,")
+      end
+      if (@indexed) then str << "indexed," end
+      if (@tokenized) then str << "tokenized," end
+      if (@store_term_vector) then str << "store_term_vector," end
+      if (@store_offset)
+        str << "term_vector_offsets,"
+      end
+      if (@store_position)
+        str << "term_vector_position,"
+      end
+      if (@binary) then str << "binary," end
+
+      str << '<'
+      str << @name
+      str << ':'
+
+      if (@data != nil)
+        str << @data.to_s
+      end
+
+      str << '>'
+    end
+  end
+end
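For orientation, the Field API added above can be exercised as follows. This is a minimal sketch, not part of the gem: the field names and values are invented for illustration, and it assumes the gem is loaded via require 'ferret'.

  require 'ferret'
  include Ferret::Document

  # A short title: stored as-is and tokenized for search.
  title = Field.new("title", "An Indexing Library",
                    Field::Store::YES, Field::Index::TOKENIZED)

  # A large body: stored compressed (via Zlib), with full term vectors.
  body = Field.new("body", "Ferret is a full-text search library for Ruby.",
                   Field::Store::COMPRESS, Field::Index::TOKENIZED,
                   Field::TermVector::WITH_POSITIONS_OFFSETS)

  # An atomic keyword: indexed as a single term, useful for unique ids.
  id = Field.new("id", "A-1234", Field::Store::YES, Field::Index::UNTOKENIZED)

  # Binary values must be stored and are never indexed or tokenized.
  logo = Field.new_binary_field("logo", File.read("logo.png"),
                                Field::Store::COMPRESS)

  puts title.to_s  # => stored/uncompressed,indexed,tokenized,<title:An Indexing Library>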
data/lib/ferret/index.rb
ADDED
@@ -0,0 +1,26 @@
+require 'ferret/index/index_file_names'
+require 'ferret/index/term'
+require 'ferret/index/term_buffer'
+require 'ferret/index/term_doc_enum'
+require 'ferret/index/multiple_term_doc_pos_enum'
+require 'ferret/index/term_enum'
+require 'ferret/index/term_info'
+require 'ferret/index/term_infos_io'
+require 'ferret/index/term_vector_offset_info'
+require 'ferret/index/term_vectors_io'
+require 'ferret/index/field_infos'
+require 'ferret/index/fields_io'
+require 'ferret/index/compound_file_io'
+require 'ferret/index/term_buffer'
+require 'ferret/index/segment_term_enum'
+require 'ferret/index/segment_term_vector'
+require 'ferret/index/segment_merge_info'
+require 'ferret/index/segment_merge_queue'
+require 'ferret/index/segment_infos'
+require 'ferret/index/document_writer'
+require 'ferret/index/index_reader'
+require 'ferret/index/index_writer'
+require 'ferret/index/multi_reader'
+require 'ferret/index/segment_merger'
+require 'ferret/index/segment_reader'
+require 'ferret/index/index'
data/lib/ferret/index/compound_file_io.rb
ADDED
@@ -0,0 +1,343 @@
+require 'monitor'
+require 'set'
+
+module Ferret::Index
+  # Class for accessing a compound stream.
+  # This class implements a directory, but is limited to only read operations.
+  # Directory methods that would normally modify data raise an exception.
+  class CompoundFileReader < Ferret::Store::Directory
+
+    include MonitorMixin
+
+    attr_reader :directory, :file_name
+
+    # Creates a Compound File Reader which contains a single file and has
+    # pointers to the individual files within. When it is initialized, the
+    # compound file is set and the header is read so that it is ready to read
+    # the individual files within.
+    def initialize(dir, name)
+
+      super()
+
+      @directory = dir
+      @file_name = name
+      @entries = {}
+
+      success = false
+
+      begin
+        @stream = dir.open_input(name)
+
+        # read the directory and init files
+        count = @stream.read_vint()
+        entry = nil
+        count.times() do
+          offset = @stream.read_long()
+          id = @stream.read_string()
+
+          if (entry != nil)
+            # set the length of the previous entry
+            entry.length = offset - entry.offset
+          end
+
+          entry = FileEntry.new(offset)
+          @entries[id] = entry
+        end
+
+        # set the length of the final entry
+        if (entry != nil)
+          entry.length = @stream.length() - entry.offset
+        end
+
+        success = true
+
+      ensure
+
+        if not success and (@stream != nil)
+          begin
+            @stream.close()
+          rescue IOError
+          end
+        end
+      end
+    end
+
+    def close()
+      synchronize do
+        raise(IOError, "Already closed") if @stream == nil
+
+        @entries.clear()
+        @stream.close()
+        @stream = nil
+      end
+    end
+
+    def open_input(id)
+      synchronize do
+        if (@stream == nil)
+          raise(IOError, "Stream closed")
+        end
+
+        entry = @entries[id]
+        if (entry == nil)
+          raise(IOError, "No sub-file with id " + id + " found")
+        end
+        return CSIndexInput.new(@stream, entry.offset, entry.length)
+      end
+    end
+
+    # Returns an array of strings, one for each file in the directory.
+    def list()
+      return @entries.keys()
+    end
+
+    # Returns true iff a file with the given name exists.
+    def file_exists(name)
+      return @entries.key?(name)
+    end
+
+    # Returns the time the compound file was last modified.
+    def modified(name)
+      return @directory.modified(@file_name)
+    end
+
+    # Set the modified time of the compound file to now.
+    def touch(name)
+      @directory.touch(@file_name)
+    end
+
+    # Not implemented
+    def delete(name) raise(UnsupportedOperationError) end
+
+    # Not implemented
+    def rename(from, to) raise(UnsupportedOperationError) end
+
+    # Returns the length of a file in the directory.
+    def file_length(name)
+      e = @entries[name]
+      raise(IOError, "File " + name + " does not exist") if e == nil
+      return e.length
+    end
+
+    # Not implemented
+    def create_output(name) raise(UnsupportedOperationError) end
+
+    # Not implemented
+    def make_lock(name) raise(UnsupportedOperationError) end
+
+    # Implementation of an IndexInput that reads from a portion of the
+    # compound file.
+    class CSIndexInput < Ferret::Store::BufferedIndexInput
+      attr_reader :length
+
+      def initialize(base, file_offset, length)
+        super()
+        @base = base
+        @base.extend(MonitorMixin)
+        @file_offset = file_offset
+        @length = length
+      end
+
+      # Closes the stream to further operations.
+      def close() end
+
+      private
+      # Expert: implements buffer refill. Reads bytes from the current
+      # position in the input.
+      #
+      # b::      the array to read bytes into
+      # offset:: the offset in the array to start storing bytes
+      # len::    the number of bytes to read
+      def read_internal(b, offset, len)
+        @base.synchronize() do
+          start = pos()
+          raise(EOFError, "read past EOF") if start + len > @length
+          @base.seek(@file_offset + start)
+          @base.read_bytes(b, offset, len)
+        end
+      end
+
+      # Expert: implements seek. Sets the current position in the file,
+      # where the next read_internal(b, offset, len) will occur.
+      def seek_internal(pos) end
+    end
+
+    private
+    # Base info
+    class FileEntry
+      attr_accessor :offset, :length
+      def initialize(offset)
+        @offset = offset
+      end
+    end
+
+  end
+
+  # Combines multiple files into a single compound file.
+  # The file format:
+  #
+  # * VInt fileCount
+  # * {Directory} fileCount entries with the following structure:
+  #   + long data_offset
+  #   + UTFString file_name
+  # * {File Data} fileCount entries with the raw data of the corresponding file
+  #
+  # The fileCount integer indicates how many files are contained in this
+  # compound file. The {directory} that follows has that many entries. Each
+  # directory entry contains a long pointer to the start of the file's data
+  # section and a UTF String with that file's name.
+  class CompoundFileWriter
+
+    attr_reader :directory, :file_name
+
+    # Create the compound stream in the specified file. The file name is the
+    # entire name (no extensions are added).
+    def initialize(dir, name)
+      @directory = dir
+      @file_name = name
+      @ids = Set.new
+      @file_entries = []
+      @merged = false
+    end
+
+    # Add a source stream. _file_name_ is the string by which the
+    # sub-stream will be known in the compound stream.
+    #
+    # Throws:: IllegalStateError if this writer is closed
+    # Throws:: IllegalArgumentError if a file with the same name
+    #          has been added already
+    def add_file(file_name)
+      if @merged
+        raise(IllegalStateError, "Can't add extensions after merge has been called")
+      end
+
+      if not @ids.add?(file_name)
+        raise(IllegalArgumentError, "File " + file_name + " already added")
+      end
+
+      entry = FileEntry.new(file_name)
+      @file_entries << entry
+    end
+
+    # Merge files with the extensions added up to now.
+    # All files with these extensions are combined sequentially into the
+    # compound stream. After successful merge, the source files
+    # are deleted.
+    #
+    # Throws:: IllegalStateError if close() had been called before or
+    #          if no file has been added to this object
+    def close()
+
+      if @merged
+        raise(IllegalStateError, "Merge already performed")
+      end
+
+      if @file_entries.empty?
+        raise(IllegalStateError, "No entries to merge have been defined")
+      end
+
+      @merged = true
+
+      # open the compound stream
+      os = nil
+      begin
+        os = @directory.create_output(@file_name)
+
+        # Write the number of entries
+        os.write_vint(@file_entries.size)
+
+        # Write the directory with all offsets at 0.
+        # Remember the positions of directory entries so that we can
+        # adjust the offsets later
+        @file_entries.each do |fe|
+          fe.directory_offset = os.pos()
+          os.write_long(0) # for now
+          os.write_string(fe.file_name)
+        end
+
+        # Open the files and copy their data into the stream.
+        # Remember the locations of each file's data section.
+        @file_entries.each do |fe|
+          fe.data_offset = os.pos()
+          copy_file(fe, os)
+        end
+
+        # Write the data offsets into the directory of the compound stream
+        @file_entries.each do |fe|
+          os.seek(fe.directory_offset)
+          os.write_long(fe.data_offset)
+        end
+
+        # Close the output stream. Set os to nil before trying to
+        # close so that if an exception occurs during the close, the
+        # ensure clause below will not attempt to close the stream
+        # a second time.
+        tmp = os
+        os = nil
+        tmp.close()
+
+      ensure
+        if (os != nil)
+          begin
+            os.close()
+          rescue
+          end
+        end
+      end
+    end
+
+    private
+
+    # Internal class for holding a file entry
+    class FileEntry
+
+      attr_accessor :file_name, :directory_offset, :data_offset
+
+      def initialize(file_name)
+        @file_name = file_name
+      end
+
+    end
+
+    # Copy the contents of the named source file into the
+    # provided output stream. Use a buffer for moving data
+    # to reduce memory allocation.
+    def copy_file(source, os)
+      is = nil
+      begin
+        start_ptr = os.pos()
+
+        is = @directory.open_input(source.file_name)
+        remainder = length = is.length
+
+        buffer = Ferret::Store::BUFFER.clone
+        while (remainder > 0)
+          len = [remainder, Ferret::Store::BUFFER_SIZE].min
+          is.read_bytes(buffer, 0, len)
+          os.write_bytes(buffer, len)
+          remainder -= len
+        end
+
+        # Verify that remainder is 0
+        if (remainder != 0)
+          raise(IOError,
+            "Non-zero remainder length after copying: " + remainder.to_s +
+            " (id: " + source.file_name + ", length: " + length.to_s +
+            ", buffer size: " + Ferret::Store::BUFFER_SIZE.to_s + ")")
+        end
+
+        # Verify that the output length diff is equal to the original file length
+        end_ptr = os.pos()
+        diff = end_ptr - start_ptr
+        if (diff != length)
+          raise(IOError,
+            "Difference in the output file offsets " + diff.to_s +
+            " does not match the original file length " + length.to_s)
+        end
+
+      ensure
+        is.close() if is != nil
+      end
+    end
+  end
+end
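A brief sketch of how the compound-file classes above fit together. This is illustrative only: the sub-file names are invented, and the Ferret::Store::RAMDirectory class name is an assumption drawn from data/lib/ferret/store/ram_store.rb in the manifest, not verified API.

  require 'ferret'
  include Ferret::Index

  dir = Ferret::Store::RAMDirectory.new  # assumed in-memory Directory implementation

  # Suppose "_1.f1" and "_1.f2" were previously written into dir.
  writer = CompoundFileWriter.new(dir, "_1.cfs")
  writer.add_file("_1.f1")
  writer.add_file("_1.f2")
  writer.close()  # writes the entry table, then copies each sub-file's bytes

  reader = CompoundFileReader.new(dir, "_1.cfs")
  reader.list()                        # ids of the sub-files contained
  input = reader.open_input("_1.f1")   # a CSIndexInput over that sub-file's bytes
  reader.close()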