ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
data/lib/ferret/document/field.rb
ADDED
@@ -0,0 +1,304 @@
module Ferret::Document
  # A field is a section of a Document. Each field has two parts, a name
  # and a value. Values may be free text, provided as a String or as a
  # Reader, or they may be atomic keywords, which are not further processed.
  # Such keywords may be used to represent dates, URLs, etc. Fields are
  # optionally stored in the index, so that they may be returned with hits
  # on the document.
  class Field

    # This value will be multiplied into the score of all hits on this
    # field of this document.
    #
    # The boost is multiplied by Document#boost of the document
    # containing this field. If a document has multiple fields with the same
    # name, all such values are multiplied together. This product is then
    # multiplied by the value Similarity#length_norm(String, int), and
    # rounded by Similarity#encode_norm(float) before it is stored in the
    # index. One should attempt to ensure that this product does not overflow
    # the range of that encoding.
    #
    # See Document#set_boost(float)
    # See Similarity#length_norm(String, int)
    # See Similarity#encode_norm(float)
    #
    # Note: this value is not stored directly with the document in the index.
    # Documents returned from IndexReader#document(int) and
    # Hits#doc(int) may thus not have the same value present as when this field
    # was indexed.
    attr_accessor :boost, :data

    attr_reader :name

    # True iff the value of the field is to be stored in the index for
    # return with search hits. It is an error for this to be true if a
    # field is Reader-valued.
    def stored?() return @stored end

    # True iff the value of the field is to be indexed, so that it may be
    # searched on.
    def indexed?() return @indexed end

    # True iff the value of the field should be tokenized as text prior to
    # indexing. Un-tokenized fields are indexed as a single word and may
    # not be Reader-valued.
    def tokenized?() return @tokenized end

    # True if the field is to be stored as a binary value. This can be used
    # to store images or other binary data in the index if you wish.
    def binary?() return @binary end

    # True if you want to compress the data that you store. This is a good
    # idea for really large text fields. The Ruby Zlib library is used to do
    # the compression.
    def compressed?() return @compressed end

    # True iff the term or terms used to index this field are stored as a
    # term vector, available from IndexReader#term_freq_vector(). These
    # methods do not provide access to the original content of the field,
    # only to terms used to index it. If the original content must be
    # preserved, use the _stored_ attribute instead.
    #
    # See IndexReader#term_freq_vector()
    def store_term_vector?() return @store_term_vector end

    # True if the positions of the indexed terms in this field are stored.
    def store_positions?() return @store_position end

    # True if the offsets of this field are stored. The offsets are the
    # positions of the start and end characters of the token in the whole
    # field string.
    def store_offsets?() return @store_offset end

    class Store < Ferret::Utils::Parameter
      # Store the original field value in the index in a compressed form.
      # This is useful for long documents and for binary valued fields.
      COMPRESS = Store.new("COMPRESS")

      # Store the original field value in the index. This is useful for
      # short texts like a document's title which should be displayed with
      # the results. The value is stored in its original form, i.e. no
      # analyzer is used before it is stored.
      YES = Store.new("YES")

      # Do not store the field value in the index.
      NO = Store.new("NO")
    end

    class Index < Ferret::Utils::Parameter
      # Do not index the field value. This field can thus not be searched,
      # but one can still access its contents provided it is Field.Store
      # stored.
      NO = Index.new("NO")

      # Index the field's value so it can be searched. An Analyzer will be
      # used to tokenize and possibly further normalize the text before its
      # terms will be stored in the index. This is useful for common text.
      TOKENIZED = Index.new("TOKENIZED")

      # Index the field's value without using an Analyzer, so it can be
      # searched. As no analyzer is used the value will be stored as a
      # single term. This is useful for unique Ids like product numbers.
      UNTOKENIZED = Index.new("UNTOKENIZED")
    end

    class TermVector < Ferret::Utils::Parameter
      # Do not store term vectors.
      NO = TermVector.new("NO")

      # Store the term vectors of each document. A term vector is a list of
      # the document's terms and their number of occurrences in that
      # document.
      YES = TermVector.new("YES")

      # Store the term vector + token position information
      #
      # See #YES
      WITH_POSITIONS = TermVector.new("WITH_POSITIONS")

      # Store the term vector + token offset information
      #
      # See #YES
      WITH_OFFSETS = TermVector.new("WITH_OFFSETS")

      # Store the term vector + token position and offset information
      #
      # See #YES, #WITH_POSITIONS and #WITH_OFFSETS
      WITH_POSITIONS_OFFSETS = TermVector.new("WITH_POSITIONS_OFFSETS")
    end

    # Create a field by specifying its name, value and how it will
    # be saved in the index.
    #
    # name::   The name of the field
    # value::  The string to process
    # stored:: Whether _value_ should be stored in the index
    # index::  Whether the field should be indexed, and if so, if it should
    #          be tokenized before indexing
    # store_term_vector:: Whether the term vector should be stored
    # binary:: Whether you want to store binary data in this field. Default is
    #          false
    # boost::  the boost for this field. Default is 1.0. A larger number makes
    #          this field more important.
    #
    # Raises an ArgumentError in either of the following situations:
    # * the field is neither stored nor indexed
    # * the field is not indexed but term_vector is _TermVector::YES_
    def initialize(name,
                   value,
                   stored = Store::YES,
                   index = Index::UNTOKENIZED,
                   store_term_vector = TermVector::NO,
                   binary = false,
                   boost = 1.0)
      if (index == Index::NO and stored == Store::NO)
        raise ArgumentError, "it doesn't make sense to have a field that " +
          "is neither indexed nor stored"
      end
      if (index == Index::NO && store_term_vector != TermVector::NO)
        raise ArgumentError, "cannot store term vector information for a " +
          "field that is not indexed"
      end

      # The name of the field (e.g., "date", "subject", "title", or "body")
      @name = name

      # the one and only data object for all different kinds of field values
      @data = value
      self.stored = stored
      self.index = index
      self.store_term_vector = store_term_vector
      @binary = binary
      @boost = boost
    end

    def stored=(stored)
      if (stored == Store::YES)
        @stored = true
        @compressed = false
      elsif (stored == Store::COMPRESS)
        @stored = true
        @compressed = true
      elsif (stored == Store::NO)
        @stored = false
        @compressed = false
      else
        raise "unknown stored parameter " + stored.to_s
      end
    end

    def index=(index)
      if (index == Index::NO)
        @indexed = false
        @tokenized = false
      elsif (index == Index::TOKENIZED)
        @indexed = true
        @tokenized = true
      elsif (index == Index::UNTOKENIZED)
        @indexed = true
        @tokenized = false
      else
        raise "unknown index parameter " + index.to_s
      end
    end

    def store_term_vector=(store_term_vector)
      if (store_term_vector == TermVector::NO)
        @store_term_vector = false
        @store_position = false
        @store_offset = false
      elsif (store_term_vector == TermVector::YES)
        @store_term_vector = true
        @store_position = false
        @store_offset = false
      elsif (store_term_vector == TermVector::WITH_POSITIONS)
        @store_term_vector = true
        @store_position = true
        @store_offset = false
      elsif (store_term_vector == TermVector::WITH_OFFSETS)
        @store_term_vector = true
        @store_position = false
        @store_offset = true
      elsif (store_term_vector == TermVector::WITH_POSITIONS_OFFSETS)
        @store_term_vector = true
        @store_position = true
        @store_offset = true
      else
        raise "unknown term_vector parameter " + store_term_vector.to_s
      end
    end

    # Returns the string value of the data that is stored in this field
    def string_value
      if @data.instance_of? String
        return @data
      elsif @data.respond_to? :read
        return @data.read()
      else
        # if it is a binary object try to return a string representation
        return @data.to_s
      end
    end

    # If the data is stored as a binary, just return it.
    def binary_value
      return @data
    end

    # Returns a Reader for the data that is stored in this field
    def reader_value
      if @data.respond_to? :read
        return @data
      elsif @data.instance_of? String
        return Ferret::Utils::StringHelper::StringReader.new(@data)
      else
        # if it is a binary object try to return a string representation
        return Ferret::Utils::StringHelper::StringReader.new(@data.to_s)
      end
    end

    # Create a stored field with a binary value. Optionally the value
    # may be compressed, but it obviously won't be tokenized or
    # term vectored or anything like that.
    #
    # name::   The name of the field
    # value::  The binary value
    # stored:: How _value_ should be stored (compressed or not)
    def Field.new_binary_field(name, value, stored)
      if (stored == Store::NO)
        raise ArgumentError, "binary values can't be unstored"
      end
      Field.new(name, value, stored, Index::NO, TermVector::NO, true)
    end

    # Prints a Field for human consumption.
    def to_s()
      str = ""
      if (@stored)
        str << "stored"
        str << (@compressed ? "/compressed," : "/uncompressed,")
      end
      if (@indexed) then str << "indexed," end
      if (@tokenized) then str << "tokenized," end
      if (@store_term_vector) then str << "store_term_vector," end
      if (@store_offset)
        str << "term_vector_offsets,"
      end
      if (@store_position)
        str << "term_vector_position,"
      end
      if (@binary) then str << "binary," end

      str << '<'
      str << @name
      str << ':'

      if (@data != nil)
        str << @data.to_s
      end

      str << '>'
    end
  end
end
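For orientation, here is a minimal usage sketch of the Field API defined above. The field names, values and the thumb.png path are illustrative, and it assumes the gem is installed so that require 'ferret' loads Ferret::Document:

require 'ferret'
include Ferret::Document

# A stored, tokenized body field with full term vectors and a custom boost.
body = Field.new("body", "the quick brown fox",
                 Field::Store::YES, Field::Index::TOKENIZED,
                 Field::TermVector::WITH_POSITIONS_OFFSETS)
body.boost = 2.0

# A stored identifier kept as a single term (UNTOKENIZED is the default).
id = Field.new("id", "prod-0042", Field::Store::YES, Field::Index::UNTOKENIZED)

# A stored, compressed binary field built through the factory method.
thumb = Field.new_binary_field("thumb", File.read("thumb.png"),
                               Field::Store::COMPRESS)

# The constructor's sanity checks from above in action:
# Field.new("f", "v", Field::Store::NO, Field::Index::NO)  # => ArgumentError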
data/lib/ferret/index.rb
ADDED
@@ -0,0 +1,26 @@
require 'ferret/index/index_file_names'
require 'ferret/index/term'
require 'ferret/index/term_buffer'
require 'ferret/index/term_doc_enum'
require 'ferret/index/multiple_term_doc_pos_enum'
require 'ferret/index/term_enum'
require 'ferret/index/term_info'
require 'ferret/index/term_infos_io'
require 'ferret/index/term_vector_offset_info'
require 'ferret/index/term_vectors_io'
require 'ferret/index/field_infos'
require 'ferret/index/fields_io'
require 'ferret/index/compound_file_io'
require 'ferret/index/term_buffer'
require 'ferret/index/segment_term_enum'
require 'ferret/index/segment_term_vector'
require 'ferret/index/segment_merge_info'
require 'ferret/index/segment_merge_queue'
require 'ferret/index/segment_infos'
require 'ferret/index/document_writer'
require 'ferret/index/index_reader'
require 'ferret/index/index_writer'
require 'ferret/index/multi_reader'
require 'ferret/index/segment_merger'
require 'ferret/index/segment_reader'
require 'ferret/index/index'
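The list above wires up the whole index layer in dependency order. A minimal loading sketch, assuming the gem is installed (or an unpacked copy's lib directory is on the load path) and that the top-level ferret.rb pulls in the store layer before 'ferret/index':

require 'ferret'

# The classes defined by the files required above are now available, e.g.:
p Ferret::Index::CompoundFileWriter  # from compound_file_io.rb, shown next
p Ferret::Index::IndexWriter         # from index_writer.rb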
data/lib/ferret/index/compound_file_io.rb
ADDED
@@ -0,0 +1,343 @@
require 'monitor'
require 'set'

module Ferret::Index

  # Class for accessing a compound stream.
  # This class implements a directory, but is limited to only read operations.
  # Directory methods that would normally modify data raise instead.
  class CompoundFileReader < Ferret::Store::Directory

    include MonitorMixin

    attr_reader :directory, :file_name

    # Creates a Compound File Reader which contains a single file and has
    # pointers to the individual files within. When it is initialized, the
    # compound file is set and the header is read so that it is ready to read
    # the individual files within.
    def initialize(dir, name)

      super()

      @directory = dir
      @file_name = name
      @entries = {}

      success = false

      begin
        @stream = dir.open_input(name)

        # read the directory and init files
        count = @stream.read_vint()
        entry = nil
        count.times() do
          offset = @stream.read_long()
          id = @stream.read_string()

          if (entry != nil)
            # set length of the previous entry
            entry.length = offset - entry.offset
          end

          entry = FileEntry.new(offset)
          @entries[id] = entry
        end

        # set the length of the final entry
        if (entry != nil)
          entry.length = @stream.length() - entry.offset
        end

        success = true

      ensure

        if not success and (@stream != nil)
          begin
            @stream.close()
          rescue IOError
          end
        end
      end
    end

    def close()
      synchronize do
        raise(IOError, "Already closed") if @stream == nil

        @entries.clear()
        @stream.close()
        @stream = nil
      end
    end

    def open_input(id)
      synchronize do
        if (@stream == nil)
          raise(IOError, "Stream closed")
        end

        entry = @entries[id]
        if (entry == nil)
          raise(IOError, "No sub-file with id " + id + " found")
        end
        return CSIndexInput.new(@stream, entry.offset, entry.length)
      end
    end

    # Returns an array of strings, one for each file in the directory.
    def list()
      return @entries.keys()
    end

    # Returns true iff a file with the given name exists.
    def file_exists(name)
      return @entries.key?(name)
    end

    # Returns the time the named file was last modified.
    def modified(name)
      return @directory.modified(@file_name)
    end

    # Set the modified time of an existing file to now.
    def touch(name)
      @directory.touch(@file_name)
    end

    # Not implemented
    def delete(name) raise(UnsupportedOperationError) end

    # Not implemented
    def rename(from, to) raise(UnsupportedOperationError) end

    # Returns the length of a file in the directory.
    def file_length(name)
      e = @entries[name]
      raise(IOError, "File " + name + " does not exist") if e == nil
      return e.length
    end

    # Not implemented
    def create_output(name) raise(UnsupportedOperationError) end

    # Not implemented
    def make_lock(name) raise(UnsupportedOperationError) end

    # Implementation of an IndexInput that reads from a portion of the
    # compound file.
    class CSIndexInput < Ferret::Store::BufferedIndexInput
      attr_reader :length

      def initialize(base, file_offset, length)
        super()
        @base = base
        @base.extend(MonitorMixin)
        @file_offset = file_offset
        @length = length
      end

      # Closes the stream to further operations.
      def close() end

      private
        # Expert: implements buffer refill. Reads bytes from the current
        # position in the input.
        #
        # b::      the array to read bytes into
        # offset:: the offset in the array to start storing bytes
        # len::    the number of bytes to read
        def read_internal(b, offset, len)
          @base.synchronize() do
            start = pos()
            raise(EOFError, "read past EOF") if start + len > @length
            @base.seek(@file_offset + start)
            @base.read_bytes(b, offset, len)
          end
        end

        # Expert: implements seek. Sets the current position in the file,
        # where the next read_internal(b, offset, len) will occur.
        def seek_internal(pos) end
    end

    private
      # Base info
      class FileEntry
        attr_accessor :offset, :length
        def initialize(offset)
          @offset = offset
        end
      end

  end

  # Combines multiple files into a single compound file.
  # The file format:
  #
  # * VInt fileCount
  # * {Directory} fileCount entries with the following structure:
  #   + long data_offset
  #   + UTFString extension
  # * {File Data} fileCount entries with the raw data of the corresponding file
  #
  # The fileCount integer indicates how many files are contained in this
  # compound file. The {directory} that follows has that many entries. Each
  # directory entry contains an encoding identifier, a long pointer to the
  # start of this file's data section, and a UTF String with that file's
  # extension.
  class CompoundFileWriter

    attr_reader :directory, :file_name

    # Create the compound stream in the specified file. The file name is the
    # entire name (no extensions are added).
    def initialize(dir, name)
      @directory = dir
      @file_name = name
      @ids = Set.new
      @file_entries = []
      @merged = false
    end

    # Add a source stream. _file_name_ is the string by which the
    # sub-stream will be known in the compound stream.
    #
    # Throws:: IllegalStateError if this writer is closed
    # Throws:: IllegalArgumentError if a file with the same name
    #          has been added already
    def add_file(file_name)
      if @merged
        raise(IllegalStateError, "Can't add extensions after merge has been called")
      end

      if not @ids.add?(file_name)
        raise(IllegalArgumentError, "File " + file_name + " already added")
      end

      entry = FileEntry.new(file_name)
      @file_entries << entry
    end

    # Merge files with the extensions added up to now.
    # All files with these extensions are combined sequentially into the
    # compound stream. After a successful merge, the source files
    # are deleted.
    #
    # Throws:: IllegalStateError if close() has been called before or
    #          if no file has been added to this object
    def close()

      if @merged
        raise(IllegalStateError, "Merge already performed")
      end

      if @file_entries.empty?
        raise(IllegalStateError, "No entries to merge have been defined")
      end

      @merged = true

      # open the compound stream
      os = nil
      begin
        os = @directory.create_output(@file_name)

        # Write the number of entries
        os.write_vint(@file_entries.size)

        # Write the directory with all offsets at 0.
        # Remember the positions of directory entries so that we can
        # adjust the offsets later
        @file_entries.each do |fe|
          fe.directory_offset = os.pos()
          os.write_long(0) # for now
          os.write_string(fe.file_name)
        end

        # Open the files and copy their data into the stream.
        # Remember the locations of each file's data section.
        @file_entries.each do |fe|
          fe.data_offset = os.pos()
          copy_file(fe, os)
        end

        # Write the data offsets into the directory of the compound stream
        @file_entries.each do |fe|
          os.seek(fe.directory_offset)
          os.write_long(fe.data_offset)
        end

        # Close the output stream. Set os to nil before trying to
        # close so that if an exception occurs during the close, the
        # ensure clause below will not attempt to close the stream
        # a second time.
        tmp = os
        os = nil
        tmp.close()

      ensure
        if (os != nil)
          begin
            os.close()
          rescue
          end
        end
      end
    end

    private

      # Internal class for holding a file
      class FileEntry

        attr_accessor :file_name, :directory_offset, :data_offset

        def initialize(file_name)
          @file_name = file_name
        end

      end

      # Copy the contents of the file with the specified extension into the
      # provided output stream. Use a buffer for moving data
      # to reduce memory allocation.
      def copy_file(source, os)
        is = nil
        begin
          start_ptr = os.pos()

          is = @directory.open_input(source.file_name)
          remainder = length = is.length

          buffer = Ferret::Store::BUFFER.clone
          while (remainder > 0)
            len = [remainder, Ferret::Store::BUFFER_SIZE].min
            is.read_bytes(buffer, 0, len)
            os.write_bytes(buffer, len)
            remainder -= len
          end

          # Verify that remainder is 0
          if (remainder != 0)
            raise(IOError,
              "Non-zero remainder length after copying: " + remainder.to_s +
              " (id: " + source.file_name + ", length: " + length.to_s +
              ", buffer size: " + Ferret::Store::BUFFER_SIZE.to_s + ")")
          end

          # Verify that the output length diff is equal to the original file length
          end_ptr = os.pos()
          diff = end_ptr - start_ptr
          if (diff != length)
            raise(IOError,
              "Difference in the output file offsets " + diff.to_s +
              " does not match the original file length " + length.to_s)
          end

        ensure
          is.close() if is != nil
        end
      end
  end
end
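To see the writer and reader cooperate, here is a round-trip sketch. It assumes a RAM-backed directory class is exposed as Ferret::Store::RAMDirectory (from data/lib/ferret/store/ram_store.rb in the listing); the file names are illustrative, and only methods used by the code above (create_output, open_input, write_string, read_string) are called:

require 'ferret'

dir = Ferret::Store::RAMDirectory.new  # assumption: RAM directory from ram_store.rb

# Create two small source files in the directory.
["_1.fnm", "_1.fdt"].each do |name|
  out = dir.create_output(name)
  out.write_string("contents of " + name)
  out.close()
end

# Pack them into a single compound file; close() performs the merge.
writer = Ferret::Index::CompoundFileWriter.new(dir, "_1.cfs")
writer.add_file("_1.fnm")
writer.add_file("_1.fdt")
writer.close()

# Read a sub-file back out through the compound reader.
reader = Ferret::Index::CompoundFileReader.new(dir, "_1.cfs")
p reader.list()                 # => ["_1.fnm", "_1.fdt"]
p reader.file_exists("_1.fdt")  # => true
input = reader.open_input("_1.fnm")
p input.read_string()           # => "contents of _1.fnm"
input.close()
reader.close()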