jk-ferret 0.11.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +24 -0
- data/MIT-LICENSE +20 -0
- data/README +90 -0
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +443 -0
- data/TODO +109 -0
- data/TUTORIAL +231 -0
- data/bin/ferret-browser +79 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/STEMMER_api.c +66 -0
- data/ext/STEMMER_libstemmer.c +93 -0
- data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
- data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
- data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/STEMMER_stem_UTF_8_german.c +509 -0
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/STEMMER_utilities.c +478 -0
- data/ext/analysis.c +1710 -0
- data/ext/analysis.h +266 -0
- data/ext/api.h +26 -0
- data/ext/array.c +125 -0
- data/ext/array.h +62 -0
- data/ext/bitvector.c +96 -0
- data/ext/bitvector.h +594 -0
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +384 -0
- data/ext/config.h +52 -0
- data/ext/document.c +159 -0
- data/ext/document.h +63 -0
- data/ext/except.c +102 -0
- data/ext/except.h +176 -0
- data/ext/extconf.rb +15 -0
- data/ext/ferret.c +416 -0
- data/ext/ferret.h +94 -0
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +157 -0
- data/ext/fs_store.c +493 -0
- data/ext/global.c +458 -0
- data/ext/global.h +302 -0
- data/ext/hash.c +524 -0
- data/ext/hash.h +515 -0
- data/ext/hashset.c +192 -0
- data/ext/hashset.h +215 -0
- data/ext/header.h +58 -0
- data/ext/helper.c +63 -0
- data/ext/helper.h +21 -0
- data/ext/index.c +6804 -0
- data/ext/index.h +935 -0
- data/ext/internal.h +1019 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +68 -0
- data/ext/libstemmer.h +79 -0
- data/ext/mempool.c +88 -0
- data/ext/mempool.h +43 -0
- data/ext/modules.h +190 -0
- data/ext/multimapper.c +351 -0
- data/ext/multimapper.h +60 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +973 -0
- data/ext/priorityqueue.c +149 -0
- data/ext/priorityqueue.h +155 -0
- data/ext/q_boolean.c +1621 -0
- data/ext/q_const_score.c +162 -0
- data/ext/q_filtered_query.c +212 -0
- data/ext/q_fuzzy.c +280 -0
- data/ext/q_match_all.c +149 -0
- data/ext/q_multi_term.c +673 -0
- data/ext/q_parser.c +3103 -0
- data/ext/q_phrase.c +1206 -0
- data/ext/q_prefix.c +98 -0
- data/ext/q_range.c +682 -0
- data/ext/q_span.c +2390 -0
- data/ext/q_term.c +337 -0
- data/ext/q_wildcard.c +167 -0
- data/ext/r_analysis.c +2626 -0
- data/ext/r_index.c +3468 -0
- data/ext/r_qparser.c +635 -0
- data/ext/r_search.c +4490 -0
- data/ext/r_store.c +513 -0
- data/ext/r_utils.c +1131 -0
- data/ext/ram_store.c +476 -0
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +1864 -0
- data/ext/search.h +953 -0
- data/ext/similarity.c +151 -0
- data/ext/similarity.h +89 -0
- data/ext/sort.c +786 -0
- data/ext/stem_ISO_8859_1_danish.h +16 -0
- data/ext/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/stem_ISO_8859_1_english.h +16 -0
- data/ext/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/stem_ISO_8859_1_french.h +16 -0
- data/ext/stem_ISO_8859_1_german.h +16 -0
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_1_italian.h +16 -0
- data/ext/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/stem_ISO_8859_1_porter.h +16 -0
- data/ext/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_KOI8_R_russian.h +16 -0
- data/ext/stem_UTF_8_danish.h +16 -0
- data/ext/stem_UTF_8_dutch.h +16 -0
- data/ext/stem_UTF_8_english.h +16 -0
- data/ext/stem_UTF_8_finnish.h +16 -0
- data/ext/stem_UTF_8_french.h +16 -0
- data/ext/stem_UTF_8_german.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_italian.h +16 -0
- data/ext/stem_UTF_8_norwegian.h +16 -0
- data/ext/stem_UTF_8_porter.h +16 -0
- data/ext/stem_UTF_8_portuguese.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_russian.h +16 -0
- data/ext/stem_UTF_8_spanish.h +16 -0
- data/ext/stem_UTF_8_swedish.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +410 -0
- data/ext/store.c +698 -0
- data/ext/store.h +799 -0
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +73 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +62 -0
- data/lib/ferret.rb +30 -0
- data/lib/ferret/browser.rb +246 -0
- data/lib/ferret/browser/s/global.js +192 -0
- data/lib/ferret/browser/s/style.css +148 -0
- data/lib/ferret/browser/views/document/list.rhtml +49 -0
- data/lib/ferret/browser/views/document/show.rhtml +27 -0
- data/lib/ferret/browser/views/error/index.rhtml +7 -0
- data/lib/ferret/browser/views/help/index.rhtml +8 -0
- data/lib/ferret/browser/views/home/index.rhtml +29 -0
- data/lib/ferret/browser/views/layout.rhtml +22 -0
- data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
- data/lib/ferret/browser/views/term/index.rhtml +199 -0
- data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
- data/lib/ferret/browser/webrick.rb +14 -0
- data/lib/ferret/document.rb +130 -0
- data/lib/ferret/field_infos.rb +44 -0
- data/lib/ferret/field_symbol.rb +87 -0
- data/lib/ferret/index.rb +973 -0
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret/version.rb +3 -0
- data/setup.rb +1555 -0
- data/test/long_running/largefile/tc_largefile.rb +46 -0
- data/test/test_all.rb +5 -0
- data/test/test_helper.rb +29 -0
- data/test/test_installed.rb +1 -0
- data/test/threading/number_to_spoken.rb +132 -0
- data/test/threading/thread_safety_index_test.rb +88 -0
- data/test/threading/thread_safety_read_write_test.rb +73 -0
- data/test/threading/thread_safety_test.rb +133 -0
- data/test/unit/analysis/tc_analyzer.rb +550 -0
- data/test/unit/analysis/tc_token_stream.rb +653 -0
- data/test/unit/index/tc_index.rb +867 -0
- data/test/unit/index/tc_index_reader.rb +699 -0
- data/test/unit/index/tc_index_writer.rb +447 -0
- data/test/unit/index/th_doc.rb +332 -0
- data/test/unit/query_parser/tc_query_parser.rb +238 -0
- data/test/unit/search/tc_filter.rb +156 -0
- data/test/unit/search/tc_fuzzy_query.rb +147 -0
- data/test/unit/search/tc_index_searcher.rb +67 -0
- data/test/unit/search/tc_multi_searcher.rb +128 -0
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tc_search_and_sort.rb +179 -0
- data/test/unit/search/tc_sort.rb +49 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +190 -0
- data/test/unit/search/tm_searcher.rb +436 -0
- data/test/unit/store/tc_fs_store.rb +115 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +34 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +2 -0
- data/test/unit/ts_index.rb +2 -0
- data/test/unit/ts_largefile.rb +4 -0
- data/test/unit/ts_query_parser.rb +2 -0
- data/test/unit/ts_search.rb +2 -0
- data/test/unit/ts_store.rb +2 -0
- data/test/unit/ts_utils.rb +2 -0
- data/test/unit/utils/tc_bit_vector.rb +295 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +319 -0
data/lib/ferret/index.rb
ADDED
@@ -0,0 +1,973 @@
|
|
1
|
+
require 'monitor'
|
2
|
+
|
3
|
+
module Ferret::Index
|
4
|
+
# This is a simplified interface to the index. See the TUTORIAL for more
|
5
|
+
# information on how to use this class.
|
6
|
+
class Index
|
7
|
+
include MonitorMixin
|
8
|
+
|
9
|
+
include Ferret::Store
|
10
|
+
include Ferret::Search
|
11
|
+
|
12
|
+
attr_reader :options
|
13
|
+
|
14
|
+
# If you create an Index without any options, it'll simply create an index
|
15
|
+
# in memory. But this class is highly configurable and every option that
|
16
|
+
# you can supply to IndexWriter and QueryParser, you can also set here.
|
17
|
+
# Please look at the options for the constructors to these classes.
|
18
|
+
#
|
19
|
+
# === Options
|
20
|
+
#
|
21
|
+
# See;
|
22
|
+
#
|
23
|
+
# * QueryParser
|
24
|
+
# * IndexWriter
|
25
|
+
#
|
26
|
+
# default_input_field:: Default: "id". This specifies the default field
|
27
|
+
# that will be used when you add a simple string
|
28
|
+
# to the index using #add_document or <<.
|
29
|
+
# id_field:: Default: "id". This field is as the field to
|
30
|
+
# search when doing searches on a term. For
|
31
|
+
# example, if you do a lookup by term "cat", ie
|
32
|
+
# index["cat"], this will be the field that is
|
33
|
+
# searched.
|
34
|
+
# key:: Default: nil. Expert: This should only be used
|
35
|
+
# if you really know what you are doing. Basically
|
36
|
+
# you can set a field or an array of fields to be
|
37
|
+
# the key for the index. So if you add a document
|
38
|
+
# with a same key as an existing document, the
|
39
|
+
# existing document will be replaced by the new
|
40
|
+
# object. Using a multiple field key will slow
|
41
|
+
# down indexing so it should not be done if
|
42
|
+
# performance is a concern. A single field key (or
|
43
|
+
# id) should be find however. Also, you must make
|
44
|
+
# sure that your key/keys are either untokenized
|
45
|
+
# or that they are not broken up by the analyzer.
|
46
|
+
# auto_flush:: Default: false. Set this option to true if you
|
47
|
+
# want the index automatically flushed every time
|
48
|
+
# you do a write (includes delete) to the index.
|
49
|
+
# This is useful if you have multiple processes
|
50
|
+
# accessing the index and you don't want lock
|
51
|
+
# errors. Setting :auto_flush to true has a huge
|
52
|
+
# performance impact so don't use it if you are
|
53
|
+
# concerned about performance. In that case you
|
54
|
+
# should think about setting up a DRb indexing
|
55
|
+
# service.
|
56
|
+
# lock_retry_time:: Default: 2 seconds. This parameter specifies how
|
57
|
+
# long to wait before retrying to obtain the
|
58
|
+
# commit lock when detecting if the IndexReader is
|
59
|
+
# at the latest version.
|
60
|
+
# close_dir:: Default: false. If you explicitly pass a
|
61
|
+
# Directory object to this class and you want
|
62
|
+
# Index to close it when it is closed itself then
|
63
|
+
# set this to true.
|
64
|
+
# use_typed_range_query:: Default: true. Use TypedRangeQuery instead of
|
65
|
+
# the standard RangeQuery when parsing
|
66
|
+
# range queries. This is useful if you have number
|
67
|
+
# fields which you want to perform range queries
|
68
|
+
# on. You won't need to pad or normalize the data
|
69
|
+
# in the field in anyway to get correct results.
|
70
|
+
# However, performance will be a lot slower for
|
71
|
+
# large indexes, hence the default.
|
72
|
+
#
|
73
|
+
# == Examples
|
74
|
+
#
|
75
|
+
# index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
|
76
|
+
#
|
77
|
+
# index = Index::Index.new(:path => '/path/to/index',
|
78
|
+
# :create_if_missing => false,
|
79
|
+
# :auto_flush => true)
|
80
|
+
#
|
81
|
+
# index = Index::Index.new(:dir => directory,
|
82
|
+
# :default_slop => 2,
|
83
|
+
# :handle_parse_errors => false)
|
84
|
+
#
|
85
|
+
# You can also pass a block if you like. The index will be yielded and
|
86
|
+
# closed at the index of the box. For example;
|
87
|
+
#
|
88
|
+
# Ferret::I.new() do |index|
|
89
|
+
# # do stuff with index. Most of your actions will be cached.
|
90
|
+
# end
|
91
|
+
def initialize(options = {}, &block)
|
92
|
+
super()
|
93
|
+
|
94
|
+
if options[:key]
|
95
|
+
@key = options[:key]
|
96
|
+
if @key.is_a?(Array)
|
97
|
+
@key.flatten.map {|k| k.to_s.intern}
|
98
|
+
end
|
99
|
+
else
|
100
|
+
@key = nil
|
101
|
+
end
|
102
|
+
|
103
|
+
if (fi = options[:field_infos]).is_a?(String)
|
104
|
+
options[:field_infos] = FieldInfos.load(fi)
|
105
|
+
end
|
106
|
+
|
107
|
+
@close_dir = options[:close_dir]
|
108
|
+
if options[:dir].is_a?(String)
|
109
|
+
options[:path] = options[:dir]
|
110
|
+
end
|
111
|
+
if options[:path]
|
112
|
+
@close_dir = true
|
113
|
+
begin
|
114
|
+
@dir = FSDirectory.new(options[:path], options[:create])
|
115
|
+
rescue IOError => io
|
116
|
+
@dir = FSDirectory.new(options[:path],
|
117
|
+
options[:create_if_missing] != false)
|
118
|
+
end
|
119
|
+
elsif options[:dir]
|
120
|
+
@dir = options[:dir]
|
121
|
+
else
|
122
|
+
options[:create] = true # this should always be true for a new RAMDir
|
123
|
+
@close_dir = true
|
124
|
+
@dir = RAMDirectory.new
|
125
|
+
end
|
126
|
+
|
127
|
+
@dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin
|
128
|
+
options[:dir] = @dir
|
129
|
+
options[:lock_retry_time]||= 2
|
130
|
+
@options = options
|
131
|
+
if (!@dir.exists?("segments")) || options[:create]
|
132
|
+
IndexWriter.new(options).close
|
133
|
+
end
|
134
|
+
options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new
|
135
|
+
if options[:use_typed_range_query].nil?
|
136
|
+
options[:use_typed_range_query] = true
|
137
|
+
end
|
138
|
+
|
139
|
+
@searcher = nil
|
140
|
+
@writer = nil
|
141
|
+
@reader = nil
|
142
|
+
|
143
|
+
@options.delete(:create) # only create the first time if at all
|
144
|
+
@auto_flush = @options[:auto_flush] || false
|
145
|
+
if (@options[:id_field].nil? and @key.is_a?(Symbol))
|
146
|
+
@id_field = @key
|
147
|
+
else
|
148
|
+
@id_field = @options[:id_field] || :id
|
149
|
+
end
|
150
|
+
@default_field = (@options[:default_field]||= :*)
|
151
|
+
@default_input_field = options[:default_input_field] || @id_field
|
152
|
+
|
153
|
+
if @default_input_field.respond_to?(:intern)
|
154
|
+
@default_input_field = @default_input_field.intern
|
155
|
+
end
|
156
|
+
@open = true
|
157
|
+
@qp = nil
|
158
|
+
if block
|
159
|
+
yield self
|
160
|
+
self.close
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
# Returns an array of strings with the matches highlighted. The +query+ can
|
165
|
+
# either a query String or a Ferret::Search::Query object. The doc_id is
|
166
|
+
# the id of the document you want to highlight (usually returned by the
|
167
|
+
# search methods). There are also a number of options you can pass;
|
168
|
+
#
|
169
|
+
# === Options
|
170
|
+
#
|
171
|
+
# field:: Default: @options[:default_field]. The default_field
|
172
|
+
# is the field that is usually highlighted but you can
|
173
|
+
# specify which field you want to highlight here. If
|
174
|
+
# you want to highlight multiple fields then you will
|
175
|
+
# need to call this method multiple times.
|
176
|
+
# excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
|
177
|
+
# terms will be in the centre of the excerpt. Set to
|
178
|
+
# :all to highlight the entire field.
|
179
|
+
# num_excerpts:: Default: 2. Number of excerpts to return.
|
180
|
+
# pre_tag:: Default: "<b>". Tag to place to the left of the
|
181
|
+
# match. You'll probably want to change this to a
|
182
|
+
# "<span>" tag with a class. Try "\033[36m" for use in
|
183
|
+
# a terminal.
|
184
|
+
# post_tag:: Default: "</b>". This tag should close the
|
185
|
+
# +:pre_tag+. Try tag "\033[m" in the terminal.
|
186
|
+
# ellipsis:: Default: "...". This is the string that is appended
|
187
|
+
# at the beginning and end of excerpts (unless the
|
188
|
+
# excerpt hits the start or end of the field.
|
189
|
+
# Alternatively you may want to use the HTML entity
|
190
|
+
# … or the UTF-8 string "\342\200\246".
|
191
|
+
def highlight(query, doc_id, options = {})
|
192
|
+
@dir.synchronize do
|
193
|
+
ensure_searcher_open()
|
194
|
+
@searcher.highlight(do_process_query(query),
|
195
|
+
doc_id,
|
196
|
+
options[:field]||@options[:default_field],
|
197
|
+
options)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
# Closes this index by closing its associated reader and writer objects.
|
202
|
+
def close
|
203
|
+
@dir.synchronize do
|
204
|
+
if not @open
|
205
|
+
raise(StandardError, "tried to close an already closed directory")
|
206
|
+
end
|
207
|
+
@searcher.close() if @searcher
|
208
|
+
@reader.close() if @reader
|
209
|
+
@writer.close() if @writer
|
210
|
+
@dir.close() if @close_dir
|
211
|
+
|
212
|
+
@open = false
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
# Get the reader for this index.
|
217
|
+
# NOTE:: This will close the writer from this index.
|
218
|
+
def reader
|
219
|
+
ensure_reader_open()
|
220
|
+
return @reader
|
221
|
+
end
|
222
|
+
|
223
|
+
# Get the searcher for this index.
|
224
|
+
# NOTE:: This will close the writer from this index.
|
225
|
+
def searcher
|
226
|
+
ensure_searcher_open()
|
227
|
+
return @searcher
|
228
|
+
end
|
229
|
+
|
230
|
+
# Get the writer for this index.
|
231
|
+
# NOTE:: This will close the reader from this index.
|
232
|
+
def writer
|
233
|
+
ensure_writer_open()
|
234
|
+
return @writer
|
235
|
+
end
|
236
|
+
|
237
|
+
# Adds a document to this index, using the provided analyzer instead of
|
238
|
+
# the local analyzer if provided. If the document contains more than
|
239
|
+
# IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder are
|
240
|
+
# discarded.
|
241
|
+
#
|
242
|
+
# There are three ways to add a document to the index.
|
243
|
+
# To add a document you can simply add a string or an array of strings.
|
244
|
+
# This will store all the strings in the "" (ie empty string) field
|
245
|
+
# (unless you specify the default_field when you create the index).
|
246
|
+
#
|
247
|
+
# index << "This is a new document to be indexed"
|
248
|
+
# index << ["And here", "is another", "new document", "to be indexed"]
|
249
|
+
#
|
250
|
+
# But these are pretty simple documents. If this is all you want to index
|
251
|
+
# you could probably just use SimpleSearch. So let's give our documents
|
252
|
+
# some fields;
|
253
|
+
#
|
254
|
+
# index << {:title => "Programming Ruby", :content => "blah blah blah"}
|
255
|
+
# index << {:title => "Programming Ruby", :content => "yada yada yada"}
|
256
|
+
#
|
257
|
+
# Or if you are indexing data stored in a database, you'll probably want
|
258
|
+
# to store the id;
|
259
|
+
#
|
260
|
+
# index << {:id => row.id, :title => row.title, :date => row.date}
|
261
|
+
#
|
262
|
+
# See FieldInfos for more information on how to set field properties.
|
263
|
+
def add_document(doc, analyzer = nil)
|
264
|
+
@dir.synchronize do
|
265
|
+
ensure_writer_open()
|
266
|
+
if doc.is_a?(String) or doc.is_a?(Array)
|
267
|
+
doc = {@default_input_field => doc}
|
268
|
+
end
|
269
|
+
|
270
|
+
# delete existing documents with the same key
|
271
|
+
if @key
|
272
|
+
if @key.is_a?(Array)
|
273
|
+
query = @key.inject(BooleanQuery.new()) do |bq, field|
|
274
|
+
bq.add_query(TermQuery.new(field, doc[field].to_s), :must)
|
275
|
+
bq
|
276
|
+
end
|
277
|
+
query_delete(query)
|
278
|
+
else
|
279
|
+
id = doc[@key].to_s
|
280
|
+
if id
|
281
|
+
@writer.delete(@key, id)
|
282
|
+
end
|
283
|
+
end
|
284
|
+
end
|
285
|
+
ensure_writer_open()
|
286
|
+
|
287
|
+
if analyzer
|
288
|
+
old_analyzer = @writer.analyzer
|
289
|
+
@writer.analyzer = analyzer
|
290
|
+
@writer.add_document(doc)
|
291
|
+
@writer.analyzer = old_analyzer
|
292
|
+
else
|
293
|
+
@writer.add_document(doc)
|
294
|
+
end
|
295
|
+
|
296
|
+
flush() if @auto_flush
|
297
|
+
end
|
298
|
+
end
|
299
|
+
alias :<< :add_document
|
300
|
+
|
301
|
+
# Run a query through the Searcher on the index. A TopDocs object is
|
302
|
+
# returned with the relevant results. The +query+ is a built in Query
|
303
|
+
# object or a query string that can be parsed by the Ferret::QueryParser.
|
304
|
+
# Here are the options;
|
305
|
+
#
|
306
|
+
# === Options
|
307
|
+
#
|
308
|
+
# offset:: Default: 0. The offset of the start of the section of the
|
309
|
+
# result-set to return. This is used for paging through
|
310
|
+
# results. Let's say you have a page size of 10. If you
|
311
|
+
# don't find the result you want among the first 10 results
|
312
|
+
# then set +:offset+ to 10 and look at the next 10 results,
|
313
|
+
# then 20 and so on.
|
314
|
+
# limit:: Default: 10. This is the number of results you want
|
315
|
+
# returned, also called the page size. Set +:limit+ to
|
316
|
+
# +:all+ to return all results
|
317
|
+
# sort:: A Sort object or sort string describing how the field
|
318
|
+
# should be sorted. A sort string is made up of field names
|
319
|
+
# which cannot contain spaces and the word "DESC" if you
|
320
|
+
# want the field reversed, all separated by commas. For
|
321
|
+
# example; "rating DESC, author, title". Note that Ferret
|
322
|
+
# will try to determine a field's type by looking at the
|
323
|
+
# first term in the index and seeing if it can be parsed as
|
324
|
+
# an integer or a float. Keep this in mind as you may need
|
325
|
+
# to specify a fields type to sort it correctly. For more
|
326
|
+
# on this, see the documentation for SortField
|
327
|
+
# filter:: a Filter object to filter the search results with
|
328
|
+
# filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
|
329
|
+
# and the Searcher object as its parameters and returns a
|
330
|
+
# Boolean value specifying whether the result should be
|
331
|
+
# included in the result set.
|
332
|
+
def search(query, options = {})
|
333
|
+
@dir.synchronize do
|
334
|
+
return do_search(query, options)
|
335
|
+
end
|
336
|
+
end
|
337
|
+
|
338
|
+
# Run a query through the Searcher on the index. A TopDocs object is
|
339
|
+
# returned with the relevant results. The +query+ is a Query object or a
|
340
|
+
# query string that can be validly parsed by the Ferret::QueryParser. The
|
341
|
+
# Searcher#search_each method yields the internal document id (used to
|
342
|
+
# reference documents in the Searcher object like this;
|
343
|
+
# +searcher[doc_id]+) and the search score for that document. It is
|
344
|
+
# possible for the score to be greater than 1.0 for some queries and
|
345
|
+
# taking boosts into account. This method will also normalize scores to
|
346
|
+
# the range 0.0..1.0 when the max-score is greater than 1.0. Here are the
|
347
|
+
# options;
|
348
|
+
#
|
349
|
+
# === Options
|
350
|
+
#
|
351
|
+
# offset:: Default: 0. The offset of the start of the section of the
|
352
|
+
# result-set to return. This is used for paging through
|
353
|
+
# results. Let's say you have a page size of 10. If you
|
354
|
+
# don't find the result you want among the first 10 results
|
355
|
+
# then set +:offset+ to 10 and look at the next 10 results,
|
356
|
+
# then 20 and so on.
|
357
|
+
# limit:: Default: 10. This is the number of results you want
|
358
|
+
# returned, also called the page size. Set +:limit+ to
|
359
|
+
# +:all+ to return all results
|
360
|
+
# sort:: A Sort object or sort string describing how the field
|
361
|
+
# should be sorted. A sort string is made up of field names
|
362
|
+
# which cannot contain spaces and the word "DESC" if you
|
363
|
+
# want the field reversed, all separated by commas. For
|
364
|
+
# example; "rating DESC, author, title". Note that Ferret
|
365
|
+
# will try to determine a field's type by looking at the
|
366
|
+
# first term in the index and seeing if it can be parsed as
|
367
|
+
# an integer or a float. Keep this in mind as you may need
|
368
|
+
# to specify a fields type to sort it correctly. For more
|
369
|
+
# on this, see the documentation for SortField
|
370
|
+
# filter:: a Filter object to filter the search results with
|
371
|
+
# filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
|
372
|
+
# and the Searcher object as its parameters and returns a
|
373
|
+
# Boolean value specifying whether the result should be
|
374
|
+
# included in the result set.
|
375
|
+
#
|
376
|
+
# returns:: The total number of hits.
|
377
|
+
#
|
378
|
+
# === Example
|
379
|
+
# eg.
|
380
|
+
# index.search_each(query, options = {}) do |doc, score|
|
381
|
+
# puts "hit document number #{doc} with a score of #{score}"
|
382
|
+
# end
|
383
|
+
#
|
384
|
+
def search_each(query, options = {}) # :yield: doc, score
|
385
|
+
@dir.synchronize do
|
386
|
+
ensure_searcher_open()
|
387
|
+
query = do_process_query(query)
|
388
|
+
|
389
|
+
@searcher.search_each(query, options) do |doc, score|
|
390
|
+
yield doc, score
|
391
|
+
end
|
392
|
+
end
|
393
|
+
end
|
394
|
+
|
395
|
+
# Run a query through the Searcher on the index, ignoring scoring and
|
396
|
+
# starting at +:start_doc+ and stopping when +:limit+ matches have been
|
397
|
+
# found. It returns an array of the matching document numbers.
|
398
|
+
#
|
399
|
+
# There is a big performance advange when using this search method on a
|
400
|
+
# very large index when there are potentially thousands of matching
|
401
|
+
# documents and you only want say 50 of them. The other search methods need
|
402
|
+
# to look at every single match to decide which one has the highest score.
|
403
|
+
# This search method just needs to find +:limit+ number of matches before
|
404
|
+
# it returns.
|
405
|
+
#
|
406
|
+
# === Options
|
407
|
+
#
|
408
|
+
# start_doc:: Default: 0. The start document to start the search from.
|
409
|
+
# NOTE very carefully that this is not the same as the
|
410
|
+
# +:offset+ parameter used in the other search methods
|
411
|
+
# which refers to the offset in the result-set. This is the
|
412
|
+
# document to start the scan from. So if you scanning
|
413
|
+
# through the index in increments of 50 documents at a time
|
414
|
+
# you need to use the last matched doc in the previous
|
415
|
+
# search to start your next search. See the example below.
|
416
|
+
# limit:: Default: 50. This is the number of results you want
|
417
|
+
# returned, also called the page size. Set +:limit+ to
|
418
|
+
# +:all+ to return all results.
|
419
|
+
# TODO: add option to return loaded documents instead
|
420
|
+
#
|
421
|
+
# === Options
|
422
|
+
#
|
423
|
+
# start_doc = 0
|
424
|
+
# begin
|
425
|
+
# results = @searcher.scan(query, :start_doc => start_doc)
|
426
|
+
# yield results # or do something with them
|
427
|
+
# start_doc = results.last
|
428
|
+
# # start_doc will be nil now if results is empty, ie no more matches
|
429
|
+
# end while start_doc
|
430
|
+
def scan(query, options = {})
|
431
|
+
@dir.synchronize do
|
432
|
+
ensure_searcher_open()
|
433
|
+
query = do_process_query(query)
|
434
|
+
|
435
|
+
@searcher.scan(query, options)
|
436
|
+
end
|
437
|
+
end
|
438
|
+
|
439
|
+
# Retrieves a document/documents from the index. The method for retrieval
|
440
|
+
# depends on the type of the argument passed.
|
441
|
+
#
|
442
|
+
# If +arg+ is an Integer then return the document based on the internal
|
443
|
+
# document number.
|
444
|
+
#
|
445
|
+
# If +arg+ is a Range, then return the documents within the range based on
|
446
|
+
# internal document number.
|
447
|
+
#
|
448
|
+
# If +arg+ is a String then search for the first document with +arg+ in
|
449
|
+
# the +id+ field. The +id+ field is either :id or whatever you set
|
450
|
+
# +:id_field+ parameter to when you create the Index object.
|
451
|
+
def doc(*arg)
|
452
|
+
@dir.synchronize do
|
453
|
+
id = arg[0]
|
454
|
+
if id.kind_of?(String) or id.kind_of?(Symbol)
|
455
|
+
ensure_reader_open()
|
456
|
+
term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
|
457
|
+
return term_doc_enum.next? ? @reader[term_doc_enum.doc] : nil
|
458
|
+
else
|
459
|
+
ensure_reader_open(false)
|
460
|
+
return @reader[*arg]
|
461
|
+
end
|
462
|
+
end
|
463
|
+
end
|
464
|
+
alias :[] :doc
|
465
|
+
|
466
|
+
# Retrieves the term_vector for a document. The document can be referenced
|
467
|
+
# by either a string id to match the id field or an integer corresponding
|
468
|
+
# to Ferret's document number.
|
469
|
+
#
|
470
|
+
# See Ferret::Index::IndexReader#term_vector
|
471
|
+
def term_vector(id, field)
|
472
|
+
@dir.synchronize do
|
473
|
+
ensure_reader_open()
|
474
|
+
if id.kind_of?(String) or id.kind_of?(Symbol)
|
475
|
+
term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
|
476
|
+
if term_doc_enum.next?
|
477
|
+
id = term_doc_enum.doc
|
478
|
+
else
|
479
|
+
return nil
|
480
|
+
end
|
481
|
+
end
|
482
|
+
return @reader.term_vector(id, field)
|
483
|
+
end
|
484
|
+
end
|
485
|
+
|
486
|
+
# iterate through all documents in the index. This method preloads the
|
487
|
+
# documents so you don't need to call #load on the document to load all the
|
488
|
+
# fields.
|
489
|
+
def each
|
490
|
+
@dir.synchronize do
|
491
|
+
ensure_reader_open
|
492
|
+
(0...@reader.max_doc).each do |i|
|
493
|
+
yield @reader[i].load unless @reader.deleted?(i)
|
494
|
+
end
|
495
|
+
end
|
496
|
+
end
|
497
|
+
|
498
|
+
# Deletes a document/documents from the index. The method for determining
# the document(s) to delete depends on the type of +arg+:
#
# Integer::        delete by internal Ferret document number. Raises an
#                  error if the document does not exist.
# String/Symbol::  delete every document whose +id+ field (either :id or
#                  whatever +:id_field+ was set to when the Index was
#                  created) holds +arg+. Fails quietly if no document
#                  matches.
# Hash/Array::     batch delete. An Array is treated as an array of ids; a
#                  Hash's keys are used as the ids. An Integer id is a
#                  Ferret document number; a String or Symbol id is a term
#                  matched against the +:id_field+.
#
# Returns self. Raises ArgumentError for any other argument type.
def delete(arg)
  @dir.synchronize do
    if arg.is_a?(String) or arg.is_a?(Symbol)
      ensure_writer_open()
      @writer.delete(@id_field, arg.to_s)
    elsif arg.is_a?(Integer)
      ensure_reader_open()
      # NOTE: the reader's deletion count used to be captured in an unused
      # local (`cnt`); it is intentionally discarded here.
      @reader.delete(arg)
    elsif arg.is_a?(Hash) or arg.is_a?(Array)
      batch_delete(arg)
    else
      raise ArgumentError, "Cannot delete for arg of type #{arg.class}"
    end
    flush() if @auto_flush
  end
  return self
end
|
534
|
+
|
535
|
+
# Delete every document matched by +query+.
#
# query:: The query selecting documents to delete. Either a string (parsed
#         by the standard query parser) or an actual Query object.
def query_delete(query)
  @dir.synchronize do
    ensure_writer_open()
    ensure_searcher_open()
    parsed = do_process_query(query)
    @searcher.search_each(parsed, :limit => :all) do |doc_num, _score|
      @reader.delete(doc_num)
    end
    flush() if @auto_flush
  end
end
|
551
|
+
|
552
|
+
# Returns true if document number +n+ has been deleted.
def deleted?(n)
  @dir.synchronize do
    ensure_reader_open()
    @reader.deleted?(n)
  end
end
|
559
|
+
|
560
|
+
# Replace the document with document number +id+ (when +id+ is an Integer)
# or every document containing the term +id+ in the +:id_field+ (when +id+
# is a String or Symbol) with +new_doc+.
# For updating a whole set of documents, see the faster #batch_update.
#
# id::      the document number, or a value stored in the +id+ field. Also
#           consider using the :key attribute.
# new_doc:: the document that replaces the old document(s).
def update(id, new_doc)
  @dir.synchronize do
    ensure_writer_open()
    delete(id)
    # A term delete goes through the writer, so commit it before adding the
    # replacement; a delete by number reopens the reader, so the writer has
    # to be reopened instead.
    if id.is_a?(String) || id.is_a?(Symbol)
      @writer.commit
    else
      ensure_writer_open()
    end
    @writer << new_doc
    flush() if @auto_flush
  end
end
|
582
|
+
|
583
|
+
# Batch-update documents in the index. You can pass either a Hash or an
# Array.
#
# === Array (recommended)
#
# Each element must be a Document or a Hash and must contain an
# +:id_field+ entry; that id is used to delete the old document this one
# replaces.
#
# === Hash
#
# The Hash keys are the ids and the values are the replacement documents.
# An Integer id is a Ferret document number and that document is deleted;
# a String or Symbol id is treated as a term and every document containing
# it in the +:id_field+ is deleted.
#
# Note: no error is raised when a document does not already exist - a new
# document is simply created.
#
# == Examples
#
#   # will replace the documents with the +id+'s id:133 and id:254
#   @index.batch_update({
#     '133' => {:id => '133', :content => 'yada yada yada'},
#     '253' => {:id => '253', :content => 'bla bla bal'}
#   })
#
#   # will replace the documents with the Ferret Document numbers 2 and 92
#   @index.batch_update({
#     2  => {:id => '133', :content => 'yada yada yada'},
#     92 => {:id => '253', :content => 'bla bla bal'}
#   })
#
#   # will replace the documents with the +id+'s id:133 and id:254
#   # this is recommended as it guarantees no duplicate keys
#   @index.batch_update([
#     {:id => '133', :content => 'yada yada yada'},
#     {:id => '253', :content => 'bla bla bal'}
#   ])
#
# docs:: A Hash of id/document pairs or an Array of documents to update.
def batch_update(docs)
  @dir.synchronize do
    ids = nil
    case docs
    when Array
      # BUGFIX: ids were collected with .to_s up front, turning a missing
      # id (nil) into "" so the nil check below could never fire. Check for
      # nil first, then stringify.
      ids = docs.collect {|doc| doc[@id_field]}
      if ids.include?(nil)
        # BUGFIX: the two message fragments were separate statements (the
        # second string literal was dead code); join them explicitly.
        raise ArgumentError, "all documents must have an #{@id_field} " +
                             "field when doing a batch update"
      end
      ids = ids.collect {|id| id.to_s}
    when Hash
      ids = docs.keys
      docs = docs.values
    else
      raise ArgumentError, "must pass Hash or Array, not #{docs.class}"
    end
    batch_delete(ids)
    ensure_writer_open()
    docs.each {|new_doc| @writer << new_doc }
    flush()
  end
end
|
648
|
+
|
649
|
+
|
650
|
+
# Update all documents matched by +query+.
#
# query::   The query selecting documents to update. Either a string
#           (parsed by the standard query parser) or an actual Query
#           object.
# new_val:: The values to apply. A Hash is merged into each matched
#           document (existing fields are overwritten by fields in the
#           hash); any other value (typically a String or Symbol) replaces
#           the default input field.
#
# === Example
#
#   index << {:id => "26", :title => "Babylon", :artist => "David Grey"}
#   index << {:id => "29", :title => "My Oh My", :artist => "David Grey"}
#
#   # correct
#   index.query_update('artist:"David Grey"', {:artist => "David Gray"})
#
#   index["26"]
#   #=> {:id => "26", :title => "Babylon", :artist => "David Gray"}
#   index["29"]
#   #=> {:id => "29", :title => "My Oh My", :artist => "David Gray"}
#
def query_update(query, new_val)
  @dir.synchronize do
    ensure_writer_open()
    ensure_searcher_open()
    docs_to_add = []
    query = do_process_query(query)
    @searcher.search_each(query, :limit => :all) do |id, score|
      document = @searcher[id].load
      if new_val.is_a?(Hash)
        document.merge!(new_val)
      else
        # BUGFIX: this branch read `else new_val.is_a?(String) or ...` - the
        # condition after `else` was dead code, evaluated and discarded.
        # Runtime behavior is preserved: any non-Hash value is coerced with
        # to_s and stored in the default input field.
        document[@default_input_field] = new_val.to_s
      end
      docs_to_add << document
      @reader.delete(id)
    end
    ensure_writer_open()
    docs_to_add.each {|doc| @writer << doc }
    flush() if @auto_flush
  end
end
|
695
|
+
|
696
|
+
# Returns true when documents have been deleted since the index was last
# flushed.
def has_deletions?()
  @dir.synchronize do
    ensure_reader_open()
    @reader.has_deletions?
  end
end
|
704
|
+
|
705
|
+
# Flush all pending writes to the index. The index is not optimized, but
# every buffered write is made durable/visible.
#
# NOTE: calling this explicitly is unnecessary if you only use this class;
# writes are flushed automatically whenever an operation reads the index.
def flush()
  @dir.synchronize do
    if @reader
      # The searcher wraps the reader, so shut it down first.
      @searcher.close unless @searcher.nil?
      @searcher = nil
      @reader.commit
    elsif @writer
      @writer.close
      @writer = nil
    end
  end
end
alias :commit :flush
|
726
|
+
|
727
|
+
# Optimize the index for reading. Only call this once the index will no
# longer be updated very often but will be read a lot.
def optimize()
  @dir.synchronize do
    ensure_writer_open()
    @writer.optimize
    @writer.close
    @writer = nil
  end
end
|
737
|
+
|
738
|
+
# Returns the number of (non-deleted) documents in the index.
def size()
  @dir.synchronize do
    ensure_reader_open()
    @reader.num_docs
  end
end
|
745
|
+
|
746
|
+
# Merge all segments from an index, or an array of indexes, into this
# index. Accepts a single Index::Index, Index::Reader, Store::Directory,
# or an array of any single one of those types.
#
# This may be used to parallelize batch indexing: break a large document
# collection into sub-collections, index each in parallel (on a different
# thread, process or machine, perhaps entirely in memory), then merge the
# sub-collection indexes here. The index is optimized when this completes.
def add_indexes(indexes)
  @dir.synchronize do
    ensure_writer_open()
    indexes = [indexes].flatten     # normalize the argument to an array
    return if indexes.empty?        # nothing to merge
    case indexes.first
    when Index
      indexes.delete(self)          # never merge with self
      indexes = indexes.map {|index| index.reader }
    when Ferret::Store::Directory
      indexes.delete(@dir)          # never merge with self
      indexes = indexes.map {|dir| IndexReader.new(dir) }
    when IndexReader
      indexes.delete(@reader)       # never merge with self
    else
      raise ArgumentError, "Unknown index type when trying to merge indexes"
    end
    ensure_writer_open
    @writer.add_readers(indexes)
  end
end
|
777
|
+
|
778
|
+
# Simple utility for saving an in-memory (RAM) index to the file system.
# The same thing can be achieved with Index::Index#add_indexes (with more
# options over the new index), but this is the straightforward way to turn
# a RAM index into a file-system index.
#
# directory:: A Store::Directory object, or a String path to the directory
#             where the index should be stored.
# create::    When true (the default) the directory is created, or an
#             existing directory is copied over. Pass false to merge with
#             an existing directory instead.
def persist(directory, create = true)
  synchronize do
    close_all()
    old_dir = @dir
    case directory
    when String
      @dir = FSDirectory.new(directory, create)
    when Ferret::Store::Directory
      @dir = directory
    end
    @dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin
    @options[:dir] = @dir
    @options[:create_if_missing] = true
    add_indexes([old_dir])
  end
end
|
807
|
+
|
808
|
+
# Render every live document in the index, one per line.
def to_s
  buf = ""
  size.times do |i|
    buf << self[i].to_s + "\n" unless deleted?(i)
  end
  buf
end
|
815
|
+
|
816
|
+
# Returns an Explanation describing how +doc+ scored against +query+.
#
# Intended for developing Similarity implementations. Computing an
# explanation is as expensive as executing the query over the entire
# index, so for good performance do not display one with every hit.
def explain(query, doc)
  @dir.synchronize do
    ensure_searcher_open()
    @searcher.explain(do_process_query(query), doc)
  end
end
|
831
|
+
|
832
|
+
# Turn a query string into a Query object using the Index's QueryParser.
def process_query(query)
  @dir.synchronize do
    ensure_searcher_open()
    do_process_query(query)
  end
end
|
839
|
+
|
840
|
+
# Returns the writer's field_infos object so that new fields can be added
# to the index.
def field_infos
  @dir.synchronize do
    ensure_writer_open()
    @writer.field_infos
  end
end
|
848
|
+
|
849
|
+
|
850
|
+
protected
|
851
|
+
# Make sure @writer is open, first tearing down any open reader/searcher
# (reader and writer cannot be open at the same time).
def ensure_writer_open()
  raise "tried to use a closed index" unless @open
  return if @writer
  if @reader
    @searcher.close if @searcher
    @reader.close
    @reader = nil
    @searcher = nil
  end
  @writer = IndexWriter.new(@options)
end
|
862
|
+
|
863
|
+
# Make sure @reader is open and (when +get_latest+ is true, the default)
# up to date with the most recent index commit. Closes any open writer,
# since reader and writer cannot coexist.
#
# Returns the newly opened IndexReader when one is (re)opened, otherwise
# false. Callers (e.g. ensure_searcher_open) rely on a truthy return to
# know the searcher must be rebuilt around the new reader.
def ensure_reader_open(get_latest = true)
  raise "tried to use a closed index" if not @open
  if @reader
    if get_latest
      latest = false
      begin
        latest = @reader.latest?
      rescue Lock::LockError => le
        sleep(@options[:lock_retry_time]) # sleep for 2 seconds and try again
        latest = @reader.latest?
      end
      if not latest
        # the reader is stale: drop the searcher and reopen from @dir
        @searcher.close if @searcher
        @reader.close
        return @reader = IndexReader.new(@dir)
      end
    end
  else
    # no reader yet; the writer must be closed before a reader can open
    if @writer
      @writer.close
      @writer = nil
    end
    return @reader = IndexReader.new(@dir)
  end
  return false
end
|
890
|
+
|
891
|
+
# Make sure @searcher is open and wraps the current reader. A fresh
# searcher is built whenever the reader was (re)opened or none exists yet.
def ensure_searcher_open()
  raise "tried to use a closed index" unless @open
  reader_reopened = ensure_reader_open()
  @searcher = Searcher.new(@reader) if reader_reopened or not @searcher
end
|
897
|
+
|
898
|
+
private
|
899
|
+
# Parse +query+ with the index's QueryParser when it is a String; any
# other object is assumed to already be a Query and is passed through
# untouched.
def do_process_query(query)
  return query unless query.is_a?(String)
  @qp ||= Ferret::QueryParser.new(@options)
  # refresh the field lists on every parse, in case new fields were added
  unless options[:all_fields] || options[:fields]
    @qp.fields = @reader.fields
  end
  unless options[:tokenized_fields]
    @qp.tokenized_fields = @reader.tokenized_fields
  end
  @qp.parse(query)
end
|
913
|
+
|
914
|
+
# Run +query+ (string or Query object) through the searcher with the given
# search +options+.
def do_search(query, options)
  ensure_searcher_open()
  @searcher.search(do_process_query(query), options)
end
|
920
|
+
|
921
|
+
# Close the searcher, reader and writer (in that order) and clear all
# three references.
def close_all()
  @dir.synchronize do
    [@searcher, @reader, @writer].each {|component| component.close if component }
    @searcher = @reader = @writer = nil
  end
end
|
931
|
+
|
932
|
+
# Perform a batch delete. +docs+ is an Array of ids, or a Hash whose keys
# are used as the Array of ids. An Integer id is considered a Ferret
# document number and that document is deleted. A String or Symbol id is
# considered a term and every document containing that term in the
# +:id_field+ is deleted.
#
# docs:: An Array of ids to be deleted, or a Hash (in which case the keys
#        are used).
#
# Returns self. Raises ArgumentError for unsupported id element types.
def batch_delete(docs)
  docs = docs.keys if docs.is_a?(Hash)
  raise ArgumentError, "must pass Array or Hash" unless docs.is_a? Array
  ids = []
  terms = []
  docs.each do |doc|
    case doc
    when String then terms << doc
    when Symbol then terms << doc.to_s
    when Integer then ids << doc
    else
      # BUGFIX: this message referenced an undefined local `id`, raising
      # NameError instead of the intended ArgumentError; report the
      # offending element's class.
      raise ArgumentError, "Cannot delete for arg of type #{doc.class}"
    end
  end
  if ids.size > 0
    ensure_reader_open
    ids.each {|id| @reader.delete(id)}
  end
  if terms.size > 0
    ensure_writer_open()
    @writer.delete(@id_field, terms)
  end
  return self
end
|
967
|
+
|
968
|
+
end
|
969
|
+
end
|
970
|
+
|
971
|
+
# Convenience shorthand: Ferret::I aliases Ferret::Index::Index so users
# can write Ferret::I.new(...) when creating an index.
module Ferret
  I = Index::Index
end
|