xapian-fu 0.2 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +152 -13
- data/examples/query.rb +34 -6
- data/examples/spider.rb +44 -15
- data/lib/xapian_fu/query_parser.rb +179 -0
- data/lib/xapian_fu/result_set.rb +52 -0
- data/lib/xapian_fu/stopper_factory.rb +40 -0
- data/lib/xapian_fu/stopwords/README +7 -0
- data/lib/xapian_fu/stopwords/danish.txt +102 -0
- data/lib/xapian_fu/stopwords/dutch.txt +113 -0
- data/lib/xapian_fu/stopwords/english.txt +312 -0
- data/lib/xapian_fu/stopwords/finnish.txt +89 -0
- data/lib/xapian_fu/stopwords/french.txt +168 -0
- data/lib/xapian_fu/stopwords/german.txt +286 -0
- data/lib/xapian_fu/stopwords/hungarian.txt +203 -0
- data/lib/xapian_fu/stopwords/italian.txt +295 -0
- data/lib/xapian_fu/stopwords/norwegian.txt +186 -0
- data/lib/xapian_fu/stopwords/portuguese.txt +245 -0
- data/lib/xapian_fu/stopwords/russian.txt +236 -0
- data/lib/xapian_fu/stopwords/spanish.txt +348 -0
- data/lib/xapian_fu/stopwords/swedish.txt +125 -0
- data/lib/xapian_fu/stopwords/update.rb +7 -0
- data/lib/xapian_fu/xapian_db.rb +215 -99
- data/lib/xapian_fu/xapian_doc.rb +229 -47
- data/lib/xapian_fu/xapian_doc_value_accessor.rb +125 -0
- data/lib/xapian_fu/xapian_documents_accessor.rb +82 -0
- data/lib/xapian_fu.rb +1 -0
- data/spec/query_parser_spec.rb +43 -0
- data/spec/stopper_factory_spec.rb +57 -0
- data/spec/xapian_db_spec.rb +458 -215
- data/spec/xapian_doc_spec.rb +180 -0
- data/spec/xapian_doc_value_accessor_spec.rb +92 -0
- metadata +29 -5
data/lib/xapian_fu/xapian_db.rb
CHANGED
@@ -1,116 +1,232 @@
|
|
1
|
-
module XapianFu
|
1
|
+
module XapianFu #:nodoc:
|
2
|
+
# Generic Xapian Fu exception class
|
2
3
|
class XapianFuError < StandardError ; end
|
3
4
|
|
4
5
|
require 'xapian'
|
5
6
|
require 'xapian_doc'
|
7
|
+
require 'stopper_factory'
|
8
|
+
require 'query_parser'
|
9
|
+
require 'result_set'
|
10
|
+
require 'xapian_documents_accessor'
|
6
11
|
require 'thread'
|
7
12
|
|
13
|
+
# Raised when two operations are attempted concurrently when it is
|
14
|
+
# not possible
|
8
15
|
class ConcurrencyError < XapianFuError ; end
|
16
|
+
|
17
|
+
# Raised when a document is requested by id that doesn't exist in
|
18
|
+
# the database
|
9
19
|
class DocNotFound < XapianFuError ; end
|
10
20
|
|
11
|
-
|
12
|
-
|
13
|
-
|
21
|
+
# The XapianFu::XapianDb encapsulates a Xapian database, handling
|
22
|
+
# setting up stemmers, stoppers, query parsers and such. This is
|
23
|
+
# the core of XapianFu.
|
24
|
+
#
|
25
|
+
# == Opening and creating the database
|
26
|
+
#
|
27
|
+
# The <tt>:dir</tt> option specifies where the Xapian database is to
|
28
|
+
# be read from and written to. Without this, an in-memory Xapian
|
29
|
+
# database will be used. By default, the on-disk database will not
|
30
|
+
# be created if it doesn't already exist. See the <tt>:create</tt>
|
31
|
+
# option.
|
32
|
+
#
|
33
|
+
# Setting the <tt>:create</tt> option to <tt>true</tt> will allow
|
34
|
+
# XapianDb to create a new Xapian database on-disk. If one already
|
35
|
+
# exists, it is just opened. The default is <tt>false</tt>.
|
36
|
+
#
|
37
|
+
# Setting the <tt>:overwrite</tt> option to <tt>true</tt> will force
|
38
|
+
# XapianDb to wipe the current on-disk database and start afresh.
|
39
|
+
# The default is <tt>false</tt>.
|
40
|
+
#
|
41
|
+
# db = XapianDb.new(:dir => '/tmp/mydb', :create => true)
|
42
|
+
#
|
43
|
+
# == Language, Stemmers and Stoppers
|
44
|
+
#
|
45
|
+
# The <tt>:language</tt> option specifies the default document
|
46
|
+
# language, and controls the default type of stemmer and stopper
|
47
|
+
# that will be used when indexing. The stemmer and stopper can be
|
48
|
+
# overridden with the <tt>:stemmer</tt> and <tt>:stopper</tt> options.
|
49
|
+
#
|
50
|
+
# The <tt>:language, :stemmer and :stopper</tt> options can be set
|
51
|
+
# to one of the following: <tt>:danish, :dutch, :english,
|
52
|
+
# :finnish, :french, :german, :hungarian, :italian, :norwegian,
|
53
|
+
# :portuguese, :romanian, :russian, :spanish, :swedish,
|
54
|
+
# :turkish</tt>. Set it to <tt>false</tt> to specify none.
|
55
|
+
#
|
56
|
+
# The default for all is <tt>:english</tt>.
|
57
|
+
#
|
58
|
+
# db = XapianDb.new(:language => :italian, :stopper => false)
|
59
|
+
#
|
60
|
+
# == Fields and values
|
61
|
+
#
|
62
|
+
# The <tt>:store</tt> option specifies which document fields should
|
63
|
+
# be stored in the database. By default, fields are only indexed -
|
64
|
+
# the original values cannot be retrieved.
|
65
|
+
#
|
66
|
+
# The <tt>:sortable</tt> option specifies which document fields will
|
67
|
+
# be available for sorting results on. This really just does the
|
68
|
+
# same thing as <tt>:store</tt> and is just available to be explicit.
|
69
|
+
#
|
70
|
+
# The <tt>:collapsible</tt> option specifies which document fields
|
71
|
+
# can be used to group ("collapse") results. This also just does
|
72
|
+
# the same thing as <tt>:store</tt> and is just available to be explicit.
|
73
|
+
#
|
74
|
+
# A more complete way of defining fields is available:
|
75
|
+
#
|
76
|
+
# XapianDb.new(:fields => { :title => { :type => String },
|
77
|
+
# :slug => { :type => String, :index => false },
|
78
|
+
# :created_at => { :type => Time, :store => true },
|
79
|
+
# :votes => { :type => Fixnum, :store => true },
|
80
|
+
# })
|
81
|
+
#
|
82
|
+
# XapianFu will use the :type option when instantiating a stored
|
83
|
+
# value, so you'll get back a Time object rather than the result of
|
84
|
+
# Time's to_s method as is the default. Defining the type for
|
85
|
+
# numerical classes (such as Time, Fixnum and Bignum) allows
|
86
|
+
# XapianFu to store them on-disk in a much more efficient way,
|
87
|
+
# and sort them efficiently (without having to resort to storing
|
88
|
+
# leading zeros or anything like that).
|
89
|
+
#
|
90
|
+
class XapianDb # :nonew:
|
91
|
+
# Path to the on-disk database. Nil if in-memory database
|
92
|
+
attr_reader :dir
|
93
|
+
attr_reader :db_flag #:nodoc:
|
94
|
+
# An array of the fields that will be stored in the Xapian database
|
95
|
+
attr_reader :store_values
|
96
|
+
# True if term positions will be stored
|
97
|
+
attr_reader :index_positions
|
98
|
+
# The default document language. Used for setting up stoppers and stemmers.
|
99
|
+
attr_reader :language
|
100
|
+
# A hash of field names and their types
|
101
|
+
attr_reader :fields
|
102
|
+
# An array of fields that will not be indexed
|
103
|
+
attr_reader :unindexed_fields
|
14
104
|
|
15
105
|
def initialize( options = { } )
|
16
|
-
@
|
106
|
+
@options = { :index_positions => true }.merge(options)
|
107
|
+
@dir = @options[:dir]
|
108
|
+
@index_positions = @options[:index_positions]
|
17
109
|
@db_flag = Xapian::DB_OPEN
|
18
|
-
@db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
|
19
|
-
@db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
|
20
|
-
@store_fields = Array.new(1, options[:store]).compact
|
21
|
-
@store_values = Array.new(1, options[:sortable]).compact
|
22
|
-
@store_values += Array.new(1, options[:collapsible]).compact
|
23
|
-
rw.flush if options[:create]
|
110
|
+
@db_flag = Xapian::DB_CREATE_OR_OPEN if @options[:create]
|
111
|
+
@db_flag = Xapian::DB_CREATE_OR_OVERWRITE if @options[:overwrite]
|
24
112
|
@tx_mutex = Mutex.new
|
113
|
+
@language = @options.fetch(:language, :english)
|
114
|
+
@stemmer = @options.fetch(:stemmer, @language)
|
115
|
+
@stopper = @options.fetch(:stopper, @language)
|
116
|
+
setup_fields(@options[:fields])
|
117
|
+
@store_values << @options[:store]
|
118
|
+
@store_values << @options[:sortable]
|
119
|
+
@store_values << @options[:collapsible]
|
120
|
+
@store_values = @store_values.flatten.uniq.compact
|
121
|
+
end
|
122
|
+
|
123
|
+
# Return a new stemmer object for this database
|
124
|
+
def stemmer
|
125
|
+
StemFactory.stemmer_for(@stemmer)
|
126
|
+
end
|
127
|
+
|
128
|
+
# The stopper object for this database
|
129
|
+
def stopper
|
130
|
+
StopperFactory.stopper_for(@stopper)
|
25
131
|
end
|
26
132
|
|
27
|
-
#
|
133
|
+
# The writable Xapian database
|
28
134
|
def rw
|
29
135
|
@rw ||= setup_rw_db
|
30
136
|
end
|
31
137
|
|
32
|
-
#
|
138
|
+
# The read-only Xapian database
|
33
139
|
def ro
|
34
140
|
@ro ||= setup_ro_db
|
35
141
|
end
|
36
142
|
|
37
|
-
#
|
143
|
+
# The number of docs in the Xapian database
|
38
144
|
def size
|
39
145
|
ro.doccount
|
40
146
|
end
|
41
147
|
|
42
|
-
#
|
148
|
+
# The XapianFu::XapianDocumentsAccessor for this database
|
43
149
|
def documents
|
44
150
|
@documents_accessor ||= XapianDocumentsAccessor.new(self)
|
45
151
|
end
|
46
152
|
|
47
|
-
#
|
48
|
-
# keys representing field names and their values the data to be
|
49
|
-
# indexed. Or it can be a XapianDoc, or any object with a to_s method.
|
50
|
-
#
|
51
|
-
# If the document object reponds to the method :data, whatever it
|
52
|
-
# returns is marshalled and stored in the Xapian database. Any
|
53
|
-
# arbitrary data up to Xmeg can be stored here.
|
54
|
-
#
|
55
|
-
# Currently, all fields are stored in the database. This will
|
56
|
-
# change to store only those fields requested to be stored.
|
153
|
+
# Short-cut to documents.add
|
57
154
|
def add_doc(doc)
|
58
|
-
|
59
|
-
doc.db = self
|
60
|
-
xdoc = doc.to_xapian_document
|
61
|
-
tg = Xapian::TermGenerator.new
|
62
|
-
tg.database = rw
|
63
|
-
tg.document = xdoc
|
64
|
-
tg.index_text( doc.text )
|
65
|
-
if doc.id
|
66
|
-
rw.replace_document(doc.id, xdoc)
|
67
|
-
else
|
68
|
-
doc.id = rw.add_document(xdoc)
|
69
|
-
end
|
70
|
-
doc
|
155
|
+
documents.add(doc)
|
71
156
|
end
|
72
157
|
alias_method "<<", :add_doc
|
73
158
|
|
74
|
-
# Conduct a search on the Xapian database, returning an array of
|
75
|
-
# XapianDoc objects for the matches
|
159
|
+
# Conduct a search on the Xapian database, returning an array of
|
160
|
+
# XapianDoc objects for the matches.
|
161
|
+
#
|
162
|
+
# The <tt>:limit</tt> option sets how many results to return. For
|
163
|
+
# compatibility with the <tt>will_paginate</tt> plugin, the
|
164
|
+
# <tt>:per_page</tt> option does the same thing (though overrides
|
165
|
+
# <tt>:limit</tt>). Defaults to 10.
|
166
|
+
#
|
167
|
+
# The <tt>:page</tt> option sets which page of results to return.
|
168
|
+
# Defaults to 1.
|
169
|
+
#
|
170
|
+
# The <tt>:order</tt> option specifies the stored field to order
|
171
|
+
# the results by (instead of the default search result weight).
|
172
|
+
#
|
173
|
+
# The <tt>:reverse</tt> option reverses the order of the results,
|
174
|
+
# so lowest search weight first (or lowest stored field value
|
175
|
+
# first).
|
176
|
+
#
|
177
|
+
# The <tt>:collapse</tt> option specifies which stored field value
|
178
|
+
# to collapse (group) the results on. Works a bit like the
|
179
|
+
# SQL <tt>GROUP BY</tt> behaviour
|
180
|
+
#
|
181
|
+
# For additional options on how the query is parsed, see
|
182
|
+
# XapianFu::QueryParser
|
183
|
+
|
76
184
|
def search(q, options = {})
|
77
|
-
defaults = { :page => 1, :
|
185
|
+
defaults = { :page => 1, :reverse => false,
|
186
|
+
:boolean => true, :boolean_anycase => true, :wildcards => true,
|
187
|
+
:lovehate => true, :spelling => true, :pure_not => false }
|
78
188
|
options = defaults.merge(options)
|
79
189
|
page = options[:page].to_i rescue 1
|
80
190
|
page = page > 1 ? page - 1 : 0
|
81
|
-
per_page = options[:per_page]
|
191
|
+
per_page = options[:per_page] || options[:limit] || 10
|
192
|
+
per_page = per_page.to_i rescue 10
|
82
193
|
offset = page * per_page
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
194
|
+
qp = XapianFu::QueryParser.new({ :database => self }.merge(options))
|
195
|
+
query = qp.parse_query(q.to_s)
|
196
|
+
enquiry = Xapian::Enquire.new(ro)
|
197
|
+
setup_ordering(enquiry, options[:order], options[:reverse])
|
87
198
|
if options[:collapse]
|
88
199
|
enquiry.collapse_key = options[:collapse].to_s.hash
|
89
200
|
end
|
90
201
|
enquiry.query = query
|
91
|
-
enquiry.mset(offset, per_page)
|
202
|
+
ResultSet.new(:mset => enquiry.mset(offset, per_page), :current_page => page + 1,
|
203
|
+
:per_page => per_page, :corrected_query => qp.corrected_query)
|
92
204
|
end
|
93
205
|
|
94
|
-
# Run the given block in a XapianDB transaction. Any changes to the
|
206
|
+
# Run the given block in a XapianDB transaction. Any changes to the
|
95
207
|
# Xapian database made in the block will be atomically committed at the end.
|
96
|
-
#
|
208
|
+
#
|
97
209
|
# If an exception is raised by the block, all changes are discarded and the
|
98
210
|
# exception re-raised.
|
99
|
-
#
|
211
|
+
#
|
100
212
|
# Xapian does not support multiple concurrent transactions on the
|
101
213
|
# same Xapian database. Any attempts at this will be serialized by
|
102
214
|
# XapianFu, which is not perfect but probably better than just
|
103
215
|
# kicking up an exception.
|
104
216
|
#
|
105
|
-
def transaction
|
217
|
+
def transaction(flush_on_commit = true)
|
106
218
|
@tx_mutex.synchronize do
|
107
|
-
|
108
|
-
|
219
|
+
begin
|
220
|
+
rw.begin_transaction(flush_on_commit)
|
221
|
+
yield
|
222
|
+
rescue Exception => e
|
223
|
+
rw.cancel_transaction
|
224
|
+
ro.reopen
|
225
|
+
raise e
|
226
|
+
end
|
109
227
|
rw.commit_transaction
|
228
|
+
ro.reopen
|
110
229
|
end
|
111
|
-
rescue Exception => e
|
112
|
-
rw.cancel_transaction
|
113
|
-
raise e
|
114
230
|
end
|
115
231
|
|
116
232
|
# Flush any changes to disk and reopen the read-only database.
|
@@ -121,29 +237,21 @@ module XapianFu
|
|
121
237
|
ro.reopen
|
122
238
|
end
|
123
239
|
|
124
|
-
def query_parser
|
125
|
-
unless @query_parser
|
126
|
-
@query_parser = Xapian::QueryParser.new
|
127
|
-
@query_parser.database = ro
|
128
|
-
end
|
129
|
-
@query_parser
|
130
|
-
end
|
131
|
-
|
132
|
-
def enquiry
|
133
|
-
@enquiry ||= Xapian::Enquire.new(ro)
|
134
|
-
end
|
135
|
-
|
136
240
|
private
|
137
241
|
|
242
|
+
# Setup the writable database
|
138
243
|
def setup_rw_db
|
139
244
|
if dir
|
140
245
|
@rw = Xapian::WritableDatabase.new(dir, db_flag)
|
246
|
+
@rw.flush if @options[:create]
|
247
|
+
@rw
|
141
248
|
else
|
142
249
|
# In memory database
|
143
250
|
@rw = Xapian::inmemory_open
|
144
251
|
end
|
145
252
|
end
|
146
253
|
|
254
|
+
# Setup the read-only database
|
147
255
|
def setup_ro_db
|
148
256
|
if dir
|
149
257
|
@ro = Xapian::Database.new(dir)
|
@@ -153,41 +261,49 @@ module XapianFu
|
|
153
261
|
end
|
154
262
|
end
|
155
263
|
|
156
|
-
#
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
xdoc = @xdb.ro.document(doc_id)
|
167
|
-
XapianDoc.new(xdoc)
|
168
|
-
rescue RuntimeError => e
|
169
|
-
raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
|
264
|
+
# Setup ordering for the given Xapian::Enquire object
|
265
|
+
def setup_ordering(enquiry, order = nil, reverse = true)
|
266
|
+
if order.to_s == "id"
|
267
|
+
# Sorting by a value that doesn't exist falls back to docid ordering
|
268
|
+
enquiry.sort_by_value!((1 << 32)-1, reverse)
|
269
|
+
enquiry.docid_order = reverse ? Xapian::Enquire::DESCENDING : Xapian::Enquire::ASCENDING
|
270
|
+
elsif order.is_a? String or order.is_a? Symbol
|
271
|
+
enquiry.sort_by_value!(order.to_s.hash, reverse)
|
272
|
+
else
|
273
|
+
enquiry.sort_by_relevance!
|
170
274
|
end
|
275
|
+
enquiry
|
276
|
+
end
|
171
277
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
278
|
+
# Setup the fields hash and store_values list from the given options
|
279
|
+
def setup_fields(field_options)
|
280
|
+
@fields = { }
|
281
|
+
@unindexed_fields = []
|
282
|
+
@store_values = []
|
283
|
+
return nil if field_options.nil?
|
284
|
+
default_opts = {
|
285
|
+
:store => true,
|
286
|
+
:index => true,
|
287
|
+
:type => String
|
288
|
+
}
|
289
|
+
# Convert array argument to hash, with String as default type
|
290
|
+
if field_options.is_a? Array
|
291
|
+
fohash = { }
|
292
|
+
field_options.each { |f| fohash[f] = { :type => String } }
|
293
|
+
field_options = fohash
|
178
294
|
end
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
if
|
184
|
-
|
185
|
-
|
186
|
-
end
|
187
|
-
rescue RuntimeError => e
|
188
|
-
raise e unless e.to_s =~ /^DocNotFoundError/
|
295
|
+
field_options.each do |name,opts|
|
296
|
+
# Handle simple setup by type only
|
297
|
+
opts = { :type => opts } unless opts.is_a? Hash
|
298
|
+
opts = default_opts.merge(opts)
|
299
|
+
@store_values << name if opts[:store]
|
300
|
+
@unindexed_fields << name if opts[:index] == false
|
301
|
+
@fields[name] = opts[:type]
|
189
302
|
end
|
303
|
+
@fields
|
190
304
|
end
|
305
|
+
|
191
306
|
end
|
192
|
-
|
307
|
+
|
193
308
|
end
|
309
|
+
|