xapian-fu 0.2 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +152 -13
- data/examples/query.rb +34 -6
- data/examples/spider.rb +44 -15
- data/lib/xapian_fu/query_parser.rb +179 -0
- data/lib/xapian_fu/result_set.rb +52 -0
- data/lib/xapian_fu/stopper_factory.rb +40 -0
- data/lib/xapian_fu/stopwords/README +7 -0
- data/lib/xapian_fu/stopwords/danish.txt +102 -0
- data/lib/xapian_fu/stopwords/dutch.txt +113 -0
- data/lib/xapian_fu/stopwords/english.txt +312 -0
- data/lib/xapian_fu/stopwords/finnish.txt +89 -0
- data/lib/xapian_fu/stopwords/french.txt +168 -0
- data/lib/xapian_fu/stopwords/german.txt +286 -0
- data/lib/xapian_fu/stopwords/hungarian.txt +203 -0
- data/lib/xapian_fu/stopwords/italian.txt +295 -0
- data/lib/xapian_fu/stopwords/norwegian.txt +186 -0
- data/lib/xapian_fu/stopwords/portuguese.txt +245 -0
- data/lib/xapian_fu/stopwords/russian.txt +236 -0
- data/lib/xapian_fu/stopwords/spanish.txt +348 -0
- data/lib/xapian_fu/stopwords/swedish.txt +125 -0
- data/lib/xapian_fu/stopwords/update.rb +7 -0
- data/lib/xapian_fu/xapian_db.rb +215 -99
- data/lib/xapian_fu/xapian_doc.rb +229 -47
- data/lib/xapian_fu/xapian_doc_value_accessor.rb +125 -0
- data/lib/xapian_fu/xapian_documents_accessor.rb +82 -0
- data/lib/xapian_fu.rb +1 -0
- data/spec/query_parser_spec.rb +43 -0
- data/spec/stopper_factory_spec.rb +57 -0
- data/spec/xapian_db_spec.rb +458 -215
- data/spec/xapian_doc_spec.rb +180 -0
- data/spec/xapian_doc_value_accessor_spec.rb +92 -0
- metadata +29 -5
data/lib/xapian_fu/xapian_db.rb
CHANGED
@@ -1,116 +1,232 @@
|
|
1
|
-
module XapianFu
|
1
|
+
module XapianFu #:nodoc:
|
2
|
+
# Generic Xapian Fu exception class
|
2
3
|
class XapianFuError < StandardError ; end
|
3
4
|
|
4
5
|
require 'xapian'
|
5
6
|
require 'xapian_doc'
|
7
|
+
require 'stopper_factory'
|
8
|
+
require 'query_parser'
|
9
|
+
require 'result_set'
|
10
|
+
require 'xapian_documents_accessor'
|
6
11
|
require 'thread'
|
7
12
|
|
13
|
+
# Raised when two operations are attempted concurrently when it is
|
14
|
+
# not possible
|
8
15
|
class ConcurrencyError < XapianFuError ; end
|
16
|
+
|
17
|
+
# Raised when a document is requested by id that doesn't exist in
|
18
|
+
# the database
|
9
19
|
class DocNotFound < XapianFuError ; end
|
10
20
|
|
11
|
-
|
12
|
-
|
13
|
-
|
21
|
+
# The XapianFu::XapianDb encapsulates a Xapian database, handling
|
22
|
+
# setting up stemmers, stoppers, query parsers and such. This is
|
23
|
+
# the core of XapianFu.
|
24
|
+
#
|
25
|
+
# == Opening and creating the database
|
26
|
+
#
|
27
|
+
# The <tt>:dir</tt> option specifies where the xapian database is to
|
28
|
+
# be read from and written to. Without this, an in-memory Xapian
|
29
|
+
# database will be used. By default, the on-disk database will not
|
30
|
+
# be created if it doesn't already exist. See the <tt>:create</tt>
|
31
|
+
# option.
|
32
|
+
#
|
33
|
+
# Setting the <tt>:create</tt> option to <tt>true</tt> will allow
|
34
|
+
# XapianDb to create a new Xapian database on-disk. If one already
|
35
|
+
# exists, it is just opened. The default is <tt>false</tt>.
|
36
|
+
#
|
37
|
+
# Setting the <tt>:overwrite</tt> option to <tt>true</tt> will force
|
38
|
+
# XapianDb to wipe the current on-disk database and start afresh.
|
39
|
+
# The default is <tt>false</tt>.
|
40
|
+
#
|
41
|
+
# db = XapianDb.new(:dir => '/tmp/mydb', :create => true)
|
42
|
+
#
|
43
|
+
# == Language, Stemmers and Stoppers
|
44
|
+
#
|
45
|
+
# The <tt>:language</tt> option specifies the default document
|
46
|
+
# language, and controls the default type of stemmer and stopper
|
47
|
+
# that will be used when indexing. The stemmer and stopper can be
|
48
|
+
# overridden with the <tt>:stemmer</tt> and <tt>:stopper</tt> options.
|
49
|
+
#
|
50
|
+
# The <tt>:language, :stemmer and :stopper</tt> options can be set
|
51
|
+
# to one of the following: <tt>:danish, :dutch, :english,
|
52
|
+
# :finnish, :french, :german, :hungarian, :italian, :norwegian,
|
53
|
+
# :portuguese, :romanian, :russian, :spanish, :swedish,
|
54
|
+
# :turkish</tt>. Set it to <tt>false</tt> to specify none.
|
55
|
+
#
|
56
|
+
# The default for all is <tt>:english</tt>.
|
57
|
+
#
|
58
|
+
# db = XapianDb.new(:language => :italian, :stopper => false)
|
59
|
+
#
|
60
|
+
# == Fields and values
|
61
|
+
#
|
62
|
+
# The <tt>:store</tt> option specifies which document fields should
|
63
|
+
# be stored in the database. By default, fields are only indexed -
|
64
|
+
# the original values cannot be retrieved.
|
65
|
+
#
|
66
|
+
# The <tt>:sortable</tt> option specifies which document fields will
|
67
|
+
# be available for sorting results on. This really just does the
|
68
|
+
# same thing as <tt>:store</tt> and is just available to be explicit.
|
69
|
+
#
|
70
|
+
# The <tt>:collapsible</tt> option specifies which document fields
|
71
|
+
# can be used to group ("collapse") results. This also just does
|
72
|
+
# the same thing as <tt>:store</tt> and is just available to be explicit.
|
73
|
+
#
|
74
|
+
# A more complete way of defining fields is available:
|
75
|
+
#
|
76
|
+
# XapianDb.new(:fields => { :title => { :type => String },
|
77
|
+
# :slug => { :type => String, :index => false },
|
78
|
+
# :created_at => { :type => Time, :store => true },
|
79
|
+
# :votes => { :type => Fixnum, :store => true },
|
80
|
+
# })
|
81
|
+
#
|
82
|
+
# XapianFu will use the :type option when instantiating a store
|
83
|
+
# value, so you'll get back a Time object rather than the result of
|
84
|
+
# Time's to_s method as is the default. Defining the type for
|
85
|
+
# numerical classes (such as Time, Fixnum and Bignum) allows
|
86
|
+
# XapianFu to store them on-disk in a much more efficient way,
|
87
|
+
# and sort them efficiently (without having to resort to storing
|
88
|
+
# leading zeros or anything like that).
|
89
|
+
#
|
90
|
+
class XapianDb # :nonew:
|
91
|
+
# Path to the on-disk database. Nil if in-memory database
|
92
|
+
attr_reader :dir
|
93
|
+
attr_reader :db_flag #:nodoc:
|
94
|
+
# An array of the fields that will be stored in the Xapian database
|
95
|
+
attr_reader :store_values
|
96
|
+
# True if term positions will be stored
|
97
|
+
attr_reader :index_positions
|
98
|
+
# The default document language. Used for setting up stoppers and stemmers.
|
99
|
+
attr_reader :language
|
100
|
+
# A hash of field names and their types
|
101
|
+
attr_reader :fields
|
102
|
+
# An array of fields that will not be indexed
|
103
|
+
attr_reader :unindexed_fields
|
14
104
|
|
15
105
|
def initialize( options = { } )
|
16
|
-
@
|
106
|
+
@options = { :index_positions => true }.merge(options)
|
107
|
+
@dir = @options[:dir]
|
108
|
+
@index_positions = @options[:index_positions]
|
17
109
|
@db_flag = Xapian::DB_OPEN
|
18
|
-
@db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
|
19
|
-
@db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
|
20
|
-
@store_fields = Array.new(1, options[:store]).compact
|
21
|
-
@store_values = Array.new(1, options[:sortable]).compact
|
22
|
-
@store_values += Array.new(1, options[:collapsible]).compact
|
23
|
-
rw.flush if options[:create]
|
110
|
+
@db_flag = Xapian::DB_CREATE_OR_OPEN if @options[:create]
|
111
|
+
@db_flag = Xapian::DB_CREATE_OR_OVERWRITE if @options[:overwrite]
|
24
112
|
@tx_mutex = Mutex.new
|
113
|
+
@language = @options.fetch(:language, :english)
|
114
|
+
@stemmer = @options.fetch(:stemmer, @language)
|
115
|
+
@stopper = @options.fetch(:stopper, @language)
|
116
|
+
setup_fields(@options[:fields])
|
117
|
+
@store_values << @options[:store]
|
118
|
+
@store_values << @options[:sortable]
|
119
|
+
@store_values << @options[:collapsible]
|
120
|
+
@store_values = @store_values.flatten.uniq.compact
|
121
|
+
end
|
122
|
+
|
123
|
+
# Return a new stemmer object for this database
|
124
|
+
def stemmer
|
125
|
+
StemFactory.stemmer_for(@stemmer)
|
126
|
+
end
|
127
|
+
|
128
|
+
# The stopper object for this database
|
129
|
+
def stopper
|
130
|
+
StopperFactory.stopper_for(@stopper)
|
25
131
|
end
|
26
132
|
|
27
|
-
#
|
133
|
+
# The writable Xapian database
|
28
134
|
def rw
|
29
135
|
@rw ||= setup_rw_db
|
30
136
|
end
|
31
137
|
|
32
|
-
#
|
138
|
+
# The read-only Xapian database
|
33
139
|
def ro
|
34
140
|
@ro ||= setup_ro_db
|
35
141
|
end
|
36
142
|
|
37
|
-
#
|
143
|
+
# The number of docs in the Xapian database
|
38
144
|
def size
|
39
145
|
ro.doccount
|
40
146
|
end
|
41
147
|
|
42
|
-
#
|
148
|
+
# The XapianFu::XapianDocumentsAccessor for this database
|
43
149
|
def documents
|
44
150
|
@documents_accessor ||= XapianDocumentsAccessor.new(self)
|
45
151
|
end
|
46
152
|
|
47
|
-
#
|
48
|
-
# keys representing field names and their values the data to be
|
49
|
-
# indexed. Or it can be a XapianDoc, or any object with a to_s method.
|
50
|
-
#
|
51
|
-
# If the document object reponds to the method :data, whatever it
|
52
|
-
# returns is marshalled and stored in the Xapian database. Any
|
53
|
-
# arbitrary data up to Xmeg can be stored here.
|
54
|
-
#
|
55
|
-
# Currently, all fields are stored in the database. This will
|
56
|
-
# change to store only those fields requested to be stored.
|
153
|
+
# Short-cut to documents.add
|
57
154
|
def add_doc(doc)
|
58
|
-
|
59
|
-
doc.db = self
|
60
|
-
xdoc = doc.to_xapian_document
|
61
|
-
tg = Xapian::TermGenerator.new
|
62
|
-
tg.database = rw
|
63
|
-
tg.document = xdoc
|
64
|
-
tg.index_text( doc.text )
|
65
|
-
if doc.id
|
66
|
-
rw.replace_document(doc.id, xdoc)
|
67
|
-
else
|
68
|
-
doc.id = rw.add_document(xdoc)
|
69
|
-
end
|
70
|
-
doc
|
155
|
+
documents.add(doc)
|
71
156
|
end
|
72
157
|
alias_method "<<", :add_doc
|
73
158
|
|
74
|
-
# Conduct a search on the Xapian database, returning an array of
|
75
|
-
# XapianDoc objects for the matches
|
159
|
+
# Conduct a search on the Xapian database, returning an array of
|
160
|
+
# XapianDoc objects for the matches.
|
161
|
+
#
|
162
|
+
# The <tt>:limit</tt> option sets how many results to return. For
|
163
|
+
# compatibility with the <tt>will_paginate</tt> plugin, the
|
164
|
+
# <tt>:per_page</tt> option does the same thing (though overrides
|
165
|
+
# <tt>:limit</tt>). Defaults to 10.
|
166
|
+
#
|
167
|
+
# The <tt>:page</tt> option sets which page of results to return.
|
168
|
+
# Defaults to 1.
|
169
|
+
#
|
170
|
+
# The <tt>:order</tt> option specifies the stored field to order
|
171
|
+
# the results by (instead of the default search result weight).
|
172
|
+
#
|
173
|
+
# The <tt>:reverse</tt> option reverses the order of the results,
|
174
|
+
# so lowest search weight first (or lowest stored field value
|
175
|
+
# first).
|
176
|
+
#
|
177
|
+
# The <tt>:collapse</tt> option specifies which stored field value
|
178
|
+
# to collapse (group) the results on. Works a bit like the
|
179
|
+
# SQL <tt>GROUP BY</tt> behaviour
|
180
|
+
#
|
181
|
+
# For additional options on how the query is parsed, see
|
182
|
+
# XapianFu::QueryParser
|
183
|
+
|
76
184
|
def search(q, options = {})
|
77
|
-
defaults = { :page => 1, :
|
185
|
+
defaults = { :page => 1, :reverse => false,
|
186
|
+
:boolean => true, :boolean_anycase => true, :wildcards => true,
|
187
|
+
:lovehate => true, :spelling => true, :pure_not => false }
|
78
188
|
options = defaults.merge(options)
|
79
189
|
page = options[:page].to_i rescue 1
|
80
190
|
page = page > 1 ? page - 1 : 0
|
81
|
-
per_page = options[:per_page]
|
191
|
+
per_page = options[:per_page] || options[:limit] || 10
|
192
|
+
per_page = per_page.to_i rescue 10
|
82
193
|
offset = page * per_page
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
194
|
+
qp = XapianFu::QueryParser.new({ :database => self }.merge(options))
|
195
|
+
query = qp.parse_query(q.to_s)
|
196
|
+
enquiry = Xapian::Enquire.new(ro)
|
197
|
+
setup_ordering(enquiry, options[:order], options[:reverse])
|
87
198
|
if options[:collapse]
|
88
199
|
enquiry.collapse_key = options[:collapse].to_s.hash
|
89
200
|
end
|
90
201
|
enquiry.query = query
|
91
|
-
enquiry.mset(offset, per_page)
|
202
|
+
ResultSet.new(:mset => enquiry.mset(offset, per_page), :current_page => page + 1,
|
203
|
+
:per_page => per_page, :corrected_query => qp.corrected_query)
|
92
204
|
end
|
93
205
|
|
94
|
-
# Run the given block in a XapianDB transaction. Any changes to the
|
206
|
+
# Run the given block in a XapianDB transaction. Any changes to the
|
95
207
|
# Xapian database made in the block will be atomically committed at the end.
|
96
|
-
#
|
208
|
+
#
|
97
209
|
# If an exception is raised by the block, all changes are discarded and the
|
98
210
|
# exception re-raised.
|
99
|
-
#
|
211
|
+
#
|
100
212
|
# Xapian does not support multiple concurrent transactions on the
|
101
213
|
# same Xapian database. Any attempts at this will be serialized by
|
102
214
|
# XapianFu, which is not perfect but probably better than just
|
103
215
|
# kicking up an exception.
|
104
216
|
#
|
105
|
-
def transaction
|
217
|
+
def transaction(flush_on_commit = true)
|
106
218
|
@tx_mutex.synchronize do
|
107
|
-
|
108
|
-
|
219
|
+
begin
|
220
|
+
rw.begin_transaction(flush_on_commit)
|
221
|
+
yield
|
222
|
+
rescue Exception => e
|
223
|
+
rw.cancel_transaction
|
224
|
+
ro.reopen
|
225
|
+
raise e
|
226
|
+
end
|
109
227
|
rw.commit_transaction
|
228
|
+
ro.reopen
|
110
229
|
end
|
111
|
-
rescue Exception => e
|
112
|
-
rw.cancel_transaction
|
113
|
-
raise e
|
114
230
|
end
|
115
231
|
|
116
232
|
# Flush any changes to disk and reopen the read-only database.
|
@@ -121,29 +237,21 @@ module XapianFu
|
|
121
237
|
ro.reopen
|
122
238
|
end
|
123
239
|
|
124
|
-
def query_parser
|
125
|
-
unless @query_parser
|
126
|
-
@query_parser = Xapian::QueryParser.new
|
127
|
-
@query_parser.database = ro
|
128
|
-
end
|
129
|
-
@query_parser
|
130
|
-
end
|
131
|
-
|
132
|
-
def enquiry
|
133
|
-
@enquiry ||= Xapian::Enquire.new(ro)
|
134
|
-
end
|
135
|
-
|
136
240
|
private
|
137
241
|
|
242
|
+
# Setup the writable database
|
138
243
|
def setup_rw_db
|
139
244
|
if dir
|
140
245
|
@rw = Xapian::WritableDatabase.new(dir, db_flag)
|
246
|
+
@rw.flush if @options[:create]
|
247
|
+
@rw
|
141
248
|
else
|
142
249
|
# In memory database
|
143
250
|
@rw = Xapian::inmemory_open
|
144
251
|
end
|
145
252
|
end
|
146
253
|
|
254
|
+
# Setup the read-only database
|
147
255
|
def setup_ro_db
|
148
256
|
if dir
|
149
257
|
@ro = Xapian::Database.new(dir)
|
@@ -153,41 +261,49 @@ module XapianFu
|
|
153
261
|
end
|
154
262
|
end
|
155
263
|
|
156
|
-
#
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
xdoc = @xdb.ro.document(doc_id)
|
167
|
-
XapianDoc.new(xdoc)
|
168
|
-
rescue RuntimeError => e
|
169
|
-
raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
|
264
|
+
# Setup ordering for the given Xapian::Enquire objects
|
265
|
+
def setup_ordering(enquiry, order = nil, reverse = true)
|
266
|
+
if order.to_s == "id"
|
267
|
+
# Sorting by a value that doesn't exist falls back to docid ordering
|
268
|
+
enquiry.sort_by_value!((1 << 32)-1, reverse)
|
269
|
+
enquiry.docid_order = reverse ? Xapian::Enquire::DESCENDING : Xapian::Enquire::ASCENDING
|
270
|
+
elsif order.is_a? String or order.is_a? Symbol
|
271
|
+
enquiry.sort_by_value!(order.to_s.hash, reverse)
|
272
|
+
else
|
273
|
+
enquiry.sort_by_relevance!
|
170
274
|
end
|
275
|
+
enquiry
|
276
|
+
end
|
171
277
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
278
|
+
# Set up the fields hash and store_values list from the given options
|
279
|
+
def setup_fields(field_options)
|
280
|
+
@fields = { }
|
281
|
+
@unindexed_fields = []
|
282
|
+
@store_values = []
|
283
|
+
return nil if field_options.nil?
|
284
|
+
default_opts = {
|
285
|
+
:store => true,
|
286
|
+
:index => true,
|
287
|
+
:type => String
|
288
|
+
}
|
289
|
+
# Convert array argument to hash, with String as default type
|
290
|
+
if field_options.is_a? Array
|
291
|
+
fohash = { }
|
292
|
+
field_options.each { |f| fohash[f] = { :type => String } }
|
293
|
+
field_options = fohash
|
178
294
|
end
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
if
|
184
|
-
|
185
|
-
|
186
|
-
end
|
187
|
-
rescue RuntimeError => e
|
188
|
-
raise e unless e.to_s =~ /^DocNotFoundError/
|
295
|
+
field_options.each do |name,opts|
|
296
|
+
# Handle simple setup by type only
|
297
|
+
opts = { :type => opts } unless opts.is_a? Hash
|
298
|
+
opts = default_opts.merge(opts)
|
299
|
+
@store_values << name if opts[:store]
|
300
|
+
@unindexed_fields << name if opts[:index] == false
|
301
|
+
@fields[name] = opts[:type]
|
189
302
|
end
|
303
|
+
@fields
|
190
304
|
end
|
305
|
+
|
191
306
|
end
|
192
|
-
|
307
|
+
|
193
308
|
end
|
309
|
+
|