xapian-fu 0.2 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,116 +1,232 @@
1
- module XapianFu
1
+ module XapianFu #:nodoc:
2
+ # Generic Xapian Fu exception class
2
3
  class XapianFuError < StandardError ; end
3
4
 
4
5
  require 'xapian'
5
6
  require 'xapian_doc'
7
+ require 'stopper_factory'
8
+ require 'query_parser'
9
+ require 'result_set'
10
+ require 'xapian_documents_accessor'
6
11
  require 'thread'
7
12
 
13
+ # Raised when two operations are attempted concurrently when it is
14
+ # not possible
8
15
  class ConcurrencyError < XapianFuError ; end
16
+
17
+ # Raised when a document is requested by id that doesn't exist in
18
+ # the database
9
19
  class DocNotFound < XapianFuError ; end
10
20
 
11
- class XapianDb
12
- attr_reader :dir, :db_flag, :query_parser
13
- attr_reader :store_fields, :store_values
21
+ # The XapianFu::XapianDb encapsulates a Xapian database, handling
22
+ # setting up stemmers, stoppers, query parsers and such. This is
23
+ # the core of XapianFu.
24
+ #
25
+ # == Opening and creating the database
26
+ #
27
+ # The <tt>:dir</tt> option specifies where the xapian database is to
28
+ # be read from and written to. Without this, an in-memory Xapian
29
+ # database will be used. By default, the on-disk database will not
30
+ # be created if it doesn't already exist. See the <tt>:create</tt>
31
+ # option.
32
+ #
33
+ # Setting the <tt>:create</tt> option to <tt>true</tt> will allow
34
+ # XapianDb to create a new Xapian database on-disk. If one already
35
+ # exists, it is just opened. The default is <tt>false</tt>.
36
+ #
37
+ # Setting the <tt>:overwrite</tt> option to <tt>true</tt> will force
38
+ # XapianDb to wipe the current on-disk database and start afresh.
39
+ # The default is <tt>false</tt>.
40
+ #
41
+ # db = XapianDb.new(:dir => '/tmp/mydb', :create => true)
42
+ #
43
+ # == Language, Stemmers and Stoppers
44
+ #
45
+ # The <tt>:language</tt> option specifies the default document
46
+ # language, and controls the default type of stemmer and stopper
47
+ # that will be used when indexing. The stemmer and stopper can be
48
+ # overridden with the <tt>:stemmer</tt> and <tt>:stopper</tt> options.
49
+ #
50
+ # The <tt>:language, :stemmer and :stopper</tt> options can be set
51
+ # to one of the following: <tt>:danish, :dutch, :english,
52
+ # :finnish, :french, :german, :hungarian, :italian, :norwegian,
53
+ # :portuguese, :romanian, :russian, :spanish, :swedish,
54
+ # :turkish</tt>. Set it to <tt>false</tt> to specify none.
55
+ #
56
+ # The default for all is <tt>:english</tt>.
57
+ #
58
+ # db = XapianDb.new(:language => :italian, :stopper => false)
59
+ #
60
+ # == Fields and values
61
+ #
62
+ # The <tt>:store</tt> option specifies which document fields should
63
+ # be stored in the database. By default, fields are only indexed -
64
+ # the original values cannot be retrieved.
65
+ #
66
+ # The <tt>:sortable</tt> option specifies which document fields will
67
+ # be available for sorting results on. This really just does the
68
+ # same thing as <tt>:store</tt> and is just available to be explicit.
69
+ #
70
+ # The <tt>:collapsible</tt> option specifies which document fields
71
+ # can be used to group ("collapse") results. This also just does
72
+ # the same thing as <tt>:store</tt> and is just available to be explicit.
73
+ #
74
+ # A more complete way of defining fields is available:
75
+ #
76
+ # XapianDb.new(:fields => { :title => { :type => String },
77
+ # :slug => { :type => String, :index => false },
78
+ # :created_at => { :type => Time, :store => true },
79
+ # :votes => { :type => Fixnum, :store => true },
80
+ # })
81
+ #
82
+ # XapianFu will use the :type option when instantiating a stored
83
+ # value, so you'll get back a Time object rather than the result of
84
+ # Time's to_s method as is the default. Defining the type for
85
+ # numerical classes (such as Time, Fixnum and Bignum) allows
86
+ # XapianFu to store them on-disk in a much more efficient way,
87
+ # and sort them efficiently (without having to resort to storing
88
+ # leading zeros or anything like that).
89
+ #
90
+ class XapianDb # :nonew:
91
+ # Path to the on-disk database. Nil if in-memory database
92
+ attr_reader :dir
93
+ attr_reader :db_flag #:nodoc:
94
+ # An array of the fields that will be stored in the Xapian database
95
+ attr_reader :store_values
96
+ # True if term positions will be stored
97
+ attr_reader :index_positions
98
+ # The default document language. Used for setting up stoppers and stemmers.
99
+ attr_reader :language
100
+ # A hash of field names and their types
101
+ attr_reader :fields
102
+ # An array of fields that will not be indexed
103
+ attr_reader :unindexed_fields
14
104
 
15
105
  def initialize( options = { } )
16
- @dir = options[:dir]
106
+ @options = { :index_positions => true }.merge(options)
107
+ @dir = @options[:dir]
108
+ @index_positions = @options[:index_positions]
17
109
  @db_flag = Xapian::DB_OPEN
18
- @db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
19
- @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
20
- @store_fields = Array.new(1, options[:store]).compact
21
- @store_values = Array.new(1, options[:sortable]).compact
22
- @store_values += Array.new(1, options[:collapsible]).compact
23
- rw.flush if options[:create]
110
+ @db_flag = Xapian::DB_CREATE_OR_OPEN if @options[:create]
111
+ @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if @options[:overwrite]
24
112
  @tx_mutex = Mutex.new
113
+ @language = @options.fetch(:language, :english)
114
+ @stemmer = @options.fetch(:stemmer, @language)
115
+ @stopper = @options.fetch(:stopper, @language)
116
+ setup_fields(@options[:fields])
117
+ @store_values << @options[:store]
118
+ @store_values << @options[:sortable]
119
+ @store_values << @options[:collapsible]
120
+ @store_values = @store_values.flatten.uniq.compact
121
+ end
122
+
123
+ # Return a new stemmer object for this database
124
+ def stemmer
125
+ StemFactory.stemmer_for(@stemmer)
126
+ end
127
+
128
+ # The stopper object for this database
129
+ def stopper
130
+ StopperFactory.stopper_for(@stopper)
25
131
  end
26
132
 
27
- # Return the writable Xapian database
133
+ # The writable Xapian database
28
134
  def rw
29
135
  @rw ||= setup_rw_db
30
136
  end
31
137
 
32
- # Return the read-only Xapian database
138
+ # The read-only Xapian database
33
139
  def ro
34
140
  @ro ||= setup_ro_db
35
141
  end
36
142
 
37
- # Return the number of docs in the Xapian database
143
+ # The number of docs in the Xapian database
38
144
  def size
39
145
  ro.doccount
40
146
  end
41
147
 
42
- # Return the XapianDocumentsAccessor for this database
148
+ # The XapianFu::XapianDocumentsAccessor for this database
43
149
  def documents
44
150
  @documents_accessor ||= XapianDocumentsAccessor.new(self)
45
151
  end
46
152
 
47
- # Add a document to the index. A document can be just a hash, the
48
- # keys representing field names and their values the data to be
49
- # indexed. Or it can be a XapianDoc, or any object with a to_s method.
50
- #
51
- # If the document object reponds to the method :data, whatever it
52
- # returns is marshalled and stored in the Xapian database. Any
53
- # arbitrary data up to Xmeg can be stored here.
54
- #
55
- # Currently, all fields are stored in the database. This will
56
- # change to store only those fields requested to be stored.
153
+ # Short-cut to documents.add
57
154
  def add_doc(doc)
58
- doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
59
- doc.db = self
60
- xdoc = doc.to_xapian_document
61
- tg = Xapian::TermGenerator.new
62
- tg.database = rw
63
- tg.document = xdoc
64
- tg.index_text( doc.text )
65
- if doc.id
66
- rw.replace_document(doc.id, xdoc)
67
- else
68
- doc.id = rw.add_document(xdoc)
69
- end
70
- doc
155
+ documents.add(doc)
71
156
  end
72
157
  alias_method "<<", :add_doc
73
158
 
74
- # Conduct a search on the Xapian database, returning an array of
75
- # XapianDoc objects for the matches
159
+ # Conduct a search on the Xapian database, returning an array of
160
+ # XapianDoc objects for the matches.
161
+ #
162
+ # The <tt>:limit</tt> option sets how many results to return. For
163
+ # compatibility with the <tt>will_paginate</tt> plugin, the
164
+ # <tt>:per_page</tt> option does the same thing (though overrides
165
+ # <tt>:limit</tt>). Defaults to 10.
166
+ #
167
+ # The <tt>:page</tt> option sets which page of results to return.
168
+ # Defaults to 1.
169
+ #
170
+ # The <tt>:order</tt> option specifies the stored field to order
171
+ # the results by (instead of the default search result weight).
172
+ #
173
+ # The <tt>:reverse</tt> option reverses the order of the results,
174
+ # so lowest search weight first (or lowest stored field value
175
+ # first).
176
+ #
177
+ # The <tt>:collapse</tt> option specifies which stored field value
178
+ # to collapse (group) the results on. Works a bit like the
179
+ # SQL <tt>GROUP BY</tt> behaviour
180
+ #
181
+ # For additional options on how the query is parsed, see
182
+ # XapianFu::QueryParser
183
+
76
184
  def search(q, options = {})
77
- defaults = { :page => 1, :per_page => 10, :reverse => false }
185
+ defaults = { :page => 1, :reverse => false,
186
+ :boolean => true, :boolean_anycase => true, :wildcards => true,
187
+ :lovehate => true, :spelling => true, :pure_not => false }
78
188
  options = defaults.merge(options)
79
189
  page = options[:page].to_i rescue 1
80
190
  page = page > 1 ? page - 1 : 0
81
- per_page = options[:per_page].to_i rescue 10
191
+ per_page = options[:per_page] || options[:limit] || 10
192
+ per_page = per_page.to_i rescue 10
82
193
  offset = page * per_page
83
- query = query_parser.parse_query(q, Xapian::QueryParser::FLAG_WILDCARD && Xapian::QueryParser::FLAG_LOVEHATE)
84
- if options[:order]
85
- enquiry.sort_by_value!(options[:order].to_s.hash, options[:reverse])
86
- end
194
+ qp = XapianFu::QueryParser.new({ :database => self }.merge(options))
195
+ query = qp.parse_query(q.to_s)
196
+ enquiry = Xapian::Enquire.new(ro)
197
+ setup_ordering(enquiry, options[:order], options[:reverse])
87
198
  if options[:collapse]
88
199
  enquiry.collapse_key = options[:collapse].to_s.hash
89
200
  end
90
201
  enquiry.query = query
91
- enquiry.mset(offset, per_page).matches.collect { |m| XapianDoc.new(m) }
202
+ ResultSet.new(:mset => enquiry.mset(offset, per_page), :current_page => page + 1,
203
+ :per_page => per_page, :corrected_query => qp.corrected_query)
92
204
  end
93
205
 
94
- # Run the given block in a XapianDB transaction. Any changes to the
206
+ # Run the given block in a XapianDB transaction. Any changes to the
95
207
  # Xapian database made in the block will be atomically committed at the end.
96
- #
208
+ #
97
209
  # If an exception is raised by the block, all changes are discarded and the
98
210
  # exception re-raised.
99
- #
211
+ #
100
212
  # Xapian does not support multiple concurrent transactions on the
101
213
  # same Xapian database. Any attempts at this will be serialized by
102
214
  # XapianFu, which is not perfect but probably better than just
103
215
  # kicking up an exception.
104
216
  #
105
- def transaction
217
+ def transaction(flush_on_commit = true)
106
218
  @tx_mutex.synchronize do
107
- rw.begin_transaction
108
- yield
219
+ begin
220
+ rw.begin_transaction(flush_on_commit)
221
+ yield
222
+ rescue Exception => e
223
+ rw.cancel_transaction
224
+ ro.reopen
225
+ raise e
226
+ end
109
227
  rw.commit_transaction
228
+ ro.reopen
110
229
  end
111
- rescue Exception => e
112
- rw.cancel_transaction
113
- raise e
114
230
  end
115
231
 
116
232
  # Flush any changes to disk and reopen the read-only database.
@@ -121,29 +237,21 @@ module XapianFu
121
237
  ro.reopen
122
238
  end
123
239
 
124
- def query_parser
125
- unless @query_parser
126
- @query_parser = Xapian::QueryParser.new
127
- @query_parser.database = ro
128
- end
129
- @query_parser
130
- end
131
-
132
- def enquiry
133
- @enquiry ||= Xapian::Enquire.new(ro)
134
- end
135
-
136
240
  private
137
241
 
242
+ # Setup the writable database
138
243
  def setup_rw_db
139
244
  if dir
140
245
  @rw = Xapian::WritableDatabase.new(dir, db_flag)
246
+ @rw.flush if @options[:create]
247
+ @rw
141
248
  else
142
249
  # In memory database
143
250
  @rw = Xapian::inmemory_open
144
251
  end
145
252
  end
146
253
 
254
+ # Setup the read-only database
147
255
  def setup_ro_db
148
256
  if dir
149
257
  @ro = Xapian::Database.new(dir)
@@ -153,41 +261,49 @@ module XapianFu
153
261
  end
154
262
  end
155
263
 
156
- #
157
- class XapianDocumentsAccessor
158
- def initialize(xdb)
159
- @xdb = xdb
160
- end
161
-
162
- # Return the document with the given id from the
163
- # database. Raises a XapianFu::DocNotFoundError exception
164
- # if it doesn't exist.
165
- def find(doc_id)
166
- xdoc = @xdb.ro.document(doc_id)
167
- XapianDoc.new(xdoc)
168
- rescue RuntimeError => e
169
- raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
264
+ # Setup ordering for the given Xapian::Enquire object
265
+ def setup_ordering(enquiry, order = nil, reverse = true)
266
+ if order.to_s == "id"
267
+ # Sorting by a value that doesn't exist falls back to docid ordering
268
+ enquiry.sort_by_value!((1 << 32)-1, reverse)
269
+ enquiry.docid_order = reverse ? Xapian::Enquire::DESCENDING : Xapian::Enquire::ASCENDING
270
+ elsif order.is_a? String or order.is_a? Symbol
271
+ enquiry.sort_by_value!(order.to_s.hash, reverse)
272
+ else
273
+ enquiry.sort_by_relevance!
170
274
  end
275
+ enquiry
276
+ end
171
277
 
172
- # Return the document with the given id from the database or nil
173
- # if it doesn't exist
174
- def [](doc_id)
175
- find(doc_id)
176
- rescue XapianFu::DocNotFound
177
- nil
278
+ # Setup the fields hash and store_values list from the given options
279
+ def setup_fields(field_options)
280
+ @fields = { }
281
+ @unindexed_fields = []
282
+ @store_values = []
283
+ return nil if field_options.nil?
284
+ default_opts = {
285
+ :store => true,
286
+ :index => true,
287
+ :type => String
288
+ }
289
+ # Convert array argument to hash, with String as default type
290
+ if field_options.is_a? Array
291
+ fohash = { }
292
+ field_options.each { |f| fohash[f] = { :type => String } }
293
+ field_options = fohash
178
294
  end
179
-
180
- # Delete the given document from the database and return the
181
- # document id, or nil if it doesn't exist
182
- def delete(doc)
183
- if doc.respond_to?(:to_i)
184
- @xdb.rw.delete_document(doc.to_i)
185
- doc.to_i
186
- end
187
- rescue RuntimeError => e
188
- raise e unless e.to_s =~ /^DocNotFoundError/
295
+ field_options.each do |name,opts|
296
+ # Handle simple setup by type only
297
+ opts = { :type => opts } unless opts.is_a? Hash
298
+ opts = default_opts.merge(opts)
299
+ @store_values << name if opts[:store]
300
+ @unindexed_fields << name if opts[:index] == false
301
+ @fields[name] = opts[:type]
189
302
  end
303
+ @fields
190
304
  end
305
+
191
306
  end
192
-
307
+
193
308
  end
309
+