xapian-fu 0.2 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,116 +1,232 @@
1
- module XapianFu
1
+ module XapianFu #:nodoc:
2
+ # Generic Xapian Fu exception class
2
3
  class XapianFuError < StandardError ; end
3
4
 
4
5
  require 'xapian'
5
6
  require 'xapian_doc'
7
+ require 'stopper_factory'
8
+ require 'query_parser'
9
+ require 'result_set'
10
+ require 'xapian_documents_accessor'
6
11
  require 'thread'
7
12
 
13
+ # Raised when two operations are attempted concurrently when it is
14
+ # not possible
8
15
  class ConcurrencyError < XapianFuError ; end
16
+
17
+ # Raised when a document is requested by id that doesn't exist in
18
+ # the database
9
19
  class DocNotFound < XapianFuError ; end
10
20
 
11
- class XapianDb
12
- attr_reader :dir, :db_flag, :query_parser
13
- attr_reader :store_fields, :store_values
21
+ # The XapianFu::XapianDb encapsulates a Xapian database, handling
22
+ # setting up stemmers, stoppers, query parsers and such. This is
23
+ # the core of XapianFu.
24
+ #
25
+ # == Opening and creating the database
26
+ #
27
+ # The <tt>:dir</tt> option specifies where the xapian database is to
28
+ # be read from and written to. Without this, an in-memory Xapian
29
+ # database will be used. By default, the on-disk database will not
30
+ # be created if it doesn't already exist. See the <tt>:create</tt>
31
+ # option.
32
+ #
33
+ # Setting the <tt>:create</tt> option to <tt>true</tt> will allow
34
+ # XapianDb to create a new Xapian database on-disk. If one already
35
+ # exists, it is just opened. The default is <tt>false</tt>.
36
+ #
37
+ # Setting the <tt>:overwrite</tt> option to <tt>true</tt> will force
38
+ # XapianDb to wipe the current on-disk database and start afresh.
39
+ # The default is <tt>false</tt>.
40
+ #
41
+ # db = XapianDb.new(:dir => '/tmp/mydb', :create => true)
42
+ #
43
+ # == Language, Stemmers and Stoppers
44
+ #
45
+ # The <tt>:language</tt> option specifies the default document
46
+ # language, and controls the default type of stemmer and stopper
47
+ # that will be used when indexing. The stemmer and stopper can be
48
+ # overridden with the <tt>:stemmer</tt> and <tt>:stopper</tt> options.
49
+ #
50
+ # The <tt>:language, :stemmer and :stopper</tt> options can be set
51
+ # to one of the following: <tt>:danish, :dutch, :english,
52
+ # :finnish, :french, :german, :hungarian, :italian, :norwegian,
53
+ # :portuguese, :romanian, :russian, :spanish, :swedish,
54
+ # :turkish</tt>. Set it to <tt>false</tt> to specify none.
55
+ #
56
+ # The default for all is <tt>:english</tt>.
57
+ #
58
+ # db = XapianDb.new(:language => :italian, :stopper => false)
59
+ #
60
+ # == Fields and values
61
+ #
62
+ # The <tt>:store</tt> option specifies which document fields should
63
+ # be stored in the database. By default, fields are only indexed -
64
+ # the original values cannot be retrieved.
65
+ #
66
+ # The <tt>:sortable</tt> option specifies which document fields will
67
+ # be available for sorting results on. This really just does the
68
+ # same thing as <tt>:store</tt> and is just available to be explicit.
69
+ #
70
+ # The <tt>:collapsible</tt> option specifies which document fields
71
+ # can be used to group ("collapse") results. This also just does
72
+ # the same thing as <tt>:store</tt> and is just available to be explicit.
73
+ #
74
+ # A more complete way of defining fields is available:
75
+ #
76
+ # XapianDb.new(:fields => { :title => { :type => String },
77
+ # :slug => { :type => String, :index => false },
78
+ # :created_at => { :type => Time, :store => true },
79
+ # :votes => { :type => Fixnum, :store => true },
80
+ # })
81
+ #
82
+ # XapianFu will use the :type option when instantiating a store
83
+ # value, so you'll get back a Time object rather than the result of
84
+ # Time's to_s method as is the default. Defining the type for
85
+ # numerical classes (such as Time, Fixnum and Bignum) allows
86
+ # XapianFu to store them on-disk in a much more efficient way,
87
+ # and sort them efficiently (without having to resort to storing
88
+ # leading zeros or anything like that).
89
+ #
90
+ class XapianDb # :nonew:
91
+ # Path to the on-disk database. Nil if in-memory database
92
+ attr_reader :dir
93
+ attr_reader :db_flag #:nodoc:
94
+ # An array of the fields that will be stored in the Xapian database
95
+ attr_reader :store_values
96
+ # True if term positions will be stored
97
+ attr_reader :index_positions
98
+ # The default document language. Used for setting up stoppers and stemmers.
99
+ attr_reader :language
100
+ # A hash of field names and their types
101
+ attr_reader :fields
102
+ # An array of fields that will not be indexed
103
+ attr_reader :unindexed_fields
14
104
 
15
105
  def initialize( options = { } )
16
- @dir = options[:dir]
106
+ @options = { :index_positions => true }.merge(options)
107
+ @dir = @options[:dir]
108
+ @index_positions = @options[:index_positions]
17
109
  @db_flag = Xapian::DB_OPEN
18
- @db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
19
- @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
20
- @store_fields = Array.new(1, options[:store]).compact
21
- @store_values = Array.new(1, options[:sortable]).compact
22
- @store_values += Array.new(1, options[:collapsible]).compact
23
- rw.flush if options[:create]
110
+ @db_flag = Xapian::DB_CREATE_OR_OPEN if @options[:create]
111
+ @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if @options[:overwrite]
24
112
  @tx_mutex = Mutex.new
113
+ @language = @options.fetch(:language, :english)
114
+ @stemmer = @options.fetch(:stemmer, @language)
115
+ @stopper = @options.fetch(:stopper, @language)
116
+ setup_fields(@options[:fields])
117
+ @store_values << @options[:store]
118
+ @store_values << @options[:sortable]
119
+ @store_values << @options[:collapsible]
120
+ @store_values = @store_values.flatten.uniq.compact
121
+ end
122
+
123
+ # Return a new stemmer object for this database
124
+ def stemmer
125
+ StemFactory.stemmer_for(@stemmer)
126
+ end
127
+
128
+ # The stopper object for this database
129
+ def stopper
130
+ StopperFactory.stopper_for(@stopper)
25
131
  end
26
132
 
27
- # Return the writable Xapian database
133
+ # The writable Xapian database
28
134
  def rw
29
135
  @rw ||= setup_rw_db
30
136
  end
31
137
 
32
- # Return the read-only Xapian database
138
+ # The read-only Xapian database
33
139
  def ro
34
140
  @ro ||= setup_ro_db
35
141
  end
36
142
 
37
- # Return the number of docs in the Xapian database
143
+ # The number of docs in the Xapian database
38
144
  def size
39
145
  ro.doccount
40
146
  end
41
147
 
42
- # Return the XapianDocumentsAccessor for this database
148
+ # The XapianFu::XapianDocumentsAccessor for this database
43
149
  def documents
44
150
  @documents_accessor ||= XapianDocumentsAccessor.new(self)
45
151
  end
46
152
 
47
- # Add a document to the index. A document can be just a hash, the
48
- # keys representing field names and their values the data to be
49
- # indexed. Or it can be a XapianDoc, or any object with a to_s method.
50
- #
51
- # If the document object reponds to the method :data, whatever it
52
- # returns is marshalled and stored in the Xapian database. Any
53
- # arbitrary data up to Xmeg can be stored here.
54
- #
55
- # Currently, all fields are stored in the database. This will
56
- # change to store only those fields requested to be stored.
153
+ # Short-cut to documents.add
57
154
  def add_doc(doc)
58
- doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
59
- doc.db = self
60
- xdoc = doc.to_xapian_document
61
- tg = Xapian::TermGenerator.new
62
- tg.database = rw
63
- tg.document = xdoc
64
- tg.index_text( doc.text )
65
- if doc.id
66
- rw.replace_document(doc.id, xdoc)
67
- else
68
- doc.id = rw.add_document(xdoc)
69
- end
70
- doc
155
+ documents.add(doc)
71
156
  end
72
157
  alias_method "<<", :add_doc
73
158
 
74
- # Conduct a search on the Xapian database, returning an array of
75
- # XapianDoc objects for the matches
159
+ # Conduct a search on the Xapian database, returning a ResultSet
160
+ # of the matching documents.
161
+ #
162
+ # The <tt>:limit</tt> option sets how many results to return. For
163
+ # compatibility with the <tt>will_paginate</tt> plugin, the
164
+ # <tt>:per_page</tt> option does the same thing (though overrides
165
+ # <tt>:limit</tt>). Defaults to 10.
166
+ #
167
+ # The <tt>:page</tt> option sets which page of results to return.
168
+ # Defaults to 1.
169
+ #
170
+ # The <tt>:order</tt> option specifies the stored field to order
171
+ # the results by (instead of the default search result weight).
172
+ #
173
+ # The <tt>:reverse</tt> option reverses the order of the results,
174
+ # so lowest search weight first (or lowest stored field value
175
+ # first).
176
+ #
177
+ # The <tt>:collapse</tt> option specifies which stored field value
178
+ # to collapse (group) the results on. Works a bit like the
179
+ # SQL <tt>GROUP BY</tt> behaviour
180
+ #
181
+ # For additional options on how the query is parsed, see
182
+ # XapianFu::QueryParser
183
+
76
184
  def search(q, options = {})
77
- defaults = { :page => 1, :per_page => 10, :reverse => false }
185
+ defaults = { :page => 1, :reverse => false,
186
+ :boolean => true, :boolean_anycase => true, :wildcards => true,
187
+ :lovehate => true, :spelling => true, :pure_not => false }
78
188
  options = defaults.merge(options)
79
189
  page = options[:page].to_i rescue 1
80
190
  page = page > 1 ? page - 1 : 0
81
- per_page = options[:per_page].to_i rescue 10
191
+ per_page = options[:per_page] || options[:limit] || 10
192
+ per_page = per_page.to_i rescue 10
82
193
  offset = page * per_page
83
- query = query_parser.parse_query(q, Xapian::QueryParser::FLAG_WILDCARD && Xapian::QueryParser::FLAG_LOVEHATE)
84
- if options[:order]
85
- enquiry.sort_by_value!(options[:order].to_s.hash, options[:reverse])
86
- end
194
+ qp = XapianFu::QueryParser.new({ :database => self }.merge(options))
195
+ query = qp.parse_query(q.to_s)
196
+ enquiry = Xapian::Enquire.new(ro)
197
+ setup_ordering(enquiry, options[:order], options[:reverse])
87
198
  if options[:collapse]
88
199
  enquiry.collapse_key = options[:collapse].to_s.hash
89
200
  end
90
201
  enquiry.query = query
91
- enquiry.mset(offset, per_page).matches.collect { |m| XapianDoc.new(m) }
202
+ ResultSet.new(:mset => enquiry.mset(offset, per_page), :current_page => page + 1,
203
+ :per_page => per_page, :corrected_query => qp.corrected_query)
92
204
  end
93
205
 
94
- # Run the given block in a XapianDB transaction. Any changes to the
206
+ # Run the given block in a XapianDB transaction. Any changes to the
95
207
  # Xapian database made in the block will be atomically committed at the end.
96
- #
208
+ #
97
209
  # If an exception is raised by the block, all changes are discarded and the
98
210
  # exception re-raised.
99
- #
211
+ #
100
212
  # Xapian does not support multiple concurrent transactions on the
101
213
  # same Xapian database. Any attempts at this will be serialized by
102
214
  # XapianFu, which is not perfect but probably better than just
103
215
  # kicking up an exception.
104
216
  #
105
- def transaction
217
+ def transaction(flush_on_commit = true)
106
218
  @tx_mutex.synchronize do
107
- rw.begin_transaction
108
- yield
219
+ begin
220
+ rw.begin_transaction(flush_on_commit)
221
+ yield
222
+ rescue Exception => e
223
+ rw.cancel_transaction
224
+ ro.reopen
225
+ raise e
226
+ end
109
227
  rw.commit_transaction
228
+ ro.reopen
110
229
  end
111
- rescue Exception => e
112
- rw.cancel_transaction
113
- raise e
114
230
  end
115
231
 
116
232
  # Flush any changes to disk and reopen the read-only database.
@@ -121,29 +237,21 @@ module XapianFu
121
237
  ro.reopen
122
238
  end
123
239
 
124
- def query_parser
125
- unless @query_parser
126
- @query_parser = Xapian::QueryParser.new
127
- @query_parser.database = ro
128
- end
129
- @query_parser
130
- end
131
-
132
- def enquiry
133
- @enquiry ||= Xapian::Enquire.new(ro)
134
- end
135
-
136
240
  private
137
241
 
242
+ # Setup the writable database
138
243
  def setup_rw_db
139
244
  if dir
140
245
  @rw = Xapian::WritableDatabase.new(dir, db_flag)
246
+ @rw.flush if @options[:create]
247
+ @rw
141
248
  else
142
249
  # In memory database
143
250
  @rw = Xapian::inmemory_open
144
251
  end
145
252
  end
146
253
 
254
+ # Setup the read-only database
147
255
  def setup_ro_db
148
256
  if dir
149
257
  @ro = Xapian::Database.new(dir)
@@ -153,41 +261,49 @@ module XapianFu
153
261
  end
154
262
  end
155
263
 
156
- #
157
- class XapianDocumentsAccessor
158
- def initialize(xdb)
159
- @xdb = xdb
160
- end
161
-
162
- # Return the document with the given id from the
163
- # database. Raises a XapianFu::DocNotFoundError exception
164
- # if it doesn't exist.
165
- def find(doc_id)
166
- xdoc = @xdb.ro.document(doc_id)
167
- XapianDoc.new(xdoc)
168
- rescue RuntimeError => e
169
- raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
264
+ # Set up ordering for the given Xapian::Enquire object
265
+ def setup_ordering(enquiry, order = nil, reverse = true)
266
+ if order.to_s == "id"
267
+ # Sorting by a value that doesn't exist falls back to docid ordering
268
+ enquiry.sort_by_value!((1 << 32)-1, reverse)
269
+ enquiry.docid_order = reverse ? Xapian::Enquire::DESCENDING : Xapian::Enquire::ASCENDING
270
+ elsif order.is_a? String or order.is_a? Symbol
271
+ enquiry.sort_by_value!(order.to_s.hash, reverse)
272
+ else
273
+ enquiry.sort_by_relevance!
170
274
  end
275
+ enquiry
276
+ end
171
277
 
172
- # Return the document with the given id from the database or nil
173
- # if it doesn't exist
174
- def [](doc_id)
175
- find(doc_id)
176
- rescue XapianFu::DocNotFound
177
- nil
278
+ # Set up the fields hash and store_values list from the given options
279
+ def setup_fields(field_options)
280
+ @fields = { }
281
+ @unindexed_fields = []
282
+ @store_values = []
283
+ return nil if field_options.nil?
284
+ default_opts = {
285
+ :store => true,
286
+ :index => true,
287
+ :type => String
288
+ }
289
+ # Convert array argument to hash, with String as default type
290
+ if field_options.is_a? Array
291
+ fohash = { }
292
+ field_options.each { |f| fohash[f] = { :type => String } }
293
+ field_options = fohash
178
294
  end
179
-
180
- # Delete the given document from the database and return the
181
- # document id, or nil if it doesn't exist
182
- def delete(doc)
183
- if doc.respond_to?(:to_i)
184
- @xdb.rw.delete_document(doc.to_i)
185
- doc.to_i
186
- end
187
- rescue RuntimeError => e
188
- raise e unless e.to_s =~ /^DocNotFoundError/
295
+ field_options.each do |name,opts|
296
+ # Handle simple setup by type only
297
+ opts = { :type => opts } unless opts.is_a? Hash
298
+ opts = default_opts.merge(opts)
299
+ @store_values << name if opts[:store]
300
+ @unindexed_fields << name if opts[:index] == false
301
+ @fields[name] = opts[:type]
189
302
  end
303
+ @fields
190
304
  end
305
+
191
306
  end
192
-
307
+
193
308
  end
309
+