xapian-fu 0.2 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,72 @@
1
- module XapianFu
1
+ class Time #:nodoc:
2
+ def to_xapian_fu_string
3
+ utc.strftime("%Y%m%d%H%M%S")
4
+ end
5
+ end
6
+
7
+ class Date #:nodoc:
8
+ def to_xapian_fu_string
9
+ strftime("%Y%m%d")
10
+ end
11
+ end
12
+
13
+ require 'date'
14
+
15
+ class DateTime #:nodoc:
16
+ def to_xapian_fu_string
17
+ strftime("%Y%m%d%H%M%S")
18
+ end
19
+ end
20
+
21
+ module XapianFu #:nodoc:
22
+ require 'xapian_doc_value_accessor'
2
23
 
24
+ # Raised whenever a XapianDb is needed but has not been provided,
25
+ # such as when retrieving the terms list for a document
3
26
  class XapianDbNotSet < XapianFuError ; end
4
- class XapianDocNotSet < XapianFuError ; end
27
+ # Raised if a given value cannot be stored in the database (anything
28
+ # without a to_s method)
5
29
  class XapianTypeError < XapianFuError ; end
6
-
30
+
31
+ # A XapianDoc represents a document in a XapianDb. Searches return
32
+ # XapianDoc objects and they are used internally when adding new
33
+ # documents to the database. You usually don't need to instantiate
34
+ # them yourself unless you're doing something a bit advanced.
7
35
  class XapianDoc
8
- attr_reader :fields, :data, :weight, :match
9
- attr_reader :xapian_document
10
- attr_accessor :id, :db
36
+
37
+ # A hash of the fields given to this object on initialize
38
+ attr_reader :fields
39
+
40
+ # An arbitrary blob of data stored alongside the document in the
41
+ # Xapian database.
42
+ attr_reader :data
43
+
44
+ # The search score of this document when returned as part of a
45
+ # search result
46
+ attr_reader :weight
47
+
48
+ # The Xapian::Match object for this document when returned as part
49
+ # of a search result.
50
+ attr_reader :match
51
+
52
+ # The unsigned integer "primary key" for this document in the
53
+ # Xapian database.
54
+ attr_accessor :id
55
+
56
+ # The XapianDb object that this document was retrieved from, or
57
+ # should be stored in.
58
+ attr_accessor :db
11
59
 
12
60
  # Expects a Xapian::Document, a Hash-like object, or anything
13
61
  # with a to_s method. Anything else raises a XapianTypeError.
14
- # Options can be <tt>:weight</tt> to set the search weight or
15
- # <tt>:data</tt> to set some additional data to be stored with the
16
- # record in the database.
62
+ # The <tt>:weight</tt> option sets the search weight when setting
63
+ # up search results. The <tt>:data</tt> option sets some
64
+ # additional data to be stored with the document in the database.
65
+ # The <tt>:xapian_db</tt> option sets the XapianDb to allow saves
66
+ # and term enumeration.
17
67
  def initialize(doc, options = {})
68
+ @options = options
69
+
18
70
  @fields = {}
19
71
  if doc.is_a? Xapian::Match
20
72
  match = doc
@@ -28,19 +80,8 @@ module XapianFu
28
80
  if doc.is_a?(Xapian::Document)
29
81
  @xapian_document = doc
30
82
  @id = doc.docid
31
- begin
32
- xdoc_data = Marshal::load(doc.data) unless doc.data.empty?
33
- rescue ArgumentError
34
- @data = nil
35
- end
36
- if xdoc_data.is_a? Hash
37
- @data = xdoc_data.delete(:__data)
38
- @fields = xdoc_data
39
- else
40
- @data = xdoc_data
41
- end
42
83
  # Handle initialisation from a hash-like object
43
- elsif doc.respond_to?("[]") and doc.respond_to?(:has_key?)
84
+ elsif doc.respond_to?(:has_key?) and doc.respond_to?("[]")
44
85
  @fields = doc
45
86
  @id = doc[:id] if doc.has_key?(:id)
46
87
  # Handle initialisation from anything else that can be coerced
@@ -52,18 +93,21 @@ module XapianFu
52
93
  end
53
94
  @weight = options[:weight] if options[:weight]
54
95
  @data = options[:data] if options[:data]
96
+ @db = options[:xapian_db] if options[:xapian_db]
55
97
  end
56
98
 
57
- # Retrieve the given Xapianvalue from the XapianDb. <tt>vkey</tt>
58
- # can be a symbol or string, in which case it's hashed to get an
59
- # integer value number. Or you can give the integer value number
60
- # if you know it.
61
- def get_value(vkey)
62
- raise XapianDocNotSet unless @xapian_document
63
- vkey = vkey.to_s.hash unless vkey.is_a? Integer
64
- @xapian_document.value(vkey)
99
+ # The arbitrary data stored in the Xapian database with this
100
+ # document. Returns an empty string if none available.
101
+ def data
102
+ @data ||= xapian_document.data
65
103
  end
66
104
 
105
+ # The XapianFu::XapianDocValueAccessor for accessing the values in
106
+ # this document.
107
+ def values
108
+ @value_accessor ||= XapianDocValueAccessor.new(self)
109
+ end
110
+
67
111
  # Return a list of terms that the db has for this document.
68
112
  def terms
69
113
  raise XapianFu::XapianDbNotSet unless db
@@ -74,17 +118,25 @@ module XapianFu
74
118
  # database. Requires that the db attribute has been set up.
75
119
  def to_xapian_document
76
120
  raise XapianFu::XapianDbNotSet unless db
77
- xdoc = Xapian::Document.new
78
- add_stored_fields_to_xapian_doc(xdoc)
79
- add_stored_values_to_xapian_doc(xdoc)
80
- xdoc
121
+ xapian_document.data = data
122
+ # Clear and add values
123
+ xapian_document.clear_values
124
+ add_values_to_xapian_document
125
+ # Clear and add terms
126
+ xapian_document.clear_terms
127
+ generate_terms
128
+ xapian_document
81
129
  end
82
130
 
83
- # Return text for indexing from the fields
84
- def text
85
- fields.keys.collect { |key| fields[key].to_s }.join(' ')
131
+ # The Xapian::Document for this XapianFu::XapianDoc. If this
132
+ # document was retrieved from a XapianDb then this will have been
133
+ # initialized by Xapian, otherwise a new Xapian::Document is
134
+ # allocated.
135
+ def xapian_document
136
+ @xapian_document ||= Xapian::Document.new
86
137
  end
87
138
 
139
+ # Compare IDs with another XapianDoc
88
140
  def ==(b)
89
141
  if b.is_a?(XapianDoc)
90
142
  id == b.id
@@ -94,24 +146,154 @@ module XapianFu
94
146
  end
95
147
 
96
148
  def inspect
97
- "<#{self.class.to_s} id=#{id}>"
149
+ s = ["<#{self.class.to_s} id=#{id}"]
150
+ s << "weight=%.5f" % weight if weight
151
+ s << "db=#{db.nil? ? 'nil' : db}"
152
+ s.join(' ') + ">"
153
+ end
154
+
155
+ # Add this document to the Xapian Database, or replace it if it
156
+ # already has an id.
157
+ def save
158
+ id ? update : create
159
+ end
160
+
161
+ # Add this document to the Xapian Database
162
+ def create
163
+ self.id = db.rw.add_document(to_xapian_document)
164
+ end
165
+
166
+ # Update this document in the Xapian Database
167
+ def update
168
+ db.rw.replace_document(id, to_xapian_document)
169
+ end
170
+
171
+ # Set the stemmer to use for this document. Accepts any string
172
+ # that the Xapian::Stem class accepts (Either the English name for
173
+ # the language or the two letter ISO639 code). Can also be an
174
+ # existing Xapian::Stem object.
175
+ def stemmer=(s)
176
+ @stemmer = StemFactory.stemmer_for(s)
177
+ end
178
+
179
+ # Return the stemmer for this document. If not set on initialize
180
+ # by the :stemmer or :language option, it will try the database's
181
+ # stemmer and otherwise defaults to an English stemmer.
182
+ def stemmer
183
+ if @stemmer
184
+ @stemmer
185
+ else
186
+ @stemmer =
187
+ if ! @options[:stemmer].nil?
188
+ @options[:stemmer]
189
+ elsif @options[:language]
190
+ @options[:language]
191
+ elsif db
192
+ db.stemmer
193
+ else
194
+ :english
195
+ end
196
+ @stemmer = StemFactory.stemmer_for(@stemmer)
197
+ end
198
+ end
199
+
200
+ # Return the stopper for this document. If not set on initialize
201
+ # by the :stopper or :language option, it will try the database's
202
+ # stopper and otherwise defaults to an English stopper.
203
+ def stopper
204
+ if @stopper
205
+ @stopper
206
+ else
207
+ @stopper =
208
+ if ! @options[:stopper].nil?
209
+ @options[:stopper]
210
+ elsif @options[:language]
211
+ @options[:language]
212
+ elsif db
213
+ db.stopper
214
+ else
215
+ :english
216
+ end
217
+ @stopper = StopperFactory.stopper_for(@stopper)
218
+ end
219
+ end
220
+
221
+ # Return this document's language which is set on initialize, inherited
222
+ # from the database or defaults to :english
223
+ def language
224
+ if @language
225
+ @language
226
+ else
227
+ @language =
228
+ if ! @options[:language].nil?
229
+ @options[:language]
230
+ elsif db and db.language
231
+ db.language
232
+ else
233
+ :english
234
+ end
235
+ end
98
236
  end
99
237
 
100
238
  private
101
239
 
102
- def add_stored_fields_to_xapian_doc(xdoc)
103
- stored_fields = fields.reject { |k,v| ! db.store_fields.include? k }
104
- stored_fields[:__data] = data if data
105
- xdoc.data = Marshal.dump(stored_fields) unless stored_fields.empty?
106
- xdoc
240
+ # Array of field names not to run through the TermGenerator
241
+ def unindexed_fields
242
+ db ? db.unindexed_fields : []
243
+ end
244
+
245
+ # Add all the fields to be stored as XapianDb values
246
+ def add_values_to_xapian_document
247
+ db.store_values.collect do |key|
248
+ values[key] = fields[key]
249
+ key
250
+ end
251
+ end
252
+
253
+ # Run the Xapian term generator against this document's text
254
+ def generate_terms
255
+ tg = Xapian::TermGenerator.new
256
+ tg.database = db.rw
257
+ tg.document = xapian_document
258
+ tg.stopper = stopper
259
+ tg.stemmer = stemmer
260
+ index_method = db.index_positions ? :index_text : :index_text_without_positions
261
+ fields.each do |k,v|
262
+ next if unindexed_fields.include?(k)
263
+ if v.respond_to?(:to_xapian_fu_string)
264
+ v = v.to_xapian_fu_string
265
+ else
266
+ v = v.to_s
267
+ end
268
+ # add value with field name
269
+ tg.send(index_method, v, 1, 'X' + k.to_s.upcase)
270
+ # add value without field name
271
+ tg.send(index_method, v)
272
+ end
273
+ xapian_document
107
274
  end
108
275
 
109
- def add_stored_values_to_xapian_doc(xdoc)
110
- stored_values = fields.reject { |k,v| ! db.store_values.include? k }
111
- stored_values.each do |k,v|
112
- xdoc.add_value(k.to_s.hash, v.to_s)
276
+ end
277
+
278
+
279
+ class StemFactory
280
+ # Return a Xapian::Stem object for the given option. Accepts any
281
+ # string that the Xapian::Stem class accepts (Either the English
282
+ # name for the language or the two letter ISO639 code).
283
+ #
284
+ # If given false or nil, will return a "none" stemmer.
285
+ #
286
+ # It will also accept and return an existing Xapian::Stem object.
287
+ #
288
+ def self.stemmer_for(stemmer)
289
+ if stemmer.is_a? Xapian::Stem
290
+ stemmer
291
+ elsif stemmer.is_a?(String) or stemmer.is_a?(Symbol)
292
+ Xapian::Stem.new(stemmer.to_s)
293
+ else
294
+ Xapian::Stem.new("none")
113
295
  end
114
- xdoc
115
296
  end
116
297
  end
298
+
117
299
  end
@@ -0,0 +1,125 @@
1
+ class Integer #:nodoc:
2
+ def to_xapian_fu_storage_value
3
+ [self].pack("l")
4
+ end
5
+
6
+ def self.from_xapian_fu_storage_value(value)
7
+ value.unpack("l").first
8
+ end
9
+ end
10
+
11
+ class Bignum #:nodoc:
12
+ def to_xapian_fu_storage_value
13
+ [self].pack("G")
14
+ end
15
+
16
+ def self.from_xapian_fu_storage_value(value)
17
+ value.unpack("G").first
18
+ end
19
+ end
20
+
21
+ class Float #:nodoc:
22
+ def to_xapian_fu_storage_value
23
+ [self].pack("G")
24
+ end
25
+
26
+ def self.from_xapian_fu_storage_value(value)
27
+ value.unpack("G").first
28
+ end
29
+ end
30
+
31
+ class Time #:nodoc:
32
+ def to_xapian_fu_storage_value
33
+ [self.utc.to_f].pack("G")
34
+ end
35
+
36
+ def self.from_xapian_fu_storage_value(value)
37
+ Time.at(value.unpack("G").first)
38
+ end
39
+ end
40
+
41
+ class Date #:nodoc:
42
+ def to_xapian_fu_storage_value
43
+ to_s
44
+ end
45
+
46
+ def self.from_xapian_fu_storage_value(value)
47
+ self.parse(value)
48
+ end
49
+ end
50
+
51
+ module XapianFu #:nodoc:
52
+
53
+ # A XapianDocValueAccessor is used to provide the XapianDoc#values
54
+ # interface to read and write field values to a XapianDb. It is
55
+ # usually set up by a XapianDoc so you shouldn't need to set up your
56
+ # own.
57
+ class XapianDocValueAccessor
58
+ def initialize(xapian_doc)
59
+ @doc = xapian_doc
60
+ end
61
+
62
+ # Add the given <tt>value</tt> with the given <tt>key</tt> to the
63
+ # XapianDoc. If the value has a
64
+ # <tt>to_xapian_fu_storage_value</tt> method then it is used to
65
+ # generate the final value to be stored, otherwise <tt>to_s</tt>
66
+ # is used. This is usually paired with a
67
+ # <tt>from_xapian_fu_storage_value</tt> class method on retrieval.
68
+ def store(key, value, type = nil)
69
+ type = @doc.db.fields[key] if type.nil? and @doc.db
70
+ if type and value.is_a?(type) and value.respond_to?(:to_xapian_fu_storage_value)
71
+ converted_value = value.to_xapian_fu_storage_value
72
+ else
73
+ converted_value = value.to_s
74
+ end
75
+ @doc.xapian_document.add_value(value_key(key), converted_value)
76
+ value
77
+ end
78
+ alias_method "[]=", :store
79
+
80
+ # Retrieve the value with the given <tt>key</tt> from the
81
+ # XapianDoc. <tt>key</tt> can be a symbol or string, in which case
82
+ # it's hashed to get an integer value number. Or you can give the
83
+ # integer value number if you know it.
84
+ #
85
+ # If the class specified in the database fields for this key (or
86
+ # as the optional argument) has a
87
+ # <tt>from_xapian_fu_storage_value</tt> method then it is used to
88
+ # instantiate the object from the stored value. This is usually
89
+ # paired with a <tt>to_xapian_fu_storage_value</tt> instance
90
+ # method.
91
+ #
92
+ # Due to the design of Xapian, if the value does not exist then an
93
+ # empty string is returned.
94
+ def fetch(key, type = nil)
95
+ value = @doc.xapian_document.value(value_key(key))
96
+ type = @doc.db.fields[key] if type.nil? and @doc.db
97
+ if type and type.respond_to?(:from_xapian_fu_storage_value)
98
+ type.from_xapian_fu_storage_value(value)
99
+ else
100
+ value
101
+ end
102
+ end
103
+ alias_method "[]", :fetch
104
+
105
+ # Count the values stored in the XapianDoc
106
+ def size
107
+ @doc.xapian_document.values_count
108
+ end
109
+
110
+ # Remove the value with the given key from the XapianDoc and return it
111
+ def delete(key)
112
+ value = fetch(key)
113
+ @doc.xapian_document.remove_value(value_key(key))
114
+ value
115
+ end
116
+
117
+ private
118
+
119
+ # Convert the given key to an integer that can be used as a Xapian
120
+ # value number
121
+ def value_key(key)
122
+ key.is_a?(Integer) ? key : key.to_s.hash
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,82 @@
1
+ module XapianFu
2
+ # A XapianDocumentsAccessor is used to provide the
3
+ # XapianDb#documents interface. It is usually set up by a XapianDb
4
+ # so you shouldn't need to set up your own.
5
+ class XapianDocumentsAccessor
6
+ def initialize(xdb) #:nodoc:
7
+ @xdb = xdb
8
+ end
9
+
10
+ # Build a new XapianDoc for this database
11
+ def new(doc = nil, options = { })
12
+ options = options.merge({ :xapian_db => @xdb })
13
+ XapianDoc.new(doc, options)
14
+ end
15
+
16
+ # Add a document to the index. A document can be just a hash, the
17
+ # keys representing field names and their values the data to be
18
+ # indexed. Or it can be a XapianDoc, or any object with a to_s method.
19
+ #
20
+ # If the document has an :id field, it is used as the primary key
21
+ # in the Xapian database.
22
+ #
23
+ # If the document object responds to the method :data, whatever it
24
+ # returns is marshalled and stored in the Xapian database. Any
25
+ # arbitrary data up to Xmeg can be stored here.
26
+ #
27
+ # Currently, all fields are stored in the database. This will
28
+ # change to store only those fields requested to be stored.
29
+ def add(doc)
30
+ doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
31
+ doc.db = @xdb
32
+ doc.save
33
+ doc
34
+ end
35
+ alias_method "<<", :add
36
+
37
+ # Return the document with the given id from the
38
+ # database. Raises a XapianFu::DocNotFound exception
39
+ # if it doesn't exist.
40
+ def find(doc_id)
41
+ xdoc = @xdb.ro.document(doc_id)
42
+ XapianDoc.new(xdoc, :xapian_db => @xdb)
43
+ rescue RuntimeError => e
44
+ raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
45
+ end
46
+
47
+ # Return the document with the given id from the database or nil
48
+ # if it doesn't exist
49
+ def [](doc_id)
50
+ find(doc_id)
51
+ rescue XapianFu::DocNotFound
52
+ nil
53
+ end
54
+
55
+ # Delete the given document from the database and return the
56
+ # document id, or nil if it doesn't exist
57
+ def delete(doc)
58
+ if doc.respond_to?(:to_i)
59
+ @xdb.rw.delete_document(doc.to_i)
60
+ doc.to_i
61
+ end
62
+ rescue RuntimeError => e
63
+ raise e unless e.to_s =~ /^DocNotFoundError/
64
+ end
65
+
66
+ # Return the document with the highest value in the specified field or nil if it doesn't exist
67
+ def max(key = :id)
68
+ if key == :id
69
+ # for :id we can use lastdocid
70
+ find(@xdb.ro.lastdocid) rescue nil
71
+ else
72
+ # for other values, we do a search ordered by that key in descending order
73
+ query = Xapian::Query.new(Xapian::Query::OP_VALUE_GE, key.to_s.hash, "0")
74
+ e = Xapian::Enquire.new(@xdb.ro)
75
+ e.query = query
76
+ e.sort_by_value!(key.to_s.hash)
77
+ r = e.mset(0, 1).matches.first
78
+ find(r.docid) rescue nil
79
+ end
80
+ end
81
+ end
82
+ end
data/lib/xapian_fu.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  $:.unshift File.join(File.dirname(__FILE__), 'xapian_fu')
2
2
  require 'xapian_db'
3
3
  require 'xapian_doc'
4
+ require 'stopper_factory'
@@ -0,0 +1,43 @@
1
+ require 'xapian'
2
+ require 'lib/xapian_fu.rb'
3
+ include XapianFu
4
+
5
+ describe QueryParser do
6
+ describe "parse_query" do
7
+ it "should use the database's stopper" do
8
+ xdb = XapianDb.new(:stopper => :french)
9
+ qp = QueryParser.new(:database => xdb)
10
+ terms = qp.parse_query("avec and").terms.collect { |t| t.term }
11
+ terms.should_not include "Zavec"
12
+ terms.should include "Zand"
13
+ end
14
+
15
+ it "should use the database's stemmer" do
16
+ xdb = XapianDb.new(:stemmer => :french)
17
+ qp = QueryParser.new(:database => xdb)
18
+ terms = qp.parse_query("contournait fishing").terms.collect { |t| t.term }
19
+ terms.should include "Zcontourn"
20
+ terms.should_not include "Zfish"
21
+ end
22
+
23
+ it "should use the :fields option to set field names" do
24
+ qp = QueryParser.new(:fields => [:name, :age])
25
+ terms = qp.parse_query("name:john age:30").terms.collect { |t| t.term }
26
+ terms.should include "XNAMEjohn"
27
+ terms.should_not include "john"
28
+ terms.should include "XAGE30"
29
+ terms.should_not include "30"
30
+ end
31
+
32
+ it "should use the database's field names as prefixes" do
33
+ xdb = XapianDb.new(:fields => [:name], :stemmer => :none)
34
+ qp = QueryParser.new(:database => xdb)
35
+ terms = qp.parse_query("name:john").terms.collect { |t| t.term }
36
+ terms.should include "XNAMEjohn"
37
+ terms.should_not include "john"
38
+ end
39
+
40
+ end
41
+
42
+ end
43
+
@@ -0,0 +1,57 @@
1
+ require 'xapian'
2
+ require 'lib/xapian_fu.rb'
3
+ include XapianFu
4
+ require 'fileutils'
5
+
6
+ describe StopperFactory do
7
+ describe "stopper_for" do
8
+ it "should return a SimpleStopper loaded with the given languages stop words" do
9
+ stopper = StopperFactory.stopper_for(:english)
10
+ stopper.should be_a_kind_of Xapian::SimpleStopper
11
+ stopper.call("and").should be_true
12
+ stopper.call("theremin").should_not be_true
13
+ end
14
+
15
+ it "should return the given stopper unmodified if given a Xapian::Stopper object" do
16
+ stopper = Xapian::SimpleStopper.new
17
+ StopperFactory.stopper_for(stopper).should === stopper
18
+ end
19
+ end
20
+
21
+ describe "stop_words_for" do
22
+
23
+ it "should return an array of words for the given language" do
24
+ words = StopperFactory.stop_words_for(:english)
25
+ words.should be_a_kind_of Array
26
+ words.should_not be_empty
27
+ words.should include 'and'
28
+ words.should include "they're"
29
+ end
30
+
31
+ %w(danish dutch english finnish french german hungarian italian norwegian portuguese russian spanish swedish).each do |lang|
32
+ describe lang do
33
+ it "should return an array of words" do
34
+ words = StopperFactory.stop_words_for(lang.to_sym)
35
+ words.should_not be_empty
36
+ end
37
+
38
+ it "should return an array with no empty strings, nils or pipes" do
39
+ StopperFactory.stop_words_for(lang.to_sym).should_not include ''
40
+ StopperFactory.stop_words_for(lang.to_sym).should_not include nil
41
+ StopperFactory.stop_words_for(lang.to_sym).should_not include '|'
42
+ end
43
+ end
44
+ end
45
+
46
+
47
+ it "should raise a UnsupportedStopperLanguage error if there is no data for the given language" do
48
+ Proc.new { StopperFactory.stop_words_for(:no_existy) }.should raise_error UnsupportedStopperLanguage
49
+ end
50
+
51
+ it "should return characters in utf8" do
52
+ words = StopperFactory.stop_words_for(:russian)
53
+ words.should include "человек"
54
+ end
55
+
56
+ end
57
+ end