xapian-fu 0.2 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,20 +1,72 @@
1
- module XapianFu
1
+ class Time #:nodoc:
2
+ def to_xapian_fu_string
3
+ utc.strftime("%Y%m%d%H%M%S")
4
+ end
5
+ end
6
+
7
+ class Date #:nodoc:
8
+ def to_xapian_fu_string
9
+ strftime("%Y%m%d")
10
+ end
11
+ end
12
+
13
+ require 'date'
14
+
15
+ class DateTime #:nodoc:
16
+ def to_xapian_fu_string
17
+ strftime("%Y%m%d%H%M%S")
18
+ end
19
+ end
20
+
21
+ module XapianFu #:nodoc:
22
+ require 'xapian_doc_value_accessor'
2
23
 
24
+ # Raised whenever a XapianDb is needed but has not been provided,
25
+ # such as when retrieving the terms list for a document
3
26
  class XapianDbNotSet < XapianFuError ; end
4
- class XapianDocNotSet < XapianFuError ; end
27
+ # Raised if a given value cannot be stored in the database (anything
28
+ # without a to_s method)
5
29
  class XapianTypeError < XapianFuError ; end
6
-
30
+
31
+ # A XapianDoc represents a document in a XapianDb. Searches return
32
+ # XapianDoc objects and they are used internally when adding new
33
+ # documents to the database. You usually don't need to instantiate
34
+ # them yourself unless you're doing something a bit advanced.
7
35
  class XapianDoc
8
- attr_reader :fields, :data, :weight, :match
9
- attr_reader :xapian_document
10
- attr_accessor :id, :db
36
+
37
+ # A hash of the fields given to this object on initialize
38
+ attr_reader :fields
39
+
40
+ # An arbitrary blob of data stored alongside the document in the
41
+ # Xapian database.
42
+ attr_reader :data
43
+
44
+ # The search score of this document when returned as part of a
45
+ # search result
46
+ attr_reader :weight
47
+
48
+ # The Xapian::Match object for this document when returned as part
49
+ # of a search result.
50
+ attr_reader :match
51
+
52
+ # The unsigned integer "primary key" for this document in the
53
+ # Xapian database.
54
+ attr_accessor :id
55
+
56
+ # The XapianDb object that this document was retrieved from, or
57
+ # should be stored in.
58
+ attr_accessor :db
11
59
 
12
60
  # Expects a Xapian::Document, a Hash-like object, or anything
13
61
  # with a to_s method. Anything else raises a XapianTypeError.
14
- # Options can be <tt>:weight</tt> to set the search weight or
15
- # <tt>:data</tt> to set some additional data to be stored with the
16
- # record in the database.
62
+ # The <tt>:weight</tt> option sets the search weight when setting
63
+ # up search results. The <tt>:data</tt> option sets some
64
+ # additional data to be stored with the document in the database.
65
+ # The <tt>:xapian_db</tt> option sets the XapianDb to allow saves
66
+ # and term enumeration.
17
67
  def initialize(doc, options = {})
68
+ @options = options
69
+
18
70
  @fields = {}
19
71
  if doc.is_a? Xapian::Match
20
72
  match = doc
@@ -28,19 +80,8 @@ module XapianFu
28
80
  if doc.is_a?(Xapian::Document)
29
81
  @xapian_document = doc
30
82
  @id = doc.docid
31
- begin
32
- xdoc_data = Marshal::load(doc.data) unless doc.data.empty?
33
- rescue ArgumentError
34
- @data = nil
35
- end
36
- if xdoc_data.is_a? Hash
37
- @data = xdoc_data.delete(:__data)
38
- @fields = xdoc_data
39
- else
40
- @data = xdoc_data
41
- end
42
83
  # Handle initialisation from a hash-like object
43
- elsif doc.respond_to?("[]") and doc.respond_to?(:has_key?)
84
+ elsif doc.respond_to?(:has_key?) and doc.respond_to?("[]")
44
85
  @fields = doc
45
86
  @id = doc[:id] if doc.has_key?(:id)
46
87
  # Handle initialisation from anything else that can be coerced
@@ -52,18 +93,21 @@ module XapianFu
52
93
  end
53
94
  @weight = options[:weight] if options[:weight]
54
95
  @data = options[:data] if options[:data]
96
+ @db = options[:xapian_db] if options[:xapian_db]
55
97
  end
56
98
 
57
- # Retrieve the given Xapianvalue from the XapianDb. <tt>vkey</tt>
58
- # can be a symbol or string, in which case it's hashed to get an
59
- # integer value number. Or you can give the integer value number
60
- # if you know it.
61
- def get_value(vkey)
62
- raise XapianDocNotSet unless @xapian_document
63
- vkey = vkey.to_s.hash unless vkey.is_a? Integer
64
- @xapian_document.value(vkey)
99
+ # The arbitrary data stored in the Xapian database with this
100
+ # document. Returns an empty string if none available.
101
+ def data
102
+ @data ||= xapian_document.data
65
103
  end
66
104
 
105
+ # The XapianFu::XapianDocValueAccessor for accessing the values in
106
+ # this document.
107
+ def values
108
+ @value_accessor ||= XapianDocValueAccessor.new(self)
109
+ end
110
+
67
111
  # Return a list of terms that the db has for this document.
68
112
  def terms
69
113
  raise XapianFu::XapianDbNotSet unless db
@@ -74,17 +118,25 @@ module XapianFu
74
118
  # database. Requires that the db attribute has been set up.
75
119
  def to_xapian_document
76
120
  raise XapianFu::XapianDbNotSet unless db
77
- xdoc = Xapian::Document.new
78
- add_stored_fields_to_xapian_doc(xdoc)
79
- add_stored_values_to_xapian_doc(xdoc)
80
- xdoc
121
+ xapian_document.data = data
122
+ # Clear and add values
123
+ xapian_document.clear_values
124
+ add_values_to_xapian_document
125
+ # Clear and add terms
126
+ xapian_document.clear_terms
127
+ generate_terms
128
+ xapian_document
81
129
  end
82
130
 
83
- # Return text for indexing from the fields
84
- def text
85
- fields.keys.collect { |key| fields[key].to_s }.join(' ')
131
+ # The Xapian::Document for this XapianFu::XapianDoc. If this
132
+ # document was retrieved from a XapianDb then this will have been
133
+ # initialized by Xapian, otherwise a new Xapian::Document.new is
134
+ # allocated.
135
+ def xapian_document
136
+ @xapian_document ||= Xapian::Document.new
86
137
  end
87
138
 
139
+ # Compare IDs with another XapianDoc
88
140
  def ==(b)
89
141
  if b.is_a?(XapianDoc)
90
142
  id == b.id
@@ -94,24 +146,154 @@ module XapianFu
94
146
  end
95
147
 
96
148
  def inspect
97
- "<#{self.class.to_s} id=#{id}>"
149
+ s = ["<#{self.class.to_s} id=#{id}"]
150
+ s << "weight=%.5f" % weight if weight
151
+ s << "db=#{db.nil? ? 'nil' : db}"
152
+ s.join(' ') + ">"
153
+ end
154
+
155
+ # Add this document to the Xapian Database, or replace it if it
156
+ # already has an id.
157
+ def save
158
+ id ? update : create
159
+ end
160
+
161
+ # Add this document to the Xapian Database
162
+ def create
163
+ self.id = db.rw.add_document(to_xapian_document)
164
+ end
165
+
166
+ # Update this document in the Xapian Database
167
+ def update
168
+ db.rw.replace_document(id, to_xapian_document)
169
+ end
170
+
171
+ # Set the stemmer to use for this document. Accepts any string
172
+ # that the Xapian::Stem class accepts (Either the English name for
173
+ # the language or the two letter ISO639 code). Can also be an
174
+ # existing Xapian::Stem object.
175
+ def stemmer=(s)
176
+ @stemmer = StemFactory.stemmer_for(s)
177
+ end
178
+
179
+ # Return the stemmer for this document. If not set on initialize
180
+ # by the :stemmer or :language option, it will try the database's
181
+ # stemmer and otherwise defaults to an English stemmer.
182
+ def stemmer
183
+ if @stemmer
184
+ @stemmer
185
+ else
186
+ @stemmer =
187
+ if ! @options[:stemmer].nil?
188
+ @options[:stemmer]
189
+ elsif @options[:language]
190
+ @options[:language]
191
+ elsif db
192
+ db.stemmer
193
+ else
194
+ :english
195
+ end
196
+ @stemmer = StemFactory.stemmer_for(@stemmer)
197
+ end
198
+ end
199
+
200
+ # Return the stopper for this document. If not set on initialize
201
+ # by the :stopper or :language option, it will try the database's
202
+ # stopper and otherwise default to an English stopper.
203
+ def stopper
204
+ if @stopper
205
+ @stopper
206
+ else
207
+ @stopper =
208
+ if ! @options[:stopper].nil?
209
+ @options[:stopper]
210
+ elsif @options[:language]
211
+ @options[:language]
212
+ elsif db
213
+ db.stopper
214
+ else
215
+ :english
216
+ end
217
+ @stopper = StopperFactory.stopper_for(@stopper)
218
+ end
219
+ end
220
+
221
+ # Return this document's language which is set on initialize, inherited
222
+ # from the database or defaults to :english
223
+ def language
224
+ if @language
225
+ @language
226
+ else
227
+ @language =
228
+ if ! @options[:language].nil?
229
+ @options[:language]
230
+ elsif db and db.language
231
+ db.language
232
+ else
233
+ :english
234
+ end
235
+ end
98
236
  end
99
237
 
100
238
  private
101
239
 
102
- def add_stored_fields_to_xapian_doc(xdoc)
103
- stored_fields = fields.reject { |k,v| ! db.store_fields.include? k }
104
- stored_fields[:__data] = data if data
105
- xdoc.data = Marshal.dump(stored_fields) unless stored_fields.empty?
106
- xdoc
240
+ # Array of field names not to run through the TermGenerator
241
+ def unindexed_fields
242
+ db ? db.unindexed_fields : []
243
+ end
244
+
245
+ # Add all the fields to be stored as XapianDb values
246
+ def add_values_to_xapian_document
247
+ db.store_values.collect do |key|
248
+ values[key] = fields[key]
249
+ key
250
+ end
251
+ end
252
+
253
+ # Run the Xapian term generator against this document's text
254
+ def generate_terms
255
+ tg = Xapian::TermGenerator.new
256
+ tg.database = db.rw
257
+ tg.document = xapian_document
258
+ tg.stopper = stopper
259
+ tg.stemmer = stemmer
260
+ index_method = db.index_positions ? :index_text : :index_text_without_positions
261
+ fields.each do |k,v|
262
+ next if unindexed_fields.include?(k)
263
+ if v.respond_to?(:to_xapian_fu_string)
264
+ v = v.to_xapian_fu_string
265
+ else
266
+ v = v.to_s
267
+ end
268
+ # add value with field name
269
+ tg.send(index_method, v, 1, 'X' + k.to_s.upcase)
270
+ # add value without field name
271
+ tg.send(index_method, v)
272
+ end
273
+ xapian_document
107
274
  end
108
275
 
109
- def add_stored_values_to_xapian_doc(xdoc)
110
- stored_values = fields.reject { |k,v| ! db.store_values.include? k }
111
- stored_values.each do |k,v|
112
- xdoc.add_value(k.to_s.hash, v.to_s)
276
+ end
277
+
278
+
279
+ class StemFactory
280
+ # Return a Xapian::Stem object for the given option. Accepts any
281
+ # string that the Xapian::Stem class accepts (Either the English
282
+ # name for the language or the two letter ISO639 code).
283
+ #
284
+ # If given false or nil, will return a "none" stemmer.
285
+ #
286
+ # It will also accept and return an existing Xapian::Stem object.
287
+ #
288
+ def self.stemmer_for(stemmer)
289
+ if stemmer.is_a? Xapian::Stem
290
+ stemmer
291
+ elsif stemmer.is_a?(String) or stemmer.is_a?(Symbol)
292
+ Xapian::Stem.new(stemmer.to_s)
293
+ else
294
+ Xapian::Stem.new("none")
113
295
  end
114
- xdoc
115
296
  end
116
297
  end
298
+
117
299
  end
@@ -0,0 +1,125 @@
1
+ class Integer #:nodoc:
2
+ def to_xapian_fu_storage_value
3
+ [self].pack("l")
4
+ end
5
+
6
+ def self.from_xapian_fu_storage_value(value)
7
+ value.unpack("l").first
8
+ end
9
+ end
10
+
11
+ class Bignum #:nodoc:
12
+ def to_xapian_fu_storage_value
13
+ [self].pack("G")
14
+ end
15
+
16
+ def self.from_xapian_fu_storage_value(value)
17
+ value.unpack("G").first
18
+ end
19
+ end
20
+
21
+ class Float #:nodoc:
22
+ def to_xapian_fu_storage_value
23
+ [self].pack("G")
24
+ end
25
+
26
+ def self.from_xapian_fu_storage_value(value)
27
+ value.unpack("G").first
28
+ end
29
+ end
30
+
31
+ class Time #:nodoc:
32
+ def to_xapian_fu_storage_value
33
+ [self.utc.to_f].pack("G")
34
+ end
35
+
36
+ def self.from_xapian_fu_storage_value(value)
37
+ Time.at(value.unpack("G").first)
38
+ end
39
+ end
40
+
41
+ class Date #:nodoc:
42
+ def to_xapian_fu_storage_value
43
+ to_s
44
+ end
45
+
46
+ def self.from_xapian_fu_storage_value(value)
47
+ self.parse(value)
48
+ end
49
+ end
50
+
51
+ module XapianFu #:nodoc:
52
+
53
+ # A XapianDocValueAccessor is used to provide the XapianDoc#values
54
+ # interface to read and write field values to a XapianDb. It is
55
+ # usually set up by a XapianDoc so you shouldn't need to set up your
56
+ # own.
57
+ class XapianDocValueAccessor
58
+ def initialize(xapian_doc)
59
+ @doc = xapian_doc
60
+ end
61
+
62
+ # Add the given <tt>value</tt> with the given <tt>key</tt> to the
63
+ # XapianDoc. If the value has a
64
+ # <tt>to_xapian_fu_storage_value</tt> method then it is used to
65
+ # generate the final value to be stored, otherwise <tt>to_s</tt>
66
+ # is used. This is usually paired with a
67
+ # <tt>from_xapian_fu_storage_value</tt> class method on retrieval.
68
+ def store(key, value, type = nil)
69
+ type = @doc.db.fields[key] if type.nil? and @doc.db
70
+ if type and value.is_a?(type) and value.respond_to?(:to_xapian_fu_storage_value)
71
+ converted_value = value.to_xapian_fu_storage_value
72
+ else
73
+ converted_value = value.to_s
74
+ end
75
+ @doc.xapian_document.add_value(value_key(key), converted_value)
76
+ value
77
+ end
78
+ alias_method "[]=", :store
79
+
80
+ # Retrieve the value with the given <tt>key</tt> from the
81
+ # XapianDoc. <tt>key</tt> can be a symbol or string, in which case
82
+ # it's hashed to get an integer value number. Or you can give the
83
+ # integer value number if you know it.
84
+ #
85
+ # If the class specified in the database fields for this key (or
86
+ # as the optional argument) has a
87
+ # <tt>from_xapian_fu_storage_value</tt> method then it is used to
88
+ # instantiate the object from the stored value. This is usually
89
+ # paired with a <tt>to_xapian_fu_storage_value</tt> instance
90
+ # method.
91
+ #
92
+ # Due to the design of Xapian, if the value does not exist then an
93
+ # empty string is returned.
94
+ def fetch(key, type = nil)
95
+ value = @doc.xapian_document.value(value_key(key))
96
+ type = @doc.db.fields[key] if type.nil? and @doc.db
97
+ if type and type.respond_to?(:from_xapian_fu_storage_value)
98
+ type.from_xapian_fu_storage_value(value)
99
+ else
100
+ value
101
+ end
102
+ end
103
+ alias_method "[]", :fetch
104
+
105
+ # Count the values stored in the XapianDoc
106
+ def size
107
+ @doc.xapian_document.values_count
108
+ end
109
+
110
+ # Remove the value with the given key from the XapianDoc and return it
111
+ def delete(key)
112
+ value = fetch(key)
113
+ @doc.xapian_document.remove_value(value_key(key))
114
+ value
115
+ end
116
+
117
+ private
118
+
119
+ # Convert the given key to an integer that can be used as a Xapian
120
+ # value number
121
+ def value_key(key)
122
+ key.is_a?(Integer) ? key : key.to_s.hash
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,82 @@
1
+ module XapianFu
2
+ # A XapianDocumentsAccessor is used to provide the
3
+ # XapianDb#documents interface. It is usually set up by a XapianDb
4
+ # so you shouldn't need to set up your own.
5
+ class XapianDocumentsAccessor
6
+ def initialize(xdb) #:nodoc:
7
+ @xdb = xdb
8
+ end
9
+
10
+ # Build a new XapianDoc for this database
11
+ def new(doc = nil, options = { })
12
+ options = options.merge({ :xapian_db => @xdb })
13
+ XapianDoc.new(doc, options)
14
+ end
15
+
16
+ # Add a document to the index. A document can be just a hash, the
17
+ # keys representing field names and their values the data to be
18
+ # indexed. Or it can be a XapianDoc, or any object with a to_s method.
19
+ #
20
+ # If the document has an :id field, it is used as the primary key
21
+ # in the Xapian database.
22
+ #
23
+ # If the document object responds to the method :data, whatever it
24
+ # returns is marshalled and stored in the Xapian database. Any
25
+ # arbitrary data up to Xmeg can be stored here.
26
+ #
27
+ # Currently, all fields are stored in the database. This will
28
+ # change to store only those fields requested to be stored.
29
+ def add(doc)
30
+ doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
31
+ doc.db = @xdb
32
+ doc.save
33
+ doc
34
+ end
35
+ alias_method "<<", :add
36
+
37
+ # Return the document with the given id from the
38
+ # database. Raises a XapianFu::DocNotFound exception
39
+ # if it doesn't exist.
40
+ def find(doc_id)
41
+ xdoc = @xdb.ro.document(doc_id)
42
+ XapianDoc.new(xdoc, :xapian_db => @xdb)
43
+ rescue RuntimeError => e
44
+ raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
45
+ end
46
+
47
+ # Return the document with the given id from the database or nil
48
+ # if it doesn't exist
49
+ def [](doc_id)
50
+ find(doc_id)
51
+ rescue XapianFu::DocNotFound
52
+ nil
53
+ end
54
+
55
+ # Delete the given document from the database and return the
56
+ # document id, or nil if it doesn't exist
57
+ def delete(doc)
58
+ if doc.respond_to?(:to_i)
59
+ @xdb.rw.delete_document(doc.to_i)
60
+ doc.to_i
61
+ end
62
+ rescue RuntimeError => e
63
+ raise e unless e.to_s =~ /^DocNotFoundError/
64
+ end
65
+
66
+ # Return the document with the highest value in the specified field or nil if it doesn't exist
67
+ def max(key = :id)
68
+ if key == :id
69
+ # for :id we can use lastdocid
70
+ find(@xdb.ro.lastdocid) rescue nil
71
+ else
72
+ # for other values, we do a search ordered by that key in descending order
73
+ query = Xapian::Query.new(Xapian::Query::OP_VALUE_GE, key.to_s.hash, "0")
74
+ e = Xapian::Enquire.new(@xdb.ro)
75
+ e.query = query
76
+ e.sort_by_value!(key.to_s.hash)
77
+ r = e.mset(0, 1).matches.first
78
+ find(r.docid) rescue nil
79
+ end
80
+ end
81
+ end
82
+ end
data/lib/xapian_fu.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  $:.unshift File.join(File.dirname(__FILE__), 'xapian_fu')
2
2
  require 'xapian_db'
3
3
  require 'xapian_doc'
4
+ require 'stopper_factory'
@@ -0,0 +1,43 @@
1
+ require 'xapian'
2
+ require 'lib/xapian_fu.rb'
3
+ include XapianFu
4
+
5
+ describe QueryParser do
6
+ describe "parse_query" do
7
+ it "should use the database's stopper" do
8
+ xdb = XapianDb.new(:stopper => :french)
9
+ qp = QueryParser.new(:database => xdb)
10
+ terms = qp.parse_query("avec and").terms.collect { |t| t.term }
11
+ terms.should_not include "Zavec"
12
+ terms.should include "Zand"
13
+ end
14
+
15
+ it "should use the database's stemmer" do
16
+ xdb = XapianDb.new(:stemmer => :french)
17
+ qp = QueryParser.new(:database => xdb)
18
+ terms = qp.parse_query("contournait fishing").terms.collect { |t| t.term }
19
+ terms.should include "Zcontourn"
20
+ terms.should_not include "Zfish"
21
+ end
22
+
23
+ it "should use the :fields option to set field names" do
24
+ qp = QueryParser.new(:fields => [:name, :age])
25
+ terms = qp.parse_query("name:john age:30").terms.collect { |t| t.term }
26
+ terms.should include "XNAMEjohn"
27
+ terms.should_not include "john"
28
+ terms.should include "XAGE30"
29
+ terms.should_not include "30"
30
+ end
31
+
32
+ it "should use the database's field names as prefixes" do
33
+ xdb = XapianDb.new(:fields => [:name], :stemmer => :none)
34
+ qp = QueryParser.new(:database => xdb)
35
+ terms = qp.parse_query("name:john").terms.collect { |t| t.term }
36
+ terms.should include "XNAMEjohn"
37
+ terms.should_not include "john"
38
+ end
39
+
40
+ end
41
+
42
+ end
43
+
@@ -0,0 +1,57 @@
1
+ require 'xapian'
2
+ require 'lib/xapian_fu.rb'
3
+ include XapianFu
4
+ require 'fileutils'
5
+
6
+ describe StopperFactory do
7
+ describe "stopper_for" do
8
+ it "should return a SimpleStopper loaded with the given languages stop words" do
9
+ stopper = StopperFactory.stopper_for(:english)
10
+ stopper.should be_a_kind_of Xapian::SimpleStopper
11
+ stopper.call("and").should be_true
12
+ stopper.call("theremin").should_not be_true
13
+ end
14
+
15
+ it "should return the given stopper unmodified if given a Xapian::Stopper object" do
16
+ stopper = Xapian::SimpleStopper.new
17
+ StopperFactory.stopper_for(stopper).should === stopper
18
+ end
19
+ end
20
+
21
+ describe "stop_words_for" do
22
+
23
+ it "should return an array of words for the given language" do
24
+ words = StopperFactory.stop_words_for(:english)
25
+ words.should be_a_kind_of Array
26
+ words.should_not be_empty
27
+ words.should include 'and'
28
+ words.should include "they're"
29
+ end
30
+
31
+ %w(danish dutch english finnish french german hungarian italian norwegian portuguese russian spanish swedish).each do |lang|
32
+ describe lang do
33
+ it "should return an array of words" do
34
+ words = StopperFactory.stop_words_for(lang.to_sym)
35
+ words.should_not be_empty
36
+ end
37
+
38
+ it "should return an array with no empty strings, nils or pipes" do
39
+ StopperFactory.stop_words_for(lang.to_sym).should_not include ''
40
+ StopperFactory.stop_words_for(lang.to_sym).should_not include nil
41
+ StopperFactory.stop_words_for(lang.to_sym).should_not include '|'
42
+ end
43
+ end
44
+ end
45
+
46
+
47
+ it "should raise a UnsupportedStopperLanguage error if there is no data for the given language" do
48
+ Proc.new { StopperFactory.stop_words_for(:no_existy) }.should raise_error UnsupportedStopperLanguage
49
+ end
50
+
51
+ it "should return characters in utf8" do
52
+ words = StopperFactory.stop_words_for(:russian)
53
+ words.should include "человек"
54
+ end
55
+
56
+ end
57
+ end