xapian-fu 0.2 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +152 -13
- data/examples/query.rb +34 -6
- data/examples/spider.rb +44 -15
- data/lib/xapian_fu/query_parser.rb +179 -0
- data/lib/xapian_fu/result_set.rb +52 -0
- data/lib/xapian_fu/stopper_factory.rb +40 -0
- data/lib/xapian_fu/stopwords/README +7 -0
- data/lib/xapian_fu/stopwords/danish.txt +102 -0
- data/lib/xapian_fu/stopwords/dutch.txt +113 -0
- data/lib/xapian_fu/stopwords/english.txt +312 -0
- data/lib/xapian_fu/stopwords/finnish.txt +89 -0
- data/lib/xapian_fu/stopwords/french.txt +168 -0
- data/lib/xapian_fu/stopwords/german.txt +286 -0
- data/lib/xapian_fu/stopwords/hungarian.txt +203 -0
- data/lib/xapian_fu/stopwords/italian.txt +295 -0
- data/lib/xapian_fu/stopwords/norwegian.txt +186 -0
- data/lib/xapian_fu/stopwords/portuguese.txt +245 -0
- data/lib/xapian_fu/stopwords/russian.txt +236 -0
- data/lib/xapian_fu/stopwords/spanish.txt +348 -0
- data/lib/xapian_fu/stopwords/swedish.txt +125 -0
- data/lib/xapian_fu/stopwords/update.rb +7 -0
- data/lib/xapian_fu/xapian_db.rb +215 -99
- data/lib/xapian_fu/xapian_doc.rb +229 -47
- data/lib/xapian_fu/xapian_doc_value_accessor.rb +125 -0
- data/lib/xapian_fu/xapian_documents_accessor.rb +82 -0
- data/lib/xapian_fu.rb +1 -0
- data/spec/query_parser_spec.rb +43 -0
- data/spec/stopper_factory_spec.rb +57 -0
- data/spec/xapian_db_spec.rb +458 -215
- data/spec/xapian_doc_spec.rb +180 -0
- data/spec/xapian_doc_value_accessor_spec.rb +92 -0
- metadata +29 -5
data/lib/xapian_fu/xapian_doc.rb
CHANGED
@@ -1,20 +1,72 @@
|
|
1
|
-
|
1
|
+
class Time #:nodoc:
  # Format this time, expressed in UTC, as a "YYYYMMDDHHMMSS" string.
  #
  # getutc is used instead of utc: Time#utc converts the receiver itself
  # to UTC, so formatting a time used to change the caller's object as a
  # side effect. getutc returns a new Time and leaves the receiver alone.
  def to_xapian_fu_string
    getutc.strftime("%Y%m%d%H%M%S")
  end
end
|
6
|
+
|
7
|
+
class Date #:nodoc:
  # Render this date as a "YYYYMMDD" string (year, month and day with no
  # separators).
  def to_xapian_fu_string
    layout = "%Y%m%d"
    strftime(layout)
  end
end
|
12
|
+
|
13
|
+
require 'date'
|
14
|
+
|
15
|
+
class DateTime #:nodoc:
  # Render this date-time as a "YYYYMMDDHHMMSS" string. The fields are
  # taken as-is from the receiver; no offset conversion is applied here.
  def to_xapian_fu_string
    layout = "%Y%m%d%H%M%S"
    strftime(layout)
  end
end
|
20
|
+
|
21
|
+
module XapianFu #:nodoc:
|
22
|
+
require 'xapian_doc_value_accessor'
|
2
23
|
|
24
|
+
# Raised whenever a XapianDb is needed but has not been provided,
|
25
|
+
# such as when retrieving the terms list for a document
|
3
26
|
class XapianDbNotSet < XapianFuError ; end
|
4
|
-
|
27
|
+
# Raised if a given value cannot be stored in the database (anything
|
28
|
+
# without a to_s method)
|
5
29
|
class XapianTypeError < XapianFuError ; end
|
6
|
-
|
30
|
+
|
31
|
+
# A XapianDoc represents a document in a XapianDb. Searches return
|
32
|
+
# XapianDoc objects and they are used internally when adding new
|
33
|
+
# documents to the database. You usually don't need to instantiate
|
34
|
+
# them yourself unless you're doing something a bit advanced.
|
7
35
|
class XapianDoc
|
8
|
-
|
9
|
-
|
10
|
-
|
36
|
+
|
37
|
+
# A hash of the fields given to this object on initialize
|
38
|
+
attr_reader :fields
|
39
|
+
|
40
|
+
# An arbitrary blob of data stored alongside the document in the
|
41
|
+
# Xapian database.
|
42
|
+
attr_reader :data
|
43
|
+
|
44
|
+
# The search score of this document when returned as part of a
|
45
|
+
# search result
|
46
|
+
attr_reader :weight
|
47
|
+
|
48
|
+
# The Xapian::Match object for this document when returned as part
|
49
|
+
# of a search result.
|
50
|
+
attr_reader :match
|
51
|
+
|
52
|
+
# The unsigned integer "primary key" for this document in the
|
53
|
+
# Xapian database.
|
54
|
+
attr_accessor :id
|
55
|
+
|
56
|
+
# The XapianDb object that this document was retrieved from, or
|
57
|
+
# should be stored in.
|
58
|
+
attr_accessor :db
|
11
59
|
|
12
60
|
# Expects a Xapian::Document, a Hash-like object, or anything that
|
13
61
|
# has a to_s method. Anything else raises a XapianTypeError.
|
14
|
-
#
|
15
|
-
# <tt>:data</tt>
|
16
|
-
#
|
62
|
+
# The <tt>:weight</tt> option sets the search weight when setting
|
63
|
+
# up search results. The <tt>:data</tt> option sets some
|
64
|
+
# additional data to be stored with the document in the database.
|
65
|
+
# The <tt>:xapian_db</tt> option sets the XapianDb to allow saves
|
66
|
+
# and term enumeration.
|
17
67
|
def initialize(doc, options = {})
|
68
|
+
@options = options
|
69
|
+
|
18
70
|
@fields = {}
|
19
71
|
if doc.is_a? Xapian::Match
|
20
72
|
match = doc
|
@@ -28,19 +80,8 @@ module XapianFu
|
|
28
80
|
if doc.is_a?(Xapian::Document)
|
29
81
|
@xapian_document = doc
|
30
82
|
@id = doc.docid
|
31
|
-
begin
|
32
|
-
xdoc_data = Marshal::load(doc.data) unless doc.data.empty?
|
33
|
-
rescue ArgumentError
|
34
|
-
@data = nil
|
35
|
-
end
|
36
|
-
if xdoc_data.is_a? Hash
|
37
|
-
@data = xdoc_data.delete(:__data)
|
38
|
-
@fields = xdoc_data
|
39
|
-
else
|
40
|
-
@data = xdoc_data
|
41
|
-
end
|
42
83
|
# Handle initialisation from a hash-like object
|
43
|
-
elsif doc.respond_to?(
|
84
|
+
elsif doc.respond_to?(:has_key?) and doc.respond_to?("[]")
|
44
85
|
@fields = doc
|
45
86
|
@id = doc[:id] if doc.has_key?(:id)
|
46
87
|
# Handle initialisation from anything else that can be coerced
|
@@ -52,18 +93,21 @@ module XapianFu
|
|
52
93
|
end
|
53
94
|
@weight = options[:weight] if options[:weight]
|
54
95
|
@data = options[:data] if options[:data]
|
96
|
+
@db = options[:xapian_db] if options[:xapian_db]
|
55
97
|
end
|
56
98
|
|
57
|
-
#
|
58
|
-
#
|
59
|
-
|
60
|
-
|
61
|
-
def get_value(vkey)
|
62
|
-
raise XapianDocNotSet unless @xapian_document
|
63
|
-
vkey = vkey.to_s.hash unless vkey.is_a? Integer
|
64
|
-
@xapian_document.value(vkey)
|
99
|
+
# The arbitrary data stored in the Xapian database with this document.
# When @data has not been set, it is read from the underlying
# Xapian::Document and cached. Returns an empty string if none is
# available.
def data
  @data = xapian_document.data unless @data
  @data
end
|
66
104
|
|
105
|
+
# The XapianFu::XapianDocValueAccessor used to read and write the values
# of this document. It is built on first use and reused afterwards.
def values
  @value_accessor = XapianDocValueAccessor.new(self) unless @value_accessor
  @value_accessor
end
|
110
|
+
|
67
111
|
# Return a list of terms that the db has for this document.
|
68
112
|
def terms
|
69
113
|
raise XapianFu::XapianDbNotSet unless db
|
@@ -74,17 +118,25 @@ module XapianFu
|
|
74
118
|
# database. Requires that the db attribute has been set up.
|
75
119
|
def to_xapian_document
  raise XapianFu::XapianDbNotSet unless db

  xdoc = xapian_document
  xdoc.data = data

  # Values: drop whatever is there, then re-add from the fields
  xdoc.clear_values
  add_values_to_xapian_document

  # Terms: drop whatever is there, then re-run the term generator
  xdoc.clear_terms
  generate_terms

  xdoc
end
|
82
130
|
|
83
|
-
#
|
84
|
-
|
85
|
-
|
131
|
+
# The Xapian::Document behind this XapianDoc. If one was already
# assigned (for example when this document was built from a
# Xapian::Document) it is returned; otherwise a fresh
# Xapian::Document is allocated, remembered and returned.
def xapian_document
  return @xapian_document if @xapian_document

  @xapian_document = Xapian::Document.new
end
|
87
138
|
|
139
|
+
# Compare IDs with another XapianDoc
|
88
140
|
def ==(b)
|
89
141
|
if b.is_a?(XapianDoc)
|
90
142
|
id == b.id
|
@@ -94,24 +146,154 @@ module XapianFu
|
|
94
146
|
end
|
95
147
|
|
96
148
|
# Short description of this document: class name, id, the weight (to
# five decimals, only when one is set) and the db.
def inspect
  parts = ["<#{self.class.to_s} id=#{id}"]
  parts.push("weight=%.5f" % weight) if weight
  parts.push("db=#{db.nil? ? 'nil' : db}")
  "#{parts.join(' ')}>"
end
|
154
|
+
|
155
|
+
# Store this document in the Xapian database: it is replaced (update)
# when it already has an id and added (create) when it does not.
def save
  if id
    update
  else
    create
  end
end
|
160
|
+
|
161
|
+
# Add this document to the Xapian database and record the value
# returned by add_document as this document's id.
def create
  new_id = db.rw.add_document(to_xapian_document)
  self.id = new_id
end
|
165
|
+
|
166
|
+
# Replace the document stored under this document's id in the Xapian
# database with the current contents of this document.
def update
  writable = db.rw
  writable.replace_document(id, to_xapian_document)
end
|
170
|
+
|
171
|
+
# Set the stemmer to use for this document. The argument is handed to
# StemFactory.stemmer_for, so it may be any string the Xapian::Stem
# class accepts (the English name of the language or its two letter
# ISO639 code) or an existing Xapian::Stem object.
def stemmer=(s)
  resolved = StemFactory.stemmer_for(s)
  @stemmer = resolved
end
|
178
|
+
|
179
|
+
# Return the stemmer for this document. If not set on initialize by
# the :stemmer or :language option, it will try the database's stemmer
# and otherwise defaults to an English stemmer.
#
# The result is cached in @stemmer. The requested setting is resolved
# into a local first and @stemmer is only assigned once
# StemFactory.stemmer_for has succeeded; previously the raw option was
# stored in @stemmer beforehand, so a failure in stemmer_for left the
# raw Symbol/String cached and later calls returned it as if it were
# a stemmer.
def stemmer
  return @stemmer if @stemmer

  requested =
    if !@options[:stemmer].nil?
      # an explicit false is kept so it reaches the factory
      @options[:stemmer]
    elsif @options[:language]
      @options[:language]
    elsif db
      db.stemmer
    else
      :english
    end
  @stemmer = StemFactory.stemmer_for(requested)
end
|
199
|
+
|
200
|
+
# Return the stopper for this document. If not set on initialize by
# the :stopper or :language option, it will try the database's stopper
# and otherwise default to an English stopper.
#
# The result is cached in @stopper. The requested setting is resolved
# into a local first and @stopper is only assigned once
# StopperFactory.stopper_for has succeeded; previously the raw option
# was stored in @stopper beforehand, so a failure in stopper_for left
# the raw Symbol/String cached and later calls returned it as if it
# were a stopper.
def stopper
  return @stopper if @stopper

  requested =
    if !@options[:stopper].nil?
      # an explicit false is kept so it reaches the factory
      @options[:stopper]
    elsif @options[:language]
      @options[:language]
    elsif db
      db.stopper
    else
      :english
    end
  @stopper = StopperFactory.stopper_for(requested)
end
|
220
|
+
|
221
|
+
# Return this document's language: the :language option given on
# initialize when it is not nil, else the database's language when a
# database is set and reports one, else :english. The answer is kept
# in @language.
def language
  return @language if @language

  @language =
    if !@options[:language].nil?
      @options[:language]
    elsif db && db.language
      db.language
    else
      :english
    end
end
|
99
237
|
|
100
238
|
private
|
101
239
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
240
|
+
# Field names that must not be run through the TermGenerator: the
# database's unindexed_fields, or an empty array when no db is set.
def unindexed_fields
  return [] unless db

  db.unindexed_fields
end
|
244
|
+
|
245
|
+
# Copy every field named in db.store_values from the fields hash into
# the document's values. Returns the list of keys that were processed.
def add_values_to_xapian_document
  db.store_values.map do |field_name|
    values[field_name] = fields[field_name]
    field_name
  end
end
|
252
|
+
|
253
|
+
# Run the Xapian term generator against this document's text. Every
# field not listed in unindexed_fields is indexed twice: once with an
# "X" + upcased field name prefix and once without any prefix.
# Returns the Xapian::Document.
def generate_terms
  generator = Xapian::TermGenerator.new
  generator.database = db.rw
  generator.document = xapian_document
  generator.stopper = stopper
  generator.stemmer = stemmer
  index_method = db.index_positions ? :index_text : :index_text_without_positions
  fields.each do |name, value|
    next if unindexed_fields.include?(name)

    # values that know how to render themselves for indexing do so;
    # everything else is indexed via to_s
    text = value.respond_to?(:to_xapian_fu_string) ? value.to_xapian_fu_string : value.to_s
    # add value with field name
    generator.send(index_method, text, 1, 'X' + name.to_s.upcase)
    # add value without field name
    generator.send(index_method, text)
  end
  xapian_document
end
|
108
275
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
276
|
+
end
|
277
|
+
|
278
|
+
|
279
|
+
class StemFactory
  # Return a Xapian::Stem object for the given option.
  #
  # * a Xapian::Stem is returned unchanged
  # * a String or Symbol is passed (as a string) to Xapian::Stem.new, so
  #   it may be anything that class accepts (the English name of the
  #   language or the two letter ISO639 code)
  # * anything else, including false and nil, gives a "none" stemmer
  def self.stemmer_for(stemmer)
    case stemmer
    when Xapian::Stem
      stemmer
    when String, Symbol
      Xapian::Stem.new(stemmer.to_s)
    else
      Xapian::Stem.new("none")
    end
  end
end
|
298
|
+
|
117
299
|
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
class Integer #:nodoc:
  # Encode this integer for storage with Array#pack's "l" directive.
  # NOTE(review): "l" is a 32-bit, native-endian directive, so larger
  # values and cross-platform databases look unsupported - confirm.
  def to_xapian_fu_storage_value
    wrapped = [self]
    wrapped.pack("l")
  end

  # Decode a string written by to_xapian_fu_storage_value. Returns
  # whatever unpack yields first, which is nil for an empty string.
  def self.from_xapian_fu_storage_value(value)
    number, = value.unpack("l")
    number
  end
end
|
10
|
+
|
11
|
+
# Bignum gets its own storage format only where it is a class of its
# own (Ruby before 2.4). From Ruby 2.4 to 3.1 the constant Bignum is a
# deprecated alias of Integer, so reopening it unconditionally would
# replace Integer's storage methods with the double-based ones below.
# From Ruby 3.2 the constant no longer exists, and reopening it would
# only create an unrelated class.
if Object.const_defined?(:Bignum) && !Object.const_get(:Bignum).equal?(Integer)
  class Bignum #:nodoc:
    # Encode as a big-endian double ("G" directive of Array#pack).
    def to_xapian_fu_storage_value
      [self].pack("G")
    end

    # Decode a string written by to_xapian_fu_storage_value. The
    # result is whatever unpack("G") yields first (nil for an empty
    # string).
    def self.from_xapian_fu_storage_value(value)
      value.unpack("G").first
    end
  end
end
|
20
|
+
|
21
|
+
class Float #:nodoc:
  # Encode this float for storage with Array#pack's "G" directive.
  def to_xapian_fu_storage_value
    wrapped = [self]
    wrapped.pack("G")
  end

  # Decode a string written by to_xapian_fu_storage_value. Returns
  # whatever unpack yields first, which is nil for an empty string.
  def self.from_xapian_fu_storage_value(value)
    number, = value.unpack("G")
    number
  end
end
|
30
|
+
|
31
|
+
class Time #:nodoc:
  # Encode this time for storage as its epoch seconds (a float) packed
  # with Array#pack's "G" directive.
  #
  # to_f is called on the receiver directly: the epoch value does not
  # depend on the time zone, and the former self.utc call converted the
  # caller's object to UTC as a side effect.
  def to_xapian_fu_storage_value
    [to_f].pack("G")
  end

  # Decode a string written by to_xapian_fu_storage_value into a Time.
  # Returns nil when nothing can be unpacked (for example the empty
  # string handed back for a value that was never stored) instead of
  # failing inside Time.at.
  def self.from_xapian_fu_storage_value(value)
    seconds = value.unpack("G").first
    seconds.nil? ? nil : Time.at(seconds)
  end
end
|
40
|
+
|
41
|
+
class Date #:nodoc:
  # Encode this date for storage as its to_s representation.
  def to_xapian_fu_storage_value
    to_s
  end

  # Decode a string written by to_xapian_fu_storage_value by parsing it
  # with the receiving class (so DateTime yields a DateTime).
  # Returns nil for an empty string - which is what is handed back for
  # a value that was never stored - instead of letting parse raise.
  def self.from_xapian_fu_storage_value(value)
    return nil if value.to_s.empty?

    parse(value)
  end
end
|
50
|
+
|
51
|
+
module XapianFu #:nodoc:
|
52
|
+
|
53
|
+
# A XapianDocValueAccessor provides the XapianDoc#values interface for
# reading and writing the values of a document. It is usually set up
# by a XapianDoc so you shouldn't need to create your own.
class XapianDocValueAccessor
  def initialize(xapian_doc)
    @doc = xapian_doc
  end

  # Add the given <tt>value</tt> under the given <tt>key</tt> to the
  # XapianDoc and return the value unchanged.
  #
  # The type is the optional argument or, when that is nil and the
  # document has a db, the db's fields entry for the key. If there is
  # a type, the value is of that type and it responds to
  # <tt>to_xapian_fu_storage_value</tt>, that method produces what is
  # stored; in every other case <tt>to_s</tt> is used. This is
  # usually paired with a <tt>from_xapian_fu_storage_value</tt> class
  # method on retrieval.
  def store(key, value, type = nil)
    type = @doc.db.fields[key] if type.nil? && @doc.db
    custom = type && value.is_a?(type) && value.respond_to?(:to_xapian_fu_storage_value)
    converted_value = custom ? value.to_xapian_fu_storage_value : value.to_s
    @doc.xapian_document.add_value(value_key(key), converted_value)
    value
  end
  alias_method "[]=", :store

  # Retrieve the value with the given <tt>key</tt> from the XapianDoc.
  # <tt>key</tt> can be a symbol or string, in which case it is hashed
  # to get an integer value number, or the integer value number
  # itself.
  #
  # If the class found in the database fields for this key (or given
  # as the optional argument) responds to
  # <tt>from_xapian_fu_storage_value</tt>, it is used to instantiate
  # the object from the stored value; otherwise the stored value is
  # returned as is. This is usually paired with a
  # <tt>to_xapian_fu_storage_value</tt> instance method.
  #
  # Due to the design of Xapian, if the value does not exist then an
  # empty string is returned.
  def fetch(key, type = nil)
    value = @doc.xapian_document.value(value_key(key))
    type = @doc.db.fields[key] if type.nil? && @doc.db
    decodable = type && type.respond_to?(:from_xapian_fu_storage_value)
    decodable ? type.from_xapian_fu_storage_value(value) : value
  end
  alias_method "[]", :fetch

  # Count the values stored in the XapianDoc
  def size
    @doc.xapian_document.values_count
  end

  # Remove the value with the given key from the XapianDoc and return
  # what fetch gave for it before the removal.
  def delete(key)
    removed = fetch(key)
    @doc.xapian_document.remove_value(value_key(key))
    removed
  end

  private

  # Convert the given key to an integer usable as a Xapian value
  # number: integers pass through, anything else is turned into a
  # string and hashed.
  # NOTE(review): String#hash is not guaranteed to be stable between
  # Ruby processes - confirm slots stay valid across runs.
  def value_key(key)
    return key if key.is_a?(Integer)

    key.to_s.hash
  end
end
|
125
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module XapianFu
|
2
|
+
# A XapianDocumentsAccessor provides the XapianDb#documents interface.
# It is usually set up by a XapianDb so you shouldn't need to create
# your own.
class XapianDocumentsAccessor
  def initialize(xdb) #:nodoc:
    @xdb = xdb
  end

  # Build a new XapianDoc for this database. The given options are
  # passed on with :xapian_db set to this accessor's database.
  def new(doc = nil, options = { })
    XapianDoc.new(doc, options.merge(:xapian_db => @xdb))
  end

  # Add a document to the index and return it as a XapianDoc.
  #
  # Anything that is not already a XapianDoc is wrapped in one first
  # (see XapianDoc.new for what is accepted, e.g. a hash of field
  # names to values). The document's db is then set to this
  # accessor's database and the document is saved.
  def add(doc)
    doc = XapianDoc.new(doc) unless doc.is_a?(XapianDoc)
    doc.db = @xdb
    doc.save
    doc
  end
  alias_method "<<", :add

  # Return the document with the given id from the database.
  # A RuntimeError whose message starts with "DocNotFoundError" is
  # re-raised as XapianFu::DocNotFound; any other RuntimeError is
  # raised again unchanged.
  def find(doc_id)
    xdoc = @xdb.ro.document(doc_id)
    XapianDoc.new(xdoc, :xapian_db => @xdb)
  rescue RuntimeError => e
    raise XapianFu::DocNotFound if e.to_s =~ /^DocNotFoundError/
    raise e
  end

  # Return the document with the given id from the database or nil
  # if it doesn't exist
  def [](doc_id)
    find(doc_id)
  rescue XapianFu::DocNotFound
    nil
  end

  # Delete the given document from the database and return the
  # document id. The argument is converted with to_i; an argument
  # without to_i is ignored. Returns nil when nothing was deleted
  # (no to_i, or the database reported a DocNotFoundError).
  def delete(doc)
    return nil unless doc.respond_to?(:to_i)

    @xdb.rw.delete_document(doc.to_i)
    doc.to_i
  rescue RuntimeError => e
    raise e unless e.to_s =~ /^DocNotFoundError/
  end

  # Return the document with the highest value in the specified field
  # or nil if it doesn't exist. Any StandardError raised while looking
  # the document up also results in nil.
  def max(key = :id)
    if key == :id
      # for :id we can use lastdocid
      begin
        find(@xdb.ro.lastdocid)
      rescue StandardError
        nil
      end
    else
      # for other values, we do a search ordered by that key in
      # descending order
      query = Xapian::Query.new(Xapian::Query::OP_VALUE_GE, key.to_s.hash, "0")
      enquire = Xapian::Enquire.new(@xdb.ro)
      enquire.query = query
      enquire.sort_by_value!(key.to_s.hash)
      top = enquire.mset(0, 1).matches.first
      begin
        # top is nil when nothing matched; the resulting error is
        # rescued just like a failed lookup
        find(top.docid)
      rescue StandardError
        nil
      end
    end
  end
end
|
82
|
+
end
|
data/lib/xapian_fu.rb
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
|
5
|
+
describe QueryParser do
|
6
|
+
describe "parse_query" do
|
7
|
+
it "should use the database's stopper" do
|
8
|
+
xdb = XapianDb.new(:stopper => :french)
|
9
|
+
qp = QueryParser.new(:database => xdb)
|
10
|
+
terms = qp.parse_query("avec and").terms.collect { |t| t.term }
|
11
|
+
terms.should_not include "Zavec"
|
12
|
+
terms.should include "Zand"
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should use the database's stemmer" do
|
16
|
+
xdb = XapianDb.new(:stemmer => :french)
|
17
|
+
qp = QueryParser.new(:database => xdb)
|
18
|
+
terms = qp.parse_query("contournait fishing").terms.collect { |t| t.term }
|
19
|
+
terms.should include "Zcontourn"
|
20
|
+
terms.should_not include "Zfish"
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should use the :fields option to set field names" do
|
24
|
+
qp = QueryParser.new(:fields => [:name, :age])
|
25
|
+
terms = qp.parse_query("name:john age:30").terms.collect { |t| t.term }
|
26
|
+
terms.should include "XNAMEjohn"
|
27
|
+
terms.should_not include "john"
|
28
|
+
terms.should include "XAGE30"
|
29
|
+
terms.should_not include "30"
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should use the database's field names as prefixes" do
|
33
|
+
xdb = XapianDb.new(:fields => [:name], :stemmer => :none)
|
34
|
+
qp = QueryParser.new(:database => xdb)
|
35
|
+
terms = qp.parse_query("name:john").terms.collect { |t| t.term }
|
36
|
+
terms.should include "XNAMEjohn"
|
37
|
+
terms.should_not include "john"
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
describe StopperFactory do
|
7
|
+
describe "stopper_for" do
|
8
|
+
it "should return a SimpleStopper loaded with the given languages stop words" do
|
9
|
+
stopper = StopperFactory.stopper_for(:english)
|
10
|
+
stopper.should be_a_kind_of Xapian::SimpleStopper
|
11
|
+
stopper.call("and").should be_true
|
12
|
+
stopper.call("theremin").should_not be_true
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return the given stopper unmodified if given a Xapian::Stopper object" do
|
16
|
+
stopper = Xapian::SimpleStopper.new
|
17
|
+
StopperFactory.stopper_for(stopper).should === stopper
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "stop_words_for" do
|
22
|
+
|
23
|
+
it "should return an array of words for the given language" do
|
24
|
+
words = StopperFactory.stop_words_for(:english)
|
25
|
+
words.should be_a_kind_of Array
|
26
|
+
words.should_not be_empty
|
27
|
+
words.should include 'and'
|
28
|
+
words.should include "they're"
|
29
|
+
end
|
30
|
+
|
31
|
+
%w(danish dutch english finnish french german hungarian italian norwegian portuguese russian spanish swedish).each do |lang|
|
32
|
+
describe lang do
|
33
|
+
it "should return an array of words" do
|
34
|
+
words = StopperFactory.stop_words_for(lang.to_sym)
|
35
|
+
words.should_not be_empty
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should return an array with no empty strings, nils or pipes" do
|
39
|
+
StopperFactory.stop_words_for(lang.to_sym).should_not include ''
|
40
|
+
StopperFactory.stop_words_for(lang.to_sym).should_not include nil
|
41
|
+
StopperFactory.stop_words_for(lang.to_sym).should_not include '|'
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
|
47
|
+
it "should raise a UnsupportedStopperLanguage error if there is no data for the given language" do
|
48
|
+
Proc.new { StopperFactory.stop_words_for(:no_existy) }.should raise_error UnsupportedStopperLanguage
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should return characters in utf8" do
|
52
|
+
words = StopperFactory.stop_words_for(:russian)
|
53
|
+
words.should include "человек"
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
end
|