xapian-fu 0.2 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +152 -13
- data/examples/query.rb +34 -6
- data/examples/spider.rb +44 -15
- data/lib/xapian_fu/query_parser.rb +179 -0
- data/lib/xapian_fu/result_set.rb +52 -0
- data/lib/xapian_fu/stopper_factory.rb +40 -0
- data/lib/xapian_fu/stopwords/README +7 -0
- data/lib/xapian_fu/stopwords/danish.txt +102 -0
- data/lib/xapian_fu/stopwords/dutch.txt +113 -0
- data/lib/xapian_fu/stopwords/english.txt +312 -0
- data/lib/xapian_fu/stopwords/finnish.txt +89 -0
- data/lib/xapian_fu/stopwords/french.txt +168 -0
- data/lib/xapian_fu/stopwords/german.txt +286 -0
- data/lib/xapian_fu/stopwords/hungarian.txt +203 -0
- data/lib/xapian_fu/stopwords/italian.txt +295 -0
- data/lib/xapian_fu/stopwords/norwegian.txt +186 -0
- data/lib/xapian_fu/stopwords/portuguese.txt +245 -0
- data/lib/xapian_fu/stopwords/russian.txt +236 -0
- data/lib/xapian_fu/stopwords/spanish.txt +348 -0
- data/lib/xapian_fu/stopwords/swedish.txt +125 -0
- data/lib/xapian_fu/stopwords/update.rb +7 -0
- data/lib/xapian_fu/xapian_db.rb +215 -99
- data/lib/xapian_fu/xapian_doc.rb +229 -47
- data/lib/xapian_fu/xapian_doc_value_accessor.rb +125 -0
- data/lib/xapian_fu/xapian_documents_accessor.rb +82 -0
- data/lib/xapian_fu.rb +1 -0
- data/spec/query_parser_spec.rb +43 -0
- data/spec/stopper_factory_spec.rb +57 -0
- data/spec/xapian_db_spec.rb +458 -215
- data/spec/xapian_doc_spec.rb +180 -0
- data/spec/xapian_doc_value_accessor_spec.rb +92 -0
- metadata +29 -5
data/lib/xapian_fu/xapian_doc.rb
CHANGED
@@ -1,20 +1,72 @@
|
|
1
|
-
|
1
|
+
class Time #:nodoc:
  # Return this time as a compact UTC timestamp string of the form
  # YYYYMMDDHHMMSS. XapianDoc uses this string when feeding field values
  # to the term generator.
  #
  # getutc is used instead of utc: Time#utc converts the receiver in
  # place, which would silently switch the caller's object to UTC (and
  # fail on a frozen Time). getutc returns a converted copy.
  def to_xapian_fu_string
    getutc.strftime("%Y%m%d%H%M%S")
  end
end
|
6
|
+
|
7
|
+
class Date #:nodoc:
|
8
|
+
def to_xapian_fu_string
|
9
|
+
strftime("%Y%m%d")
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
require 'date'
|
14
|
+
|
15
|
+
class DateTime #:nodoc:
|
16
|
+
def to_xapian_fu_string
|
17
|
+
strftime("%Y%m%d%H%M%S")
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
module XapianFu #:nodoc:
|
22
|
+
require 'xapian_doc_value_accessor'
|
2
23
|
|
24
|
+
# Raised whenever a XapianDb is needed but has not been provided,
|
25
|
+
# such as when retrieving the terms list for a document
|
3
26
|
class XapianDbNotSet < XapianFuError ; end
|
4
|
-
|
27
|
+
# Raised if a given value cannot be stored in the database (anything
|
28
|
+
# without a to_s method)
|
5
29
|
class XapianTypeError < XapianFuError ; end
|
6
|
-
|
30
|
+
|
31
|
+
# A XapianDoc represents a document in a XapianDb. Searches return
|
32
|
+
# XapianDoc objects and they are used internally when adding new
|
33
|
+
# documents to the database. You usually don't need to instantiate
|
34
|
+
# them yourself unless you're doing something a bit advanced.
|
7
35
|
class XapianDoc
|
8
|
-
|
9
|
-
|
10
|
-
|
36
|
+
|
37
|
+
# A hash of the fields given to this object on initialize
|
38
|
+
attr_reader :fields
|
39
|
+
|
40
|
+
# An arbitrary blob of data stored alongside the document in the
|
41
|
+
# Xapian database.
|
42
|
+
attr_reader :data
|
43
|
+
|
44
|
+
# The search score of this document when returned as part of a
|
45
|
+
# search result
|
46
|
+
attr_reader :weight
|
47
|
+
|
48
|
+
# The Xapian::Match object for this document when returned as part
|
49
|
+
# of a search result.
|
50
|
+
attr_reader :match
|
51
|
+
|
52
|
+
# The unsigned integer "primary key" for this document in the
|
53
|
+
# Xapian database.
|
54
|
+
attr_accessor :id
|
55
|
+
|
56
|
+
# The XapianDb object that this document was retrieved from, or
|
57
|
+
# should be stored in.
|
58
|
+
attr_accessor :db
|
11
59
|
|
12
60
|
# Expects a Xapian::Document, a Hash-like object, or anything
|
13
61
|
# with a to_s method. Anything else raises a XapianTypeError.
|
14
|
-
#
|
15
|
-
# <tt>:data</tt>
|
16
|
-
#
|
62
|
+
# The <tt>:weight</tt> option sets the search weight when setting
|
63
|
+
# up search results. The <tt>:data</tt> option sets some
|
64
|
+
# additional data to be stored with the document in the database.
|
65
|
+
# The <tt>:xapian_db</tt> option sets the XapianDb to allow saves
|
66
|
+
# and term enumeration.
|
17
67
|
def initialize(doc, options = {})
|
68
|
+
@options = options
|
69
|
+
|
18
70
|
@fields = {}
|
19
71
|
if doc.is_a? Xapian::Match
|
20
72
|
match = doc
|
@@ -28,19 +80,8 @@ module XapianFu
|
|
28
80
|
if doc.is_a?(Xapian::Document)
|
29
81
|
@xapian_document = doc
|
30
82
|
@id = doc.docid
|
31
|
-
begin
|
32
|
-
xdoc_data = Marshal::load(doc.data) unless doc.data.empty?
|
33
|
-
rescue ArgumentError
|
34
|
-
@data = nil
|
35
|
-
end
|
36
|
-
if xdoc_data.is_a? Hash
|
37
|
-
@data = xdoc_data.delete(:__data)
|
38
|
-
@fields = xdoc_data
|
39
|
-
else
|
40
|
-
@data = xdoc_data
|
41
|
-
end
|
42
83
|
# Handle initialisation from a hash-like object
|
43
|
-
elsif doc.respond_to?(
|
84
|
+
elsif doc.respond_to?(:has_key?) and doc.respond_to?("[]")
|
44
85
|
@fields = doc
|
45
86
|
@id = doc[:id] if doc.has_key?(:id)
|
46
87
|
# Handle initialisation from anything else that can be coerced
|
@@ -52,18 +93,21 @@ module XapianFu
|
|
52
93
|
end
|
53
94
|
@weight = options[:weight] if options[:weight]
|
54
95
|
@data = options[:data] if options[:data]
|
96
|
+
@db = options[:xapian_db] if options[:xapian_db]
|
55
97
|
end
|
56
98
|
|
57
|
-
#
|
58
|
-
#
|
59
|
-
|
60
|
-
|
61
|
-
def get_value(vkey)
|
62
|
-
raise XapianDocNotSet unless @xapian_document
|
63
|
-
vkey = vkey.to_s.hash unless vkey.is_a? Integer
|
64
|
-
@xapian_document.value(vkey)
|
99
|
+
# The arbitrary data stored in the Xapian database with this
|
100
|
+
# document. Returns an empty string if none available.
|
101
|
+
def data
|
102
|
+
@data ||= xapian_document.data
|
65
103
|
end
|
66
104
|
|
105
|
+
# The XapianFu::XapianDocValueAccessor for accessing the values in
|
106
|
+
# this document.
|
107
|
+
def values
|
108
|
+
@value_accessor ||= XapianDocValueAccessor.new(self)
|
109
|
+
end
|
110
|
+
|
67
111
|
# Return a list of terms that the db has for this document.
|
68
112
|
def terms
|
69
113
|
raise XapianFu::XapianDbNotSet unless db
|
@@ -74,17 +118,25 @@ module XapianFu
|
|
74
118
|
# database. Requires that the db attribute has been set up.
|
75
119
|
def to_xapian_document
|
76
120
|
raise XapianFu::XapianDbNotSet unless db
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
121
|
+
xapian_document.data = data
|
122
|
+
# Clear and add values
|
123
|
+
xapian_document.clear_values
|
124
|
+
add_values_to_xapian_document
|
125
|
+
# Clear and add terms
|
126
|
+
xapian_document.clear_terms
|
127
|
+
generate_terms
|
128
|
+
xapian_document
|
81
129
|
end
|
82
130
|
|
83
|
-
#
|
84
|
-
|
85
|
-
|
131
|
+
# The Xapian::Document for this XapianFu::Document. If this
|
132
|
+
# document was retrieved from a XapianDb then this will have been
|
133
|
+
# initialized by Xapian, otherwise a new Xapian::Document.new is
|
134
|
+
# allocated.
|
135
|
+
def xapian_document
|
136
|
+
@xapian_document ||= Xapian::Document.new
|
86
137
|
end
|
87
138
|
|
139
|
+
# Compare IDs with another XapianDoc
|
88
140
|
def ==(b)
|
89
141
|
if b.is_a?(XapianDoc)
|
90
142
|
id == b.id
|
@@ -94,24 +146,154 @@ module XapianFu
|
|
94
146
|
end
|
95
147
|
|
96
148
|
def inspect
|
97
|
-
"<#{self.class.to_s} id=#{id}
|
149
|
+
s = ["<#{self.class.to_s} id=#{id}"]
|
150
|
+
s << "weight=%.5f" % weight if weight
|
151
|
+
s << "db=#{db.nil? ? 'nil' : db}"
|
152
|
+
s.join(' ') + ">"
|
153
|
+
end
|
154
|
+
|
155
|
+
# Add this document to the Xapian Database, or replace it if it
|
156
|
+
# already has an id.
|
157
|
+
def save
|
158
|
+
id ? update : create
|
159
|
+
end
|
160
|
+
|
161
|
+
# Add this document to the Xapian Database
|
162
|
+
def create
|
163
|
+
self.id = db.rw.add_document(to_xapian_document)
|
164
|
+
end
|
165
|
+
|
166
|
+
# Update this document in the Xapian Database
|
167
|
+
def update
|
168
|
+
db.rw.replace_document(id, to_xapian_document)
|
169
|
+
end
|
170
|
+
|
171
|
+
# Set the stemmer to use for this document. Accepts any string
|
172
|
+
# that the Xapian::Stem class accepts (Either the English name for
|
173
|
+
# the language or the two letter ISO639 code). Can also be an
|
174
|
+
# existing Xapian::Stem object.
|
175
|
+
def stemmer=(s)
|
176
|
+
@stemmer = StemFactory.stemmer_for(s)
|
177
|
+
end
|
178
|
+
|
179
|
+
# Return the stemmer for this document. If not set on initialize
|
180
|
+
# by the :stemmer or :language option, it will try the database's
|
181
|
+
# stemmer and otherwise defaults to an English stemmer.
|
182
|
+
def stemmer
|
183
|
+
if @stemmer
|
184
|
+
@stemmer
|
185
|
+
else
|
186
|
+
@stemmer =
|
187
|
+
if ! @options[:stemmer].nil?
|
188
|
+
@options[:stemmer]
|
189
|
+
elsif @options[:language]
|
190
|
+
@options[:language]
|
191
|
+
elsif db
|
192
|
+
db.stemmer
|
193
|
+
else
|
194
|
+
:english
|
195
|
+
end
|
196
|
+
@stemmer = StemFactory.stemmer_for(@stemmer)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
# Return the stopper for this document. If not set on initialize
|
201
|
+
# by the :stopper or :language option, it will try the database's
|
202
|
+
# stopper and otherwise default to an English stopper.
|
203
|
+
def stopper
|
204
|
+
if @stopper
|
205
|
+
@stopper
|
206
|
+
else
|
207
|
+
@stopper =
|
208
|
+
if ! @options[:stopper].nil?
|
209
|
+
@options[:stopper]
|
210
|
+
elsif @options[:language]
|
211
|
+
@options[:language]
|
212
|
+
elsif db
|
213
|
+
db.stopper
|
214
|
+
else
|
215
|
+
:english
|
216
|
+
end
|
217
|
+
@stopper = StopperFactory.stopper_for(@stopper)
|
218
|
+
end
|
219
|
+
end
|
220
|
+
|
221
|
+
# Return this document's language which is set on initialize, inherited
|
222
|
+
# from the database or defaults to :english
|
223
|
+
def language
|
224
|
+
if @language
|
225
|
+
@language
|
226
|
+
else
|
227
|
+
@language =
|
228
|
+
if ! @options[:language].nil?
|
229
|
+
@options[:language]
|
230
|
+
elsif db and db.language
|
231
|
+
db.language
|
232
|
+
else
|
233
|
+
:english
|
234
|
+
end
|
235
|
+
end
|
98
236
|
end
|
99
237
|
|
100
238
|
private
|
101
239
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
240
|
+
# Array of field names not to run through the TermGenerator
|
241
|
+
def unindexed_fields
|
242
|
+
db ? db.unindexed_fields : []
|
243
|
+
end
|
244
|
+
|
245
|
+
# Add all the fields to be stored as XapianDb values
|
246
|
+
def add_values_to_xapian_document
|
247
|
+
db.store_values.collect do |key|
|
248
|
+
values[key] = fields[key]
|
249
|
+
key
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
# Run the Xapian term generator against this document's text
|
254
|
+
def generate_terms
|
255
|
+
tg = Xapian::TermGenerator.new
|
256
|
+
tg.database = db.rw
|
257
|
+
tg.document = xapian_document
|
258
|
+
tg.stopper = stopper
|
259
|
+
tg.stemmer = stemmer
|
260
|
+
index_method = db.index_positions ? :index_text : :index_text_without_positions
|
261
|
+
fields.each do |k,v|
|
262
|
+
next if unindexed_fields.include?(k)
|
263
|
+
if v.respond_to?(:to_xapian_fu_string)
|
264
|
+
v = v.to_xapian_fu_string
|
265
|
+
else
|
266
|
+
v = v.to_s
|
267
|
+
end
|
268
|
+
# add value with field name
|
269
|
+
tg.send(index_method, v, 1, 'X' + k.to_s.upcase)
|
270
|
+
# add value without field name
|
271
|
+
tg.send(index_method, v)
|
272
|
+
end
|
273
|
+
xapian_document
|
107
274
|
end
|
108
275
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
276
|
+
end
|
277
|
+
|
278
|
+
|
279
|
+
class StemFactory
|
280
|
+
# Return a Xapian::Stem object for the given option. Accepts any
|
281
|
+
# string that the Xapian::Stem class accepts (Either the English
|
282
|
+
# name for the language or the two letter ISO639 code).
|
283
|
+
#
|
284
|
+
# If given false or nil, will return a "none" stemmer.
|
285
|
+
#
|
286
|
+
# It will also accept and return an existing Xapian::Stem object.
|
287
|
+
#
|
288
|
+
def self.stemmer_for(stemmer)
|
289
|
+
if stemmer.is_a? Xapian::Stem
|
290
|
+
stemmer
|
291
|
+
elsif stemmer.is_a?(String) or stemmer.is_a?(Symbol)
|
292
|
+
Xapian::Stem.new(stemmer.to_s)
|
293
|
+
else
|
294
|
+
Xapian::Stem.new("none")
|
113
295
|
end
|
114
|
-
xdoc
|
115
296
|
end
|
116
297
|
end
|
298
|
+
|
117
299
|
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
class Integer #:nodoc:
|
2
|
+
def to_xapian_fu_storage_value
|
3
|
+
[self].pack("l")
|
4
|
+
end
|
5
|
+
|
6
|
+
def self.from_xapian_fu_storage_value(value)
|
7
|
+
value.unpack("l").first
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
class Bignum #:nodoc:
|
12
|
+
def to_xapian_fu_storage_value
|
13
|
+
[self].pack("G")
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.from_xapian_fu_storage_value(value)
|
17
|
+
value.unpack("G").first
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class Float #:nodoc:
|
22
|
+
def to_xapian_fu_storage_value
|
23
|
+
[self].pack("G")
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.from_xapian_fu_storage_value(value)
|
27
|
+
value.unpack("G").first
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
class Time #:nodoc:
  # Serialise this time for storage as a Xapian value: the number of
  # seconds since the epoch (including the fractional part) packed as a
  # big-endian double ("G").
  #
  # to_f is already independent of the time zone, so no UTC conversion
  # is needed. The previous call to Time#utc converted the receiver in
  # place, changing the caller's object as a side effect.
  def to_xapian_fu_storage_value
    [to_f].pack("G")
  end

  # Rebuild a Time from a string produced by to_xapian_fu_storage_value.
  # Returns nil when +value+ holds no packed double (for example the
  # empty string), instead of raising a TypeError from Time.at(nil).
  def self.from_xapian_fu_storage_value(value)
    seconds = value.unpack("G").first
    seconds.nil? ? nil : Time.at(seconds)
  end
end
|
40
|
+
|
41
|
+
class Date #:nodoc:
  # Serialise this date for storage as a Xapian value, using Date#to_s.
  def to_xapian_fu_storage_value
    to_s
  end

  # Rebuild a date from a stored value by parsing it. Returns nil for a
  # nil or empty value rather than letting parse raise, so a missing
  # stored value does not blow up on retrieval.
  #
  # parse is called on self, so subclasses get instances of their own
  # class back.
  def self.from_xapian_fu_storage_value(value)
    return nil if value.nil? || value.empty?
    parse(value)
  end
end
|
50
|
+
|
51
|
+
module XapianFu #:nodoc:
|
52
|
+
|
53
|
+
# A XapianDocValueAccessor is used to provide the XapianDoc#values
|
54
|
+
# interface to read and write field values to a XapianDb. It is
|
55
|
+
# usually set up by a XapianDoc so you shouldn't need to set up your
|
56
|
+
# own.
|
57
|
+
class XapianDocValueAccessor
|
58
|
+
def initialize(xapian_doc)
|
59
|
+
@doc = xapian_doc
|
60
|
+
end
|
61
|
+
|
62
|
+
# Add the given <tt>value</tt> with the given <tt>key</tt> to the
|
63
|
+
# XapianDoc. If the value has a
|
64
|
+
# <tt>to_xapian_fu_storage_value</tt> method then it is used to
|
65
|
+
# generate the final value to be stored, otherwise <tt>to_s</tt>
|
66
|
+
# is used. This is usually paired with a
|
67
|
+
# <tt>from_xapian_fu_storage_value</tt> class method on retrieval.
|
68
|
+
def store(key, value, type = nil)
|
69
|
+
type = @doc.db.fields[key] if type.nil? and @doc.db
|
70
|
+
if type and value.is_a?(type) and value.respond_to?(:to_xapian_fu_storage_value)
|
71
|
+
converted_value = value.to_xapian_fu_storage_value
|
72
|
+
else
|
73
|
+
converted_value = value.to_s
|
74
|
+
end
|
75
|
+
@doc.xapian_document.add_value(value_key(key), converted_value)
|
76
|
+
value
|
77
|
+
end
|
78
|
+
alias_method "[]=", :store
|
79
|
+
|
80
|
+
# Retrieve the value with the given <tt>key</tt> from the
|
81
|
+
# XapianDoc. <tt>key</tt> can be a symbol or string, in which case
|
82
|
+
# it's hashed to get an integer value number. Or you can give the
|
83
|
+
# integer value number if you know it.
|
84
|
+
#
|
85
|
+
# If the class specified in the database fields for this key (or
|
86
|
+
# as the optional argument) has a
|
87
|
+
# <tt>from_xapian_fu_storage_value</tt> method then it is used to
|
88
|
+
# instantiate the object from the stored value. This is usually
|
89
|
+
# paired with a <tt>to_xapian_fu_storage_value</tt> instance
|
90
|
+
# method.
|
91
|
+
#
|
92
|
+
# Due to the design of Xapian, if the value does not exist then an
|
93
|
+
# empty string is returned.
|
94
|
+
def fetch(key, type = nil)
|
95
|
+
value = @doc.xapian_document.value(value_key(key))
|
96
|
+
type = @doc.db.fields[key] if type.nil? and @doc.db
|
97
|
+
if type and type.respond_to?(:from_xapian_fu_storage_value)
|
98
|
+
type.from_xapian_fu_storage_value(value)
|
99
|
+
else
|
100
|
+
value
|
101
|
+
end
|
102
|
+
end
|
103
|
+
alias_method "[]", :fetch
|
104
|
+
|
105
|
+
# Count the values stored in the XapianDoc
|
106
|
+
def size
|
107
|
+
@doc.xapian_document.values_count
|
108
|
+
end
|
109
|
+
|
110
|
+
# Remove the value with the given key from the XapianDoc and return it
|
111
|
+
def delete(key)
|
112
|
+
value = fetch(key)
|
113
|
+
@doc.xapian_document.remove_value(value_key(key))
|
114
|
+
value
|
115
|
+
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
# Convert the given key to an integer that can be used as a Xapian
|
120
|
+
# value number
|
121
|
+
def value_key(key)
|
122
|
+
key.is_a?(Integer) ? key : key.to_s.hash
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module XapianFu
  # A XapianDocumentsAccessor is used to provide the
  # XapianDb#documents interface. It is usually set up by a XapianDb
  # so you shouldn't need to set up your own.
  class XapianDocumentsAccessor
    def initialize(xdb) #:nodoc:
      @xdb = xdb
    end

    # Build a new XapianDoc for this database. The given options are
    # passed through to XapianDoc.new with <tt>:xapian_db</tt> set to
    # this accessor's database.
    def new(doc = nil, options = { })
      options = options.merge({ :xapian_db => @xdb })
      XapianDoc.new(doc, options)
    end

    # Add a document to the index. A document can be just a hash, the
    # keys representing field names and their values the data to be
    # indexed. Or it can be a XapianDoc, or any object with a to_s method.
    #
    # Anything that is not already a XapianDoc is wrapped in one. The
    # document's db is then set to this accessor's database, the
    # document is saved, and the XapianDoc is returned.
    def add(doc)
      doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
      doc.db = @xdb
      doc.save
      doc
    end
    alias_method "<<", :add

    # Return the document with the given id from the database. Raises
    # a XapianFu::DocNotFound exception if it doesn't exist.
    #
    # The lookup signals a missing document with a RuntimeError whose
    # message starts with "DocNotFoundError"; that is translated here.
    # Any other RuntimeError is re-raised unchanged.
    def find(doc_id)
      xdoc = @xdb.ro.document(doc_id)
      XapianDoc.new(xdoc, :xapian_db => @xdb)
    rescue RuntimeError => e
      raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
    end

    # Return the document with the given id from the database or nil
    # if it doesn't exist
    def [](doc_id)
      find(doc_id)
    rescue XapianFu::DocNotFound
      nil
    end

    # Delete the given document from the database and return the
    # document id, or nil if it doesn't exist. Also returns nil,
    # without touching the database, when +doc+ does not respond to
    # to_i.
    def delete(doc)
      if doc.respond_to?(:to_i)
        @xdb.rw.delete_document(doc.to_i)
        doc.to_i
      end
    rescue RuntimeError => e
      # A missing document is not an error here; the method returns nil.
      raise e unless e.to_s =~ /^DocNotFoundError/
    end

    # Return the document with the highest value in the specified field
    # or nil if it doesn't exist
    def max(key = :id)
      if key == :id
        # for :id we can use lastdocid
        begin
          find(@xdb.ro.lastdocid)
        rescue StandardError
          nil
        end
      else
        # for other values, we do a search ordered by that key in
        # descending order and take the first match.
        # NOTE(review): the slot number is key.to_s.hash, which has to
        # agree with the slot used when the value was stored - confirm.
        slot = key.to_s.hash
        query = Xapian::Query.new(Xapian::Query::OP_VALUE_GE, slot, "0")
        e = Xapian::Enquire.new(@xdb.ro)
        e.query = query
        e.sort_by_value!(slot)
        r = e.mset(0, 1).matches.first
        # No matches at all: say so directly instead of relying on a
        # rescued NoMethodError from calling docid on nil.
        return nil if r.nil?
        begin
          find(r.docid)
        rescue StandardError
          nil
        end
      end
    end
  end
end
|
data/lib/xapian_fu.rb
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
|
5
|
+
describe QueryParser do
|
6
|
+
describe "parse_query" do
|
7
|
+
it "should use the database's stopper" do
|
8
|
+
xdb = XapianDb.new(:stopper => :french)
|
9
|
+
qp = QueryParser.new(:database => xdb)
|
10
|
+
terms = qp.parse_query("avec and").terms.collect { |t| t.term }
|
11
|
+
terms.should_not include "Zavec"
|
12
|
+
terms.should include "Zand"
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should use the database's stemmer" do
|
16
|
+
xdb = XapianDb.new(:stemmer => :french)
|
17
|
+
qp = QueryParser.new(:database => xdb)
|
18
|
+
terms = qp.parse_query("contournait fishing").terms.collect { |t| t.term }
|
19
|
+
terms.should include "Zcontourn"
|
20
|
+
terms.should_not include "Zfish"
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should use the :fields option to set field names" do
|
24
|
+
qp = QueryParser.new(:fields => [:name, :age])
|
25
|
+
terms = qp.parse_query("name:john age:30").terms.collect { |t| t.term }
|
26
|
+
terms.should include "XNAMEjohn"
|
27
|
+
terms.should_not include "john"
|
28
|
+
terms.should include "XAGE30"
|
29
|
+
terms.should_not include "30"
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should use the database's field names as prefixes" do
|
33
|
+
xdb = XapianDb.new(:fields => [:name], :stemmer => :none)
|
34
|
+
qp = QueryParser.new(:database => xdb)
|
35
|
+
terms = qp.parse_query("name:john").terms.collect { |t| t.term }
|
36
|
+
terms.should include "XNAMEjohn"
|
37
|
+
terms.should_not include "john"
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
describe StopperFactory do
|
7
|
+
describe "stopper_for" do
|
8
|
+
it "should return a SimpleStopper loaded with the given languages stop words" do
|
9
|
+
stopper = StopperFactory.stopper_for(:english)
|
10
|
+
stopper.should be_a_kind_of Xapian::SimpleStopper
|
11
|
+
stopper.call("and").should be_true
|
12
|
+
stopper.call("theremin").should_not be_true
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return the given stopper unmodified if given a Xapian::Stopper object" do
|
16
|
+
stopper = Xapian::SimpleStopper.new
|
17
|
+
StopperFactory.stopper_for(stopper).should === stopper
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "stop_words_for" do
|
22
|
+
|
23
|
+
it "should return an array of words for the given language" do
|
24
|
+
words = StopperFactory.stop_words_for(:english)
|
25
|
+
words.should be_a_kind_of Array
|
26
|
+
words.should_not be_empty
|
27
|
+
words.should include 'and'
|
28
|
+
words.should include "they're"
|
29
|
+
end
|
30
|
+
|
31
|
+
%w(danish dutch english finnish french german hungarian italian norwegian portuguese russian spanish swedish).each do |lang|
|
32
|
+
describe lang do
|
33
|
+
it "should return an array of words" do
|
34
|
+
words = StopperFactory.stop_words_for(lang.to_sym)
|
35
|
+
words.should_not be_empty
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should return an array with no empty strings, nils or pipes" do
|
39
|
+
StopperFactory.stop_words_for(lang.to_sym).should_not include ''
|
40
|
+
StopperFactory.stop_words_for(lang.to_sym).should_not include nil
|
41
|
+
StopperFactory.stop_words_for(lang.to_sym).should_not include '|'
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
|
47
|
+
it "should raise a UnsupportedStopperLanguage error if there is no data for the given language" do
|
48
|
+
Proc.new { StopperFactory.stop_words_for(:no_existy) }.should raise_error UnsupportedStopperLanguage
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should return characters in utf8" do
|
52
|
+
words = StopperFactory.stop_words_for(:russian)
|
53
|
+
words.should include "человек"
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
end
|