xapian-fu 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.rdoc +39 -0
- data/examples/ar_query.rb +35 -0
- data/examples/ar_spider.rb +37 -0
- data/examples/query.rb +16 -0
- data/examples/spider.rb +28 -0
- data/lib/xapian_fu.rb +3 -0
- data/lib/xapian_fu/xapian_db.rb +193 -0
- data/lib/xapian_fu/xapian_doc.rb +117 -0
- data/spec/spec.opts +5 -0
- data/spec/xapian_db_spec.rb +295 -0
- data/spec/xapian_doc_spec.rb +16 -0
- metadata +69 -0
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
xapian_fu is released under the MIT License.
|
2
|
+
|
3
|
+
Copyright (c) 2009 John Leach
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of the acts_as_xapian software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including without
|
8
|
+
limitation the rights to use, copy, modify, merge, publish, distribute,
|
9
|
+
sublicense, and/or sell copies of the Software, and to permit persons to whom
|
10
|
+
the Software is furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
= Xapian Fu
|
2
|
+
|
3
|
+
XapianFu is a Ruby library for working with
|
4
|
+
{Xapian}[http://xapian.org/] databases. It builds on the GPL licensed
|
5
|
+
Xapian Ruby bindings but provides an interface more in-line with "The
|
6
|
+
Ruby Way"(tm).
|
7
|
+
|
8
|
+
== Example
|
9
|
+
|
10
|
+
Create a database, add 3 documents to it and then search and retrieve
|
11
|
+
them.
|
12
|
+
|
13
|
+
db = XapianDb.new(:dir => 'example.db', :create => true,
|
14
|
+
:store => [:title, :year])
|
15
|
+
db << { :title => 'Brokeback Mountain', :year => 2005 }
|
16
|
+
db << { :title => 'Cold Mountain', :year => 2004 }
|
17
|
+
db << { :title => 'Yes Man', :year => 2008 }
|
18
|
+
db.search("mountain").each do |match|
|
19
|
+
puts match.fields[:title]
|
20
|
+
end
|
21
|
+
|
22
|
+
== ActiveRecord Example
|
23
|
+
|
24
|
+
You could use it with something like ActiveRecord to index database
|
25
|
+
records:
|
26
|
+
|
27
|
+
db = XapianDb.new(:dir => 'posts.db', :create => true,
|
28
|
+
:store => :id)
|
29
|
+
Post.all.each { |p| db << p.attributes }
|
30
|
+
db.search("custard").collect do |doc|
|
31
|
+
Post.find(doc.id)
|
32
|
+
end
|
33
|
+
|
34
|
+
= More Info
|
35
|
+
|
36
|
+
Author:: John Leach (mailto:john@johnleach.co.uk)
|
37
|
+
Copyright:: Copyright (c) 2009 John Leach
|
38
|
+
License:: GPL v2
|
39
|
+
Github:: http://github.com/johnl/xapian-fu/tree/master
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'benchmark'
|
5
|
+
require 'lib/xapian_fu'
|
6
|
+
include XapianFu
|
7
|
+
require 'active_record'
|
8
|
+
|
9
|
+
ActiveRecord::Base.establish_connection(
|
10
|
+
:adapter => "mysql",
|
11
|
+
:host => "localhost",
|
12
|
+
:username => "john",
|
13
|
+
:password => "john",
|
14
|
+
:database => "john_fametastic_dev" )
|
15
|
+
|
16
|
+
class WpPost < ActiveRecord::Base
|
17
|
+
set_primary_key :ID
|
18
|
+
end
|
19
|
+
|
20
|
+
#puts WpPost.new.attributes.keys.join(' ')
|
21
|
+
# Open the index built by ar_spider.rb and run the command-line arguments
# as a single query, timing only the Xapian search itself.
db = XapianDb.new(:dir => 'ar_spider.db')

results = nil
bm = Benchmark.measure do
  results = db.search(ARGV.join(' '))
end

# Load the matching rows and index them by primary key. The rows are looked
# up by id rather than by array position because WpPost.find(ids) is not
# guaranteed to return records in the order the ids were given, which would
# pair weights with the wrong titles.
posts_by_id = WpPost.find(results.collect { |r| r.id }).inject({}) do |index, post|
  index[post.id] = post
  index
end

puts "Weight\tTitle"
results.each do |result|
  post = posts_by_id[result.id]
  # The title is passed as a format argument (not interpolated into the
  # format string) so a literal '%' in a title cannot break the output.
  puts "%.3f\t%s" % [result.weight, post.post_title]
end

puts "Search took %.5f seconds" % bm.total
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'benchmark'
|
5
|
+
require 'lib/xapian_fu'
|
6
|
+
include XapianFu
|
7
|
+
require 'active_record'
|
8
|
+
|
9
|
+
ActiveRecord::Base.establish_connection(
|
10
|
+
:adapter => "mysql",
|
11
|
+
:host => "localhost",
|
12
|
+
:username => "john",
|
13
|
+
:password => "john",
|
14
|
+
:database => "john_fametastic_dev" )
|
15
|
+
|
16
|
+
class WpPost < ActiveRecord::Base
|
17
|
+
set_primary_key :ID
|
18
|
+
end
|
19
|
+
|
20
|
+
#puts WpPost.new.attributes.keys.join(' ')
|
21
|
+
db = XapianDb.new(:dir => 'ar_spider.db', :overwrite => true)
|
22
|
+
|
23
|
+
count = 0
|
24
|
+
indexing_time = 0.0
|
25
|
+
WpPost.find_in_batches do |posts|
|
26
|
+
db.transaction do
|
27
|
+
puts "Indexing wp_posts #{count} to #{count += posts.size}"
|
28
|
+
posts.each do |post|
|
29
|
+
bm = Benchmark.measure do
|
30
|
+
db << XapianDoc.new(post.attributes.merge({ :id => post.id }))
|
31
|
+
end
|
32
|
+
indexing_time += bm.total
|
33
|
+
end
|
34
|
+
end
|
35
|
+
indexing_time += Benchmark.measure { db.flush }.total
|
36
|
+
end
|
37
|
+
puts "%i documents took %.4f seconds. %.2f per second" % [count, indexing_time, count / indexing_time]
|
data/examples/query.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
#
|
3
|
+
require 'rubygems'
|
4
|
+
require 'benchmark'
|
5
|
+
require 'lib/xapian_fu'
|
6
|
+
|
7
|
+
query_string = ARGV.join(" ")
|
8
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db')
|
9
|
+
results = nil
|
10
|
+
bm = Benchmark.measure { results = db.search(query_string) }
|
11
|
+
puts "Weight\tFilename"
|
12
|
+
results.each do |result|
|
13
|
+
puts "%.2f\t%s" % [result.weight, result.fields[:filename]]
|
14
|
+
end
|
15
|
+
puts "Search took %.5f seconds" % bm.total
|
16
|
+
|
data/examples/spider.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'benchmark'
|
5
|
+
require 'lib/xapian_fu'
|
6
|
+
|
7
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => :filename,
|
8
|
+
:overwrite => true)
|
9
|
+
|
10
|
+
base_path = ARGV[0] || '.'
|
11
|
+
|
12
|
+
docs = 0
|
13
|
+
indexing_time = 0.0
|
14
|
+
Dir.glob(File.join(base_path, "/**/*")) do |filename|
|
15
|
+
next unless File.file?(filename)
|
16
|
+
next unless filename =~ /\.(txt|doc|README|c|h|rb|py|note|xml)$/i
|
17
|
+
puts "Indexing #{filename}"
|
18
|
+
text = File.open(filename) { |f| f.read(10 * 1024) }
|
19
|
+
bm = Benchmark.measure do
|
20
|
+
db << XapianFu::XapianDoc.new({:text => text, :filename => filename,
|
21
|
+
:filesize => File.size(filename) })
|
22
|
+
end
|
23
|
+
indexing_time += bm.total
|
24
|
+
docs += 1
|
25
|
+
break if docs == 10000
|
26
|
+
end
|
27
|
+
indexing_time += Benchmark.measure { db.flush }.total
|
28
|
+
puts "#{docs} docs indexed in #{indexing_time} seconds (#{docs / indexing_time} docs per second)"
|
data/lib/xapian_fu.rb
ADDED
@@ -0,0 +1,193 @@
|
|
1
|
+
module XapianFu
|
2
|
+
class XapianFuError < StandardError ; end
|
3
|
+
|
4
|
+
require 'xapian'
|
5
|
+
require 'xapian_doc'
|
6
|
+
require 'thread'
|
7
|
+
|
8
|
+
class ConcurrencyError < XapianFuError ; end
|
9
|
+
class DocNotFound < XapianFuError ; end
|
10
|
+
|
11
|
+
class XapianDb
|
12
|
+
attr_reader :dir, :db_flag, :query_parser
|
13
|
+
attr_reader :store_fields, :store_values
|
14
|
+
|
15
|
+
# Set up a database wrapper from an options hash.
#
# <tt>:dir</tt>:: stored as +dir+; nil when not given.
# <tt>:create</tt>:: open with Xapian::DB_CREATE_OR_OPEN and flush the
#                    writable database straight away.
# <tt>:overwrite</tt>:: open with Xapian::DB_CREATE_OR_OVERWRITE (takes
#                       precedence over <tt>:create</tt>).
# <tt>:store</tt>:: field name, or array of field names, kept in
#                   +store_fields+.
# <tt>:sortable</tt>, <tt>:collapsible</tt>:: field name, or array of field
#                   names, kept in +store_values+.
#
# Without <tt>:create</tt> or <tt>:overwrite</tt> the flag is Xapian::DB_OPEN.
def initialize( options = { } )
  @dir = options[:dir]
  @db_flag = Xapian::DB_OPEN
  @db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
  @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
  # Wrap-and-flatten so that both a single name and an array of names end
  # up as a flat list; a nested array would never match a field name.
  @store_fields = [options[:store]].flatten.compact
  @store_values = [options[:sortable], options[:collapsible]].flatten.compact
  rw.flush if options[:create]
  @tx_mutex = Mutex.new
end
|
26
|
+
|
27
|
+
# Return the writable Xapian database
|
28
|
+
def rw
|
29
|
+
@rw ||= setup_rw_db
|
30
|
+
end
|
31
|
+
|
32
|
+
# Return the read-only Xapian database
|
33
|
+
def ro
|
34
|
+
@ro ||= setup_ro_db
|
35
|
+
end
|
36
|
+
|
37
|
+
# Return the number of docs in the Xapian database
|
38
|
+
def size
|
39
|
+
ro.doccount
|
40
|
+
end
|
41
|
+
|
42
|
+
# Return the XapianDocumentsAccessor for this database
|
43
|
+
def documents
|
44
|
+
@documents_accessor ||= XapianDocumentsAccessor.new(self)
|
45
|
+
end
|
46
|
+
|
47
|
+
# Add a document to the index. A document can be just a hash, the
|
48
|
+
# keys representing field names and their values the data to be
|
49
|
+
# indexed. Or it can be a XapianDoc, or any object with a to_s method.
|
50
|
+
#
|
51
|
+
# If the document object responds to the method :data, whatever it
|
52
|
+
# returns is marshalled and stored in the Xapian database. Any
|
53
|
+
# arbitrary data up to Xmeg can be stored here.
|
54
|
+
#
|
55
|
+
# Currently, all fields are stored in the database. This will
|
56
|
+
# change to store only those fields requested to be stored.
|
57
|
+
def add_doc(doc)
|
58
|
+
doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
|
59
|
+
doc.db = self
|
60
|
+
xdoc = doc.to_xapian_document
|
61
|
+
tg = Xapian::TermGenerator.new
|
62
|
+
tg.database = rw
|
63
|
+
tg.document = xdoc
|
64
|
+
tg.index_text( doc.text )
|
65
|
+
if doc.id
|
66
|
+
rw.replace_document(doc.id, xdoc)
|
67
|
+
else
|
68
|
+
doc.id = rw.add_document(xdoc)
|
69
|
+
end
|
70
|
+
doc
|
71
|
+
end
|
72
|
+
alias_method "<<", :add_doc
|
73
|
+
|
74
|
+
# Conduct a search on the Xapian database, returning an array of
# XapianDoc objects for the matches.
#
# +q+ is the query string handed to the query parser. Wildcard and
# love/hate (+term / -term) parsing are both enabled.
#
# Options:
# <tt>:page</tt>:: 1-based page number (default 1); anything below 1 is
#                  treated as the first page.
# <tt>:per_page</tt>:: number of results per page (default 10).
# <tt>:order</tt>:: name of the value to sort by; the name is hashed to
#                   obtain the value number.
# <tt>:reverse</tt>:: passed through to the sort when <tt>:order</tt> is
#                     given (default false).
# <tt>:collapse</tt>:: name of the value to collapse results on; hashed
#                      the same way as <tt>:order</tt>.
#
# NOTE(review): the object returned by +enquiry+ looks like it is reused
# between calls, so sort/collapse settings from an earlier search may
# still be in effect on a later one - confirm.
def search(q, options = {})
  defaults = { :page => 1, :per_page => 10, :reverse => false }
  options = defaults.merge(options)
  # Fall back to the defaults when the option cannot be converted.
  page = options[:page].to_i rescue 1
  page = page > 1 ? page - 1 : 0
  per_page = options[:per_page].to_i rescue 10
  offset = page * per_page
  # Flags are a bit mask and must be combined with bitwise OR; a logical
  # "&&" would silently drop the first flag.
  flags = Xapian::QueryParser::FLAG_WILDCARD | Xapian::QueryParser::FLAG_LOVEHATE
  query = query_parser.parse_query(q, flags)
  if options[:order]
    enquiry.sort_by_value!(options[:order].to_s.hash, options[:reverse])
  end
  if options[:collapse]
    enquiry.collapse_key = options[:collapse].to_s.hash
  end
  enquiry.query = query
  enquiry.mset(offset, per_page).matches.collect { |m| XapianDoc.new(m) }
end
|
93
|
+
|
94
|
+
# Run the given block in a XapianDB transaction. Any changes to the
# Xapian database made in the block will be atomically committed at the end.
#
# If an exception is raised by the block (or by the commit), the
# transaction is cancelled and the exception re-raised.
#
# Xapian does not support multiple concurrent transactions on the
# same Xapian database. Any attempts at this will be serialized by
# XapianFu, which is not perfect but probably better than just
# kicking up an exception.
#
# The mutex is held until the transaction has been either committed or
# cancelled, so another thread can never start its own transaction in
# between a failure and the matching cancel.
def transaction
  @tx_mutex.synchronize do
    # Started outside the begin/rescue: if this fails there is no
    # transaction to cancel.
    rw.begin_transaction
    begin
      yield
      rw.commit_transaction
    rescue Exception
      # Deliberately broad so that any abort rolls back; always re-raised.
      rw.cancel_transaction
      raise
    end
  end
end
|
115
|
+
|
116
|
+
# Flush any changes to disk and reopen the read-only database.
|
117
|
+
# Raises ConcurrencyError if a transaction is in process
|
118
|
+
def flush
|
119
|
+
raise ConcurrencyError if @tx_mutex.locked?
|
120
|
+
rw.flush
|
121
|
+
ro.reopen
|
122
|
+
end
|
123
|
+
|
124
|
+
def query_parser
|
125
|
+
unless @query_parser
|
126
|
+
@query_parser = Xapian::QueryParser.new
|
127
|
+
@query_parser.database = ro
|
128
|
+
end
|
129
|
+
@query_parser
|
130
|
+
end
|
131
|
+
|
132
|
+
def enquiry
|
133
|
+
@enquiry ||= Xapian::Enquire.new(ro)
|
134
|
+
end
|
135
|
+
|
136
|
+
private
|
137
|
+
|
138
|
+
def setup_rw_db
|
139
|
+
if dir
|
140
|
+
@rw = Xapian::WritableDatabase.new(dir, db_flag)
|
141
|
+
else
|
142
|
+
# In memory database
|
143
|
+
@rw = Xapian::inmemory_open
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
def setup_ro_db
|
148
|
+
if dir
|
149
|
+
@ro = Xapian::Database.new(dir)
|
150
|
+
else
|
151
|
+
# In memory db
|
152
|
+
@ro = rw
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
#
|
157
|
+
class XapianDocumentsAccessor
|
158
|
+
def initialize(xdb)
|
159
|
+
@xdb = xdb
|
160
|
+
end
|
161
|
+
|
162
|
+
# Return the document with the given id from the
|
163
|
+
# database. Raises a XapianFu::DocNotFound exception
|
164
|
+
# if it doesn't exist.
|
165
|
+
def find(doc_id)
|
166
|
+
xdoc = @xdb.ro.document(doc_id)
|
167
|
+
XapianDoc.new(xdoc)
|
168
|
+
rescue RuntimeError => e
|
169
|
+
raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
|
170
|
+
end
|
171
|
+
|
172
|
+
# Return the document with the given id from the database or nil
|
173
|
+
# if it doesn't exist
|
174
|
+
def [](doc_id)
|
175
|
+
find(doc_id)
|
176
|
+
rescue XapianFu::DocNotFound
|
177
|
+
nil
|
178
|
+
end
|
179
|
+
|
180
|
+
# Delete the given document from the database and return the
|
181
|
+
# document id, or nil if it doesn't exist
|
182
|
+
def delete(doc)
|
183
|
+
if doc.respond_to?(:to_i)
|
184
|
+
@xdb.rw.delete_document(doc.to_i)
|
185
|
+
doc.to_i
|
186
|
+
end
|
187
|
+
rescue RuntimeError => e
|
188
|
+
raise e unless e.to_s =~ /^DocNotFoundError/
|
189
|
+
end
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module XapianFu
|
2
|
+
|
3
|
+
class XapianDbNotSet < XapianFuError ; end
|
4
|
+
class XapianDocNotSet < XapianFuError ; end
|
5
|
+
class XapianTypeError < XapianFuError ; end
|
6
|
+
|
7
|
+
class XapianDoc
|
8
|
+
attr_reader :fields, :data, :weight, :match
|
9
|
+
attr_reader :xapian_document
|
10
|
+
attr_accessor :id, :db
|
11
|
+
|
12
|
+
# Expects a Xapian::Match, a Xapian::Document, a Hash-like object, or
# anything that has a to_s method. Anything else raises a XapianTypeError.
#
# A Xapian::Match is unwrapped to its document, and the match and its
# weight are remembered. A Xapian::Document supplies the id and, when its
# data is a readable Marshal dump, the stored fields and data. A Hash-like
# object becomes the fields (its <tt>:id</tt> key, if present, becomes the
# id). Anything else is stored as the <tt>:content</tt> field.
#
# Options can be <tt>:weight</tt> to set the search weight or
# <tt>:data</tt> to set some additional data to be stored with the
# record in the database.
def initialize(doc, options = {})
  @fields = {}
  if doc.is_a? Xapian::Match
    match = doc
    doc = match.document
    @match = match
    @weight = @match.weight
  end

  # Handle initialisation from a Xapian::Document, which is
  # usually a search result from a Xapian database
  if doc.is_a?(Xapian::Document)
    @xapian_document = doc
    @id = doc.docid
    begin
      # NOTE(review): Marshal.load must only ever see data written by this
      # library; it is unsafe on untrusted input.
      xdoc_data = Marshal::load(doc.data) unless doc.data.empty?
    rescue ArgumentError, TypeError
      # Data that is not a (compatible) Marshal dump raises TypeError, a
      # truncated dump raises ArgumentError; both mean "no usable data".
      xdoc_data = nil
    end
    if xdoc_data.is_a? Hash
      @data = xdoc_data.delete(:__data)
      @fields = xdoc_data
    else
      @data = xdoc_data
    end
  # Handle initialisation from a hash-like object
  elsif doc.respond_to?("[]") && doc.respond_to?(:has_key?)
    @fields = doc
    @id = doc[:id] if doc.has_key?(:id)
  # Handle initialisation from anything else that can be coerced
  # into a string
  elsif doc.respond_to? :to_s
    @fields = { :content => doc.to_s }
  else
    raise XapianTypeError, "Can't handle indexing a '#{doc.class}' object"
  end
  @weight = options[:weight] if options[:weight]
  @data = options[:data] if options[:data]
end
|
56
|
+
|
57
|
+
# Retrieve the given Xapian value from the XapianDb. <tt>vkey</tt>
|
58
|
+
# can be a symbol or string, in which case it's hashed to get an
|
59
|
+
# integer value number. Or you can give the integer value number
|
60
|
+
# if you know it.
|
61
|
+
def get_value(vkey)
|
62
|
+
raise XapianDocNotSet unless @xapian_document
|
63
|
+
vkey = vkey.to_s.hash unless vkey.is_a? Integer
|
64
|
+
@xapian_document.value(vkey)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Return a list of terms that the db has for this document.
|
68
|
+
def terms
|
69
|
+
raise XapianFu::XapianDbNotSet unless db
|
70
|
+
db.ro.termlist(id) if db.respond_to?(:ro) and db.ro and id
|
71
|
+
end
|
72
|
+
|
73
|
+
# Return a Xapian::Document ready for putting into a Xapian
|
74
|
+
# database. Requires that the db attribute has been set up.
|
75
|
+
def to_xapian_document
|
76
|
+
raise XapianFu::XapianDbNotSet unless db
|
77
|
+
xdoc = Xapian::Document.new
|
78
|
+
add_stored_fields_to_xapian_doc(xdoc)
|
79
|
+
add_stored_values_to_xapian_doc(xdoc)
|
80
|
+
xdoc
|
81
|
+
end
|
82
|
+
|
83
|
+
# Return text for indexing from the fields
|
84
|
+
def text
|
85
|
+
fields.keys.collect { |key| fields[key].to_s }.join(' ')
|
86
|
+
end
|
87
|
+
|
88
|
+
def ==(b)
|
89
|
+
if b.is_a?(XapianDoc)
|
90
|
+
id == b.id
|
91
|
+
else
|
92
|
+
super(b)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def inspect
|
97
|
+
"<#{self.class.to_s} id=#{id}>"
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
def add_stored_fields_to_xapian_doc(xdoc)
|
103
|
+
stored_fields = fields.reject { |k,v| ! db.store_fields.include? k }
|
104
|
+
stored_fields[:__data] = data if data
|
105
|
+
xdoc.data = Marshal.dump(stored_fields) unless stored_fields.empty?
|
106
|
+
xdoc
|
107
|
+
end
|
108
|
+
|
109
|
+
def add_stored_values_to_xapian_doc(xdoc)
|
110
|
+
stored_values = fields.reject { |k,v| ! db.store_values.include? k }
|
111
|
+
stored_values.each do |k,v|
|
112
|
+
xdoc.add_value(k.to_s.hash, v.to_s)
|
113
|
+
end
|
114
|
+
xdoc
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1,295 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
# Will be deleted
|
7
|
+
tmp_dir = '/tmp/xapian_fu_test.db'
|
8
|
+
|
9
|
+
describe XapianDb do
|
10
|
+
before do
|
11
|
+
FileUtils.rm_rf tmp_dir if File.exists?(tmp_dir)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should make an in-memory database by default" do
|
15
|
+
xdb = XapianDb.new
|
16
|
+
xdb.ro.should be_a_kind_of(Xapian::Database)
|
17
|
+
xdb.rw.should === xdb.ro
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should make an on-disk database when given a :dir option" do
|
21
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
22
|
+
File.exists?(tmp_dir).should be_true
|
23
|
+
xdb.should respond_to(:dir)
|
24
|
+
xdb.dir.should == tmp_dir
|
25
|
+
xdb.rw.should be_a_kind_of(Xapian::WritableDatabase)
|
26
|
+
xdb.ro.should be_a_kind_of(Xapian::Database)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should flush documents to the index when flush is called" do
|
30
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
31
|
+
xdb.size.should == 0
|
32
|
+
xdb << "Once upon a time"
|
33
|
+
xdb.size.should == 0
|
34
|
+
xdb.flush
|
35
|
+
xdb.size.should == 1
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should support transactions" do
|
39
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
40
|
+
xdb << "Once upon a time"
|
41
|
+
xdb.transaction do
|
42
|
+
xdb << "Once upon a time"
|
43
|
+
xdb.size.should == 1
|
44
|
+
end
|
45
|
+
xdb.flush
|
46
|
+
xdb.size.should == 2
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should serialize attempts at concurrent transactions" do
|
50
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
51
|
+
thread = Thread.new do
|
52
|
+
xdb.transaction do
|
53
|
+
sleep 0.1
|
54
|
+
xdb << "Once upon a time"
|
55
|
+
sleep 0.1
|
56
|
+
xdb << "Once upon a time"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
xdb.transaction do
|
60
|
+
xdb << "Once upon a time"
|
61
|
+
sleep 0.1
|
62
|
+
xdb << "Once upon a time"
|
63
|
+
end
|
64
|
+
thread.join
|
65
|
+
xdb.flush
|
66
|
+
xdb.size.should == 4
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should abort a transaction on an exception" do
|
70
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
71
|
+
xdb << "Once upon a time"
|
72
|
+
begin
|
73
|
+
xdb.transaction do
|
74
|
+
xdb << "Once upon a time"
|
75
|
+
raise StandardError
|
76
|
+
end
|
77
|
+
rescue StandardError
|
78
|
+
end
|
79
|
+
xdb.flush
|
80
|
+
xdb.size.should == 1
|
81
|
+
end
|
82
|
+
|
83
|
+
it "should index a XapianDoc" do
|
84
|
+
xdb = XapianDb.new
|
85
|
+
xdb << XapianDoc.new({ :text => "once upon a time", :title => "A story" })
|
86
|
+
xdb.flush
|
87
|
+
xdb.size.should == 1
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should index a Hash" do
|
91
|
+
xdb = XapianDb.new
|
92
|
+
xdb << { :text => "once upon a time", :title => "A story" }
|
93
|
+
xdb.flush
|
94
|
+
xdb.size.should == 1
|
95
|
+
end
|
96
|
+
|
97
|
+
it "should index a string" do
|
98
|
+
xdb = XapianDb.new
|
99
|
+
xdb << "once upon a time"
|
100
|
+
xdb.size.should == 1
|
101
|
+
xdb << XapianDoc.new("once upon a time")
|
102
|
+
xdb.size.should == 2
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should raise a XapianFu::DocNotFound error on find if the document doesn't exist" do
|
106
|
+
xdb = XapianDb.new
|
107
|
+
xdb << "once upon a time"
|
108
|
+
xdb.flush
|
109
|
+
lambda { xdb.documents.find(10) }.should raise_error XapianFu::DocNotFound
|
110
|
+
end
|
111
|
+
|
112
|
+
it "should retrieve documents with the find method" do
|
113
|
+
xdb = XapianDb.new
|
114
|
+
xdb << "Once upon a time"
|
115
|
+
xdb.flush
|
116
|
+
xdb.documents.find(1).should be_a_kind_of(XapianDoc)
|
117
|
+
end
|
118
|
+
|
119
|
+
it "should retrieve documents like an array and return a XapianDoc" do
|
120
|
+
xdb = XapianDb.new
|
121
|
+
xdb << "once upon a time"
|
122
|
+
xdb.flush
|
123
|
+
xdb.documents[1].should be_a_kind_of(XapianDoc)
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should provide the id of retrieved documents" do
|
127
|
+
xdb = XapianDb.new
|
128
|
+
xdb << "once upon a time"
|
129
|
+
xdb.documents[1].id.should == 1
|
130
|
+
end
|
131
|
+
|
132
|
+
it "should store data in the database" do
|
133
|
+
xdb = XapianDb.new
|
134
|
+
xdb << XapianDoc.new({ :text => "once upon a time" }, :data => { :thing => 0xdeadbeef })
|
135
|
+
xdb.size.should == 1
|
136
|
+
doc = xdb.documents[1]
|
137
|
+
doc.data.should == { :thing => 0xdeadbeef }
|
138
|
+
end
|
139
|
+
|
140
|
+
it "should return a XapianDoc with an id after indexing" do
|
141
|
+
xdb = XapianDb.new
|
142
|
+
doc = XapianDoc.new("once upon a time")
|
143
|
+
doc.id.should == nil
|
144
|
+
new_doc = xdb << doc
|
145
|
+
new_doc.id.should == 1
|
146
|
+
end
|
147
|
+
|
148
|
+
it "should replace docs that already have an id when adding to the db" do
|
149
|
+
xdb = XapianDb.new
|
150
|
+
doc = xdb << XapianDoc.new("Once upon a time")
|
151
|
+
xdb.flush
|
152
|
+
xdb.size.should == 1
|
153
|
+
doc.id.should == 1
|
154
|
+
updated_doc = xdb << doc
|
155
|
+
xdb.flush
|
156
|
+
xdb.size.should == 1
|
157
|
+
updated_doc.id.should == doc.id
|
158
|
+
end
|
159
|
+
|
160
|
+
it "should delete docs by id" do
|
161
|
+
xdb = XapianDb.new
|
162
|
+
doc = xdb << XapianDoc.new("Once upon a time")
|
163
|
+
xdb.flush
|
164
|
+
xdb.size.should == 1
|
165
|
+
xdb.documents.delete(doc.id).should == 1
|
166
|
+
xdb.flush
|
167
|
+
xdb.size.should == 0
|
168
|
+
end
|
169
|
+
|
170
|
+
it "should handle being asked to delete docs that don't exist in the db" do
|
171
|
+
xdb = XapianDb.new
|
172
|
+
doc = xdb << XapianDoc.new("Once upon a time")
|
173
|
+
xdb.flush
|
174
|
+
xdb.documents.delete(100000).should == nil
|
175
|
+
end
|
176
|
+
|
177
|
+
it "should add new docs with the given id" do
|
178
|
+
xdb = XapianDb.new
|
179
|
+
doc = xdb << XapianDoc.new(:id => 0xbeef, :title => "Once upon a time")
|
180
|
+
xdb.flush
|
181
|
+
xdb.documents[0xbeef].id.should == 0xbeef
|
182
|
+
doc.id.should == 0xbeef
|
183
|
+
end
|
184
|
+
|
185
|
+
it "should tokenize strings" do
|
186
|
+
xdb = XapianDb.new
|
187
|
+
doc = xdb << XapianDoc.new("once upon a time")
|
188
|
+
doc.terms.should be_a_kind_of Array
|
189
|
+
doc.terms.last.should be_a_kind_of Xapian::Term
|
190
|
+
doc.terms.last.term.should == "upon"
|
191
|
+
end
|
192
|
+
|
193
|
+
it "should tokenize a hash" do
|
194
|
+
xdb = XapianDb.new
|
195
|
+
doc = xdb << XapianDoc.new(:title => 'once upon a time')
|
196
|
+
doc.terms.should be_a_kind_of Array
|
197
|
+
doc.terms.last.should be_a_kind_of Xapian::Term
|
198
|
+
doc.terms.last.term.should == "upon"
|
199
|
+
end
|
200
|
+
|
201
|
+
it "should return a list of XapianDocs with the weight and match set when returning search results" do
|
202
|
+
xdb = XapianDb.new
|
203
|
+
xdb << XapianDoc.new(:title => 'once upon a time')
|
204
|
+
xdb << XapianDoc.new(:title => 'three little pings')
|
205
|
+
results = xdb.search("pings")
|
206
|
+
results.should be_a_kind_of Array
|
207
|
+
results.size.should == 1
|
208
|
+
results.first.should be_a_kind_of XapianDoc
|
209
|
+
results.first.match.should be_a_kind_of Xapian::Match
|
210
|
+
results.first.weight.should be_a_kind_of Float
|
211
|
+
end
|
212
|
+
|
213
|
+
it "should support searching with :page and :per_page options" do
|
214
|
+
xdb = XapianDb.new
|
215
|
+
content = "word"
|
216
|
+
200.times { xdb << XapianDoc.new(content) }
|
217
|
+
xdb.size.should == 200
|
218
|
+
results = xdb.search(content, :page => 1, :per_page => 12)
|
219
|
+
results.first.id.should == 1
|
220
|
+
results.size.should == 12
|
221
|
+
results = xdb.search(content, :page => 5, :per_page => 18)
|
222
|
+
results.first.id.should == 18 * 4 + 1
|
223
|
+
results.size.should == 18
|
224
|
+
results = xdb.search(content, :page => 100, :per_page => 12)
|
225
|
+
results.size.should == 0
|
226
|
+
end
|
227
|
+
|
228
|
+
it "should store no fields by default" do
|
229
|
+
xdb = XapianDb.new
|
230
|
+
xdb << XapianDoc.new(:title => "Once upon a time")
|
231
|
+
xdb.flush
|
232
|
+
xdb.documents.find(1).fields[:title].should be_nil
|
233
|
+
end
|
234
|
+
|
235
|
+
it "should store fields declared as to be stored" do
|
236
|
+
xdb = XapianDb.new(:store => :title)
|
237
|
+
xdb << XapianDoc.new(:title => "Once upon a time", :author => "Jim Jones")
|
238
|
+
xdb.flush
|
239
|
+
doc = xdb.documents.find(1)
|
240
|
+
doc.fields[:title].should == "Once upon a time"
|
241
|
+
doc.fields[:author].should be_nil
|
242
|
+
end
|
243
|
+
|
244
|
+
it "should store values declared as to be sortable" do
|
245
|
+
xdb = XapianDb.new(:sortable => :created_at)
|
246
|
+
time = Time.now
|
247
|
+
xdb << XapianDoc.new(:created_at => time.to_i.to_s, :author => "Jim Jones")
|
248
|
+
xdb.flush
|
249
|
+
doc = xdb.documents.find(1)
|
250
|
+
doc.get_value(:created_at).should == time.to_i.to_s
|
251
|
+
end
|
252
|
+
|
253
|
+
it "should store values declared as to be collapsible" do
|
254
|
+
xdb = XapianDb.new(:collapsible => :group_id)
|
255
|
+
xdb << XapianDoc.new(:group_id => "666", :author => "Jim Jones")
|
256
|
+
xdb.flush
|
257
|
+
doc = xdb.documents.find(1)
|
258
|
+
doc.get_value(:group_id).should == "666"
|
259
|
+
end
|
260
|
+
|
261
|
+
describe "search results sort order" do
|
262
|
+
before(:each) do
|
263
|
+
@xdb = XapianDb.new(:sortable => :number)
|
264
|
+
@expected_results = []
|
265
|
+
@expected_results << (@xdb << XapianDoc.new(:words => "cow dog cat", :number => 1))
|
266
|
+
@expected_results << (@xdb << XapianDoc.new(:words => "cow dog", :number => 3))
|
267
|
+
@expected_results << (@xdb << XapianDoc.new(:words => "cow", :number => 2))
|
268
|
+
end
|
269
|
+
|
270
|
+
it "should be by relevance by default" do
|
271
|
+
results = @xdb.search("cow dog cat")
|
272
|
+
results.should == @expected_results
|
273
|
+
end
|
274
|
+
|
275
|
+
it "should be by the value specified in descending numerical order" do
|
276
|
+
results = @xdb.search("cow dog cat", :order => :number)
|
277
|
+
results.should == @expected_results.sort_by { |r| r.fields[:number] }
|
278
|
+
end
|
279
|
+
|
280
|
+
it "should be reversed when the reverse option is set to true" do
|
281
|
+
results = @xdb.search("cow dog cat", :order => :number, :reverse => true)
|
282
|
+
results.should == @expected_results.sort_by { |r| r.fields[:number] }.reverse
|
283
|
+
end
|
284
|
+
end
|
285
|
+
|
286
|
+
it "should collapse results by the value specified by the :collapse option" do
|
287
|
+
xdb = XapianDb.new(:collapsible => :group)
|
288
|
+
alpha1 = xdb << XapianDoc.new(:words => "cow dog cat", :group => "alpha")
|
289
|
+
alpha2 = xdb << XapianDoc.new(:words => "cow dog", :group => "alpha")
|
290
|
+
beta1 = xdb << XapianDoc.new(:words => "cow", :group => "beta")
|
291
|
+
results = xdb.search("cow dog cat", :collapse => :group)
|
292
|
+
results.should == [alpha1, beta1]
|
293
|
+
end
|
294
|
+
|
295
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
describe XapianDoc do
|
7
|
+
|
8
|
+
it "should be equal to other XapianDoc objects with the same id" do
|
9
|
+
XapianDoc.new(:id => 666).should == XapianDoc.new(:id => 666)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should not be equal to other XapianDoc objects with different ids" do
|
13
|
+
XapianDoc.new(:id => 666).should_not == XapianDoc.new(:id => 667)
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
metadata
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: xapian-fu
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.2"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John Leach
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-06-20 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: A library to provide a more Ruby-like interface to the Xapian search engine.
|
17
|
+
email: john@johnleach.co.uk
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README.rdoc
|
24
|
+
- LICENSE
|
25
|
+
files:
|
26
|
+
- lib/xapian_fu.rb
|
27
|
+
- lib/xapian_fu
|
28
|
+
- lib/xapian_fu/xapian_db.rb
|
29
|
+
- lib/xapian_fu/xapian_doc.rb
|
30
|
+
- examples/ar_spider.rb
|
31
|
+
- examples/query.rb
|
32
|
+
- examples/spider.rb
|
33
|
+
- examples/ar_query.rb
|
34
|
+
- README.rdoc
|
35
|
+
- LICENSE
|
36
|
+
has_rdoc: true
|
37
|
+
homepage: http://github.com/johnl/xapian-fu/tree/master
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options:
|
40
|
+
- --title
|
41
|
+
- Xapian Fu
|
42
|
+
- --main
|
43
|
+
- README.rdoc
|
44
|
+
- --line-numbers
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: "0"
|
52
|
+
version:
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: "0"
|
58
|
+
version:
|
59
|
+
requirements: []
|
60
|
+
|
61
|
+
rubyforge_project: xapian-fu
|
62
|
+
rubygems_version: 1.3.1
|
63
|
+
signing_key:
|
64
|
+
specification_version: 2
|
65
|
+
summary: A Ruby interface to the Xapian search engine
|
66
|
+
test_files:
|
67
|
+
- spec/xapian_doc_spec.rb
|
68
|
+
- spec/xapian_db_spec.rb
|
69
|
+
- spec/spec.opts
|