xapian-fu 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README.rdoc +39 -0
- data/examples/ar_query.rb +35 -0
- data/examples/ar_spider.rb +37 -0
- data/examples/query.rb +16 -0
- data/examples/spider.rb +28 -0
- data/lib/xapian_fu.rb +3 -0
- data/lib/xapian_fu/xapian_db.rb +193 -0
- data/lib/xapian_fu/xapian_doc.rb +117 -0
- data/spec/spec.opts +5 -0
- data/spec/xapian_db_spec.rb +295 -0
- data/spec/xapian_doc_spec.rb +16 -0
- metadata +69 -0
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
xapian_fu is released under the MIT License.
|
2
|
+
|
3
|
+
Copyright (c) 2009 John Leach
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of the acts_as_xapian software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including without
|
8
|
+
limitation the rights to use, copy, modify, merge, publish, distribute,
|
9
|
+
sublicense, and/or sell copies of the Software, and to permit persons to whom
|
10
|
+
the Software is furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
= Xapian Fu
|
2
|
+
|
3
|
+
XapianFu is a Ruby library for working with
|
4
|
+
{Xapian}[http://xapian.org/] databases. It builds on the GPL licensed
|
5
|
+
Xapian Ruby bindings but provides an interface more in-line with "The
|
6
|
+
Ruby Way"(tm).
|
7
|
+
|
8
|
+
== Example
|
9
|
+
|
10
|
+
Create a database, add 3 documents to it and then search and retrieve
|
11
|
+
them.
|
12
|
+
|
13
|
+
db = XapianDb.new(:dir => 'example.db', :create => true,
|
14
|
+
:store => [:title, :year])
|
15
|
+
db << { :title => 'Brokeback Mountain', :year => 2005 }
|
16
|
+
db << { :title => 'Cold Mountain', :year => 2004 }
|
17
|
+
db << { :title => 'Yes Man', :year => 2008 }
|
18
|
+
db.search("mountain").each do |match|
|
19
|
+
puts match.fields[:title]
|
20
|
+
end
|
21
|
+
|
22
|
+
== ActiveRecord Example
|
23
|
+
|
24
|
+
You could use it with something like ActiveRecord to index database
|
25
|
+
records:
|
26
|
+
|
27
|
+
db = XapianDb.new(:dir => 'posts.db', :create => true,
|
28
|
+
:store => :id)
|
29
|
+
Post.all.each { |p| db << p.attributes }
|
30
|
+
db.search("custard").collect do |doc|
|
31
|
+
Post.find(doc.id)
|
32
|
+
end
|
33
|
+
|
34
|
+
= More Info
|
35
|
+
|
36
|
+
Author:: John Leach (mailto:john@johnleach.co.uk)
|
37
|
+
Copyright:: Copyright (c) 2009 John Leach
|
38
|
+
License:: GPL v2
|
39
|
+
Github:: http://github.com/johnl/xapian-fu/tree/master
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/ruby
#
# Example: search the Xapian index built by ar_spider.rb and print the
# matching WordPress posts together with their search weight.
#
# Usage: ar_query.rb <search terms...>

require 'rubygems'
require 'benchmark'
require 'lib/xapian_fu'
include XapianFu
require 'active_record'

ActiveRecord::Base.establish_connection(
  :adapter => "mysql",
  :host => "localhost",
  :username => "john",
  :password => "john",
  :database => "john_fametastic_dev" )

class WpPost < ActiveRecord::Base
  set_primary_key :ID
end

#puts WpPost.new.attributes.keys.join(' ')
db = XapianDb.new(:dir => 'ar_spider.db')

results = nil
bm = Benchmark.measure do
  results = db.search(ARGV.join(' '))
end

# The database is free to return the rows in any order, so index the
# posts by id rather than pairing them with the search results by
# position (which would print weights next to the wrong titles).
posts_by_id = {}
WpPost.find(results.collect { |r| r.id }).each do |post|
  posts_by_id[post.id] = post
end

puts "Weight\tTitle"
results.each do |result|
  post = posts_by_id[result.id]
  next unless post
  # The title is passed as a format argument, not interpolated into the
  # format string, so a title containing '%' cannot break the formatting.
  puts "%.3f\t%s" % [result.weight, post.post_title]
end

puts "Search took %.5f seconds" % bm.total
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'benchmark'
|
5
|
+
require 'lib/xapian_fu'
|
6
|
+
include XapianFu
|
7
|
+
require 'active_record'
|
8
|
+
|
9
|
+
ActiveRecord::Base.establish_connection(
|
10
|
+
:adapter => "mysql",
|
11
|
+
:host => "localhost",
|
12
|
+
:username => "john",
|
13
|
+
:password => "john",
|
14
|
+
:database => "john_fametastic_dev" )
|
15
|
+
|
16
|
+
class WpPost < ActiveRecord::Base
|
17
|
+
set_primary_key :ID
|
18
|
+
end
|
19
|
+
|
20
|
+
#puts WpPost.new.attributes.keys.join(' ')
|
21
|
+
db = XapianDb.new(:dir => 'ar_spider.db', :overwrite => true)
|
22
|
+
|
23
|
+
count = 0
|
24
|
+
indexing_time = 0.0
|
25
|
+
WpPost.find_in_batches do |posts|
|
26
|
+
db.transaction do
|
27
|
+
puts "Indexing wp_posts #{count} to #{count += posts.size}"
|
28
|
+
posts.each do |post|
|
29
|
+
bm = Benchmark.measure do
|
30
|
+
db << XapianDoc.new(post.attributes.merge({ :id => post.id }))
|
31
|
+
end
|
32
|
+
indexing_time += bm.total
|
33
|
+
end
|
34
|
+
end
|
35
|
+
indexing_time += Benchmark.measure { db.flush }.total
|
36
|
+
end
|
37
|
+
puts "%i documents took %.4f seconds. %.2f per second" % [count, indexing_time, count / indexing_time]
|
data/examples/query.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
#
|
3
|
+
require 'rubygems'
|
4
|
+
require 'benchmark'
|
5
|
+
require 'lib/xapian_fu'
|
6
|
+
|
7
|
+
query_string = ARGV.join(" ")
|
8
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db')
|
9
|
+
results = nil
|
10
|
+
bm = Benchmark.measure { results = db.search(query_string) }
|
11
|
+
puts "Weight\tFilename"
|
12
|
+
results.each do |result|
|
13
|
+
puts "%.2f\t%s" % [result.weight, result.fields[:filename]]
|
14
|
+
end
|
15
|
+
puts "Search took %.5f seconds" % bm.total
|
16
|
+
|
data/examples/spider.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'benchmark'
|
5
|
+
require 'lib/xapian_fu'
|
6
|
+
|
7
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => :filename,
|
8
|
+
:overwrite => true)
|
9
|
+
|
10
|
+
base_path = ARGV[0] || '.'
|
11
|
+
|
12
|
+
docs = 0
|
13
|
+
indexing_time = 0.0
|
14
|
+
Dir.glob(File.join(base_path, "/**/*")) do |filename|
|
15
|
+
next unless File.file?(filename)
|
16
|
+
next unless filename =~ /\.(txt|doc|README|c|h|rb|py|note|xml)$/i
|
17
|
+
puts "Indexing #{filename}"
|
18
|
+
text = File.open(filename) { |f| f.read(10 * 1024) }
|
19
|
+
bm = Benchmark.measure do
|
20
|
+
db << XapianFu::XapianDoc.new({:text => text, :filename => filename,
|
21
|
+
:filesize => File.size(filename) })
|
22
|
+
end
|
23
|
+
indexing_time += bm.total
|
24
|
+
docs += 1
|
25
|
+
break if docs == 10000
|
26
|
+
end
|
27
|
+
indexing_time += Benchmark.measure { db.flush }.total
|
28
|
+
puts "#{docs} docs indexed in #{indexing_time} seconds (#{docs / indexing_time} docs per second)"
|
data/lib/xapian_fu.rb
ADDED
@@ -0,0 +1,193 @@
|
|
1
|
+
module XapianFu
|
2
|
+
class XapianFuError < StandardError ; end
|
3
|
+
|
4
|
+
require 'xapian'
|
5
|
+
require 'xapian_doc'
|
6
|
+
require 'thread'
|
7
|
+
|
8
|
+
class ConcurrencyError < XapianFuError ; end
|
9
|
+
class DocNotFound < XapianFuError ; end
|
10
|
+
|
11
|
+
# A Ruby-style wrapper around a Xapian database. When a <tt>:dir</tt>
# option is given the database lives on disk, otherwise an in-memory
# database is used.
class XapianDb
  attr_reader :dir, :db_flag
  attr_reader :store_fields, :store_values

  # Options:
  # <tt>:dir</tt>:: path of the on-disk database (in-memory when omitted)
  # <tt>:create</tt>:: open with Xapian::DB_CREATE_OR_OPEN and flush immediately
  # <tt>:overwrite</tt>:: open with Xapian::DB_CREATE_OR_OVERWRITE
  # <tt>:store</tt>:: a field name, or an array of field names, to store with each document
  # <tt>:sortable</tt>:: a field name, or an array of them, to store as Xapian values
  # <tt>:collapsible</tt>:: a field name, or an array of them, to store as Xapian values
  def initialize(options = {})
    @dir = options[:dir]
    @db_flag = Xapian::DB_OPEN
    @db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
    @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
    # Array() accepts nil, a single name or an array of names without
    # nesting, so :store => [:title, :year] really stores both fields.
    @store_fields = Array(options[:store]).compact
    @store_values = Array(options[:sortable]).compact +
                    Array(options[:collapsible]).compact
    @tx_mutex = Mutex.new
    rw.flush if options[:create]
  end

  # Return the writable Xapian database
  def rw
    @rw ||= setup_rw_db
  end

  # Return the read-only Xapian database
  def ro
    @ro ||= setup_ro_db
  end

  # Return the number of docs in the Xapian database
  def size
    ro.doccount
  end

  # Return the XapianDocumentsAccessor for this database
  def documents
    @documents_accessor ||= XapianDocumentsAccessor.new(self)
  end

  # Add a document to the index. A document can be just a hash, the
  # keys representing field names and their values the data to be
  # indexed. Or it can be a XapianDoc, or any object with a to_s method.
  #
  # If the document already has an id, any existing document with that
  # id is replaced; otherwise the id assigned by Xapian is set on the
  # document. Only the fields named in the <tt>:store</tt> option (and
  # the document's <tt>data</tt>, if any) are stored in the database.
  #
  # Returns the XapianDoc.
  def add_doc(doc)
    doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
    doc.db = self
    xdoc = doc.to_xapian_document
    tg = Xapian::TermGenerator.new
    tg.database = rw
    tg.document = xdoc
    tg.index_text(doc.text)
    if doc.id
      rw.replace_document(doc.id, xdoc)
    else
      doc.id = rw.add_document(xdoc)
    end
    doc
  end
  alias_method "<<", :add_doc

  # Conduct a search on the Xapian database, returning an array of
  # XapianDoc objects for the matches.
  #
  # Options:
  # <tt>:page</tt>:: 1-based page number (default 1)
  # <tt>:per_page</tt>:: number of results per page (default 10)
  # <tt>:order</tt>:: name of a stored value to sort by
  # <tt>:reverse</tt>:: passed to Xapian as the reverse flag when sorting by value
  # <tt>:collapse</tt>:: name of a stored value to collapse results on
  #
  # NOTE(review): value slots are derived with String#hash; confirm that
  # this is stable across processes on the Ruby versions you support.
  def search(q, options = {})
    defaults = { :page => 1, :per_page => 10, :reverse => false }
    options = defaults.merge(options)
    page = integer_option(options[:page], 1)
    page = page > 1 ? page - 1 : 0
    per_page = integer_option(options[:per_page], 10)
    offset = page * per_page
    # The flags are bit masks and must be combined with a bitwise OR; a
    # boolean && would evaluate to FLAG_LOVEHATE only.
    flags = Xapian::QueryParser::FLAG_WILDCARD | Xapian::QueryParser::FLAG_LOVEHATE
    query = query_parser.parse_query(q, flags)
    # Use a fresh Enquire for every search so that the sort order and
    # collapse key of an earlier search cannot leak into this one.
    enquire = Xapian::Enquire.new(ro)
    if options[:order]
      enquire.sort_by_value!(options[:order].to_s.hash, options[:reverse])
    end
    if options[:collapse]
      enquire.collapse_key = options[:collapse].to_s.hash
    end
    enquire.query = query
    enquire.mset(offset, per_page).matches.collect { |m| XapianDoc.new(m) }
  end

  # Run the given block in a XapianDB transaction. Any changes to the
  # Xapian database made in the block will be atomically committed at the end.
  #
  # If an exception is raised by the block, all changes are discarded and the
  # exception re-raised.
  #
  # Xapian does not support multiple concurrent transactions on the
  # same Xapian database. Any attempts at this will be serialized by
  # XapianFu, which is not perfect but probably better than just
  # kicking up an exception.
  #
  def transaction
    @tx_mutex.synchronize do
      # A failure to begin is propagated as-is: there is nothing to cancel.
      rw.begin_transaction
      begin
        yield
        rw.commit_transaction
      rescue Exception
        # Cancel while the lock is still held, so that another thread's
        # freshly started transaction can never be cancelled by mistake.
        # Exception (not just StandardError) is rescued deliberately: the
        # transaction must be rolled back whatever happens, and the
        # exception is always re-raised.
        rw.cancel_transaction
        raise
      end
    end
  end

  # Flush any changes to disk and reopen the read-only database.
  # Raises ConcurrencyError if a transaction is in process
  def flush
    raise ConcurrencyError if @tx_mutex.locked?
    rw.flush
    ro.reopen
  end

  # Return the Xapian::QueryParser for this database, creating it and
  # attaching it to the read-only database on first use.
  def query_parser
    unless @query_parser
      @query_parser = Xapian::QueryParser.new
      @query_parser.database = ro
    end
    @query_parser
  end

  # Return a memoised Xapian::Enquire bound to the read-only database.
  def enquiry
    @enquiry ||= Xapian::Enquire.new(ro)
  end

  # Gives access to the individual documents of a XapianDb by id.
  class XapianDocumentsAccessor
    def initialize(xdb)
      @xdb = xdb
    end

    # Return the document with the given id from the
    # database. Raises a XapianFu::DocNotFound exception
    # if it doesn't exist.
    def find(doc_id)
      xdoc = @xdb.ro.document(doc_id)
      XapianDoc.new(xdoc)
    rescue RuntimeError => e
      # The Xapian bindings signal a missing document with a RuntimeError
      # whose message starts with "DocNotFoundError".
      raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
    end

    # Return the document with the given id from the database or nil
    # if it doesn't exist
    def [](doc_id)
      find(doc_id)
    rescue XapianFu::DocNotFound
      nil
    end

    # Delete the document with the given id from the database and return
    # the document id, or nil if it doesn't exist. The argument must
    # respond to to_i; anything else is ignored and nil is returned.
    def delete(doc)
      if doc.respond_to?(:to_i)
        @xdb.rw.delete_document(doc.to_i)
        doc.to_i
      end
    rescue RuntimeError => e
      raise e unless e.to_s =~ /^DocNotFoundError/
    end
  end

  private

  # Coerce a search option to an integer, falling back to the given
  # default when the value cannot be converted.
  def integer_option(value, default)
    value.to_i
  rescue StandardError
    default
  end

  # Open the writable database: on disk when dir is set, otherwise in memory.
  def setup_rw_db
    if dir
      @rw = Xapian::WritableDatabase.new(dir, db_flag)
    else
      # In memory database
      @rw = Xapian::inmemory_open
    end
  end

  # Open the read-only database. An in-memory database has no separate
  # read-only handle, so the writable one is reused.
  def setup_ro_db
    if dir
      @ro = Xapian::Database.new(dir)
    else
      # In memory db
      @ro = rw
    end
  end
end
|
192
|
+
|
193
|
+
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module XapianFu
|
2
|
+
|
3
|
+
class XapianDbNotSet < XapianFuError ; end
|
4
|
+
class XapianDocNotSet < XapianFuError ; end
|
5
|
+
class XapianTypeError < XapianFuError ; end
|
6
|
+
|
7
|
+
# A document to be indexed by, or retrieved from, a XapianDb.
class XapianDoc
  attr_reader :fields, :data, :weight, :match
  attr_reader :xapian_document
  attr_accessor :id, :db

  # Expects a Xapian::Match, a Xapian::Document, a Hash-like object, or
  # anything with a to_s method. Anything else raises a XapianTypeError.
  # Options can be <tt>:weight</tt> to set the search weight or
  # <tt>:data</tt> to set some additional data to be stored with the
  # record in the database.
  def initialize(doc, options = {})
    @fields = {}
    if doc.is_a? Xapian::Match
      @match = doc
      @weight = doc.weight
      doc = doc.document
    end

    # Handle initialisation from a Xapian::Document, which is
    # usually a search result from a Xapian database
    if doc.is_a?(Xapian::Document)
      @xapian_document = doc
      @id = doc.docid
      stored = load_stored_data(doc.data)
      if stored.is_a? Hash
        @data = stored.delete(:__data)
        @fields = stored
      else
        @data = stored
      end
    # Handle initialisation from a hash-like object
    elsif doc.respond_to?("[]") && doc.respond_to?(:has_key?)
      @fields = doc
      @id = doc[:id] if doc.has_key?(:id)
    # Handle initialisation from anything else that can be coerced
    # into a string
    elsif doc.respond_to? :to_s
      @fields = { :content => doc.to_s }
    else
      raise XapianTypeError, "Can't handle indexing a '#{doc.class}' object"
    end
    @weight = options[:weight] if options[:weight]
    @data = options[:data] if options[:data]
  end

  # Retrieve the given Xapian value from the Xapian document.
  # <tt>vkey</tt> can be a symbol or string, in which case it's hashed
  # to get an integer value number. Or you can give the integer value
  # number if you know it. Raises XapianDocNotSet if this object was
  # not built from a Xapian::Document.
  #
  # NOTE(review): the value number comes from String#hash; confirm that
  # this is stable across processes on the Ruby versions you support.
  def get_value(vkey)
    raise XapianDocNotSet unless @xapian_document
    vkey = vkey.to_s.hash unless vkey.is_a? Integer
    @xapian_document.value(vkey)
  end

  # Return a list of terms that the db has for this document, or nil
  # if the document has no id yet. Raises XapianDbNotSet if the db
  # attribute has not been set.
  def terms
    raise XapianFu::XapianDbNotSet unless db
    db.ro.termlist(id) if db.respond_to?(:ro) && db.ro && id
  end

  # Return a Xapian::Document ready for putting into a Xapian
  # database. Requires that the db attribute has been set up.
  def to_xapian_document
    raise XapianFu::XapianDbNotSet unless db
    xdoc = Xapian::Document.new
    add_stored_fields_to_xapian_doc(xdoc)
    add_stored_values_to_xapian_doc(xdoc)
    xdoc
  end

  # Return text for indexing from the fields: every field value,
  # converted to a string and joined with spaces.
  def text
    fields.keys.collect { |key| fields[key].to_s }.join(' ')
  end

  # Two XapianDocs are equal when they have the same id.
  def ==(b)
    if b.is_a?(XapianDoc)
      id == b.id
    else
      super(b)
    end
  end

  def inspect
    "<#{self.class.to_s} id=#{id}>"
  end

  private

  # Unmarshal the data stored with a Xapian document. Returns nil when
  # there is no data or when it is not valid Marshal output (for example
  # a document that was written by another tool).
  #
  # NOTE: Marshal.load can instantiate arbitrary objects, so only open
  # databases whose contents you trust.
  def load_stored_data(raw)
    return nil if raw.nil? || raw.empty?
    Marshal.load(raw)
  rescue ArgumentError, TypeError
    # Marshal raises TypeError for data in a foreign format and
    # ArgumentError for truncated data; treat both as "no stored data".
    nil
  end

  # Marshal the fields the database was asked to store, plus any extra
  # data (under the reserved :__data key), into the Xapian document.
  def add_stored_fields_to_xapian_doc(xdoc)
    stored_fields = fields.reject { |k, v| !db.store_fields.include?(k) }
    stored_fields[:__data] = data if data
    xdoc.data = Marshal.dump(stored_fields) unless stored_fields.empty?
    xdoc
  end

  # Add the fields the database declared as sortable or collapsible to
  # the Xapian document as values, in the slot given by the hash of the
  # field name.
  def add_stored_values_to_xapian_doc(xdoc)
    stored_values = fields.reject { |k, v| !db.store_values.include?(k) }
    stored_values.each do |k, v|
      xdoc.add_value(k.to_s.hash, v.to_s)
    end
    xdoc
  end
end
|
117
|
+
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1,295 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
# Will be deleted
|
7
|
+
tmp_dir = '/tmp/xapian_fu_test.db'
|
8
|
+
|
9
|
+
describe XapianDb do
|
10
|
+
before do
|
11
|
+
FileUtils.rm_rf tmp_dir if File.exists?(tmp_dir)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should make an in-memory database by default" do
|
15
|
+
xdb = XapianDb.new
|
16
|
+
xdb.ro.should be_a_kind_of(Xapian::Database)
|
17
|
+
xdb.rw.should === xdb.ro
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should make an on-disk database when given a :dir option" do
|
21
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
22
|
+
File.exists?(tmp_dir).should be_true
|
23
|
+
xdb.should respond_to(:dir)
|
24
|
+
xdb.dir.should == tmp_dir
|
25
|
+
xdb.rw.should be_a_kind_of(Xapian::WritableDatabase)
|
26
|
+
xdb.ro.should be_a_kind_of(Xapian::Database)
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should flush documents to the index when flush is called" do
|
30
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
31
|
+
xdb.size.should == 0
|
32
|
+
xdb << "Once upon a time"
|
33
|
+
xdb.size.should == 0
|
34
|
+
xdb.flush
|
35
|
+
xdb.size.should == 1
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should support transactions" do
|
39
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
40
|
+
xdb << "Once upon a time"
|
41
|
+
xdb.transaction do
|
42
|
+
xdb << "Once upon a time"
|
43
|
+
xdb.size.should == 1
|
44
|
+
end
|
45
|
+
xdb.flush
|
46
|
+
xdb.size.should == 2
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should serialize attempts at concurrent transactions" do
|
50
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
51
|
+
thread = Thread.new do
|
52
|
+
xdb.transaction do
|
53
|
+
sleep 0.1
|
54
|
+
xdb << "Once upon a time"
|
55
|
+
sleep 0.1
|
56
|
+
xdb << "Once upon a time"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
xdb.transaction do
|
60
|
+
xdb << "Once upon a time"
|
61
|
+
sleep 0.1
|
62
|
+
xdb << "Once upon a time"
|
63
|
+
end
|
64
|
+
thread.join
|
65
|
+
xdb.flush
|
66
|
+
xdb.size.should == 4
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should abort a transaction on an exception" do
|
70
|
+
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
71
|
+
xdb << "Once upon a time"
|
72
|
+
begin
|
73
|
+
xdb.transaction do
|
74
|
+
xdb << "Once upon a time"
|
75
|
+
raise StandardError
|
76
|
+
end
|
77
|
+
rescue StandardError
|
78
|
+
end
|
79
|
+
xdb.flush
|
80
|
+
xdb.size.should == 1
|
81
|
+
end
|
82
|
+
|
83
|
+
it "should index a XapianDoc" do
|
84
|
+
xdb = XapianDb.new
|
85
|
+
xdb << XapianDoc.new({ :text => "once upon a time", :title => "A story" })
|
86
|
+
xdb.flush
|
87
|
+
xdb.size.should == 1
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should index a Hash" do
|
91
|
+
xdb = XapianDb.new
|
92
|
+
xdb << { :text => "once upon a time", :title => "A story" }
|
93
|
+
xdb.flush
|
94
|
+
xdb.size.should == 1
|
95
|
+
end
|
96
|
+
|
97
|
+
it "should index a string" do
|
98
|
+
xdb = XapianDb.new
|
99
|
+
xdb << "once upon a time"
|
100
|
+
xdb.size.should == 1
|
101
|
+
xdb << XapianDoc.new("once upon a time")
|
102
|
+
xdb.size.should == 2
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should raise a XapianFu::DocNotFound error on find if the document doesn't exist" do
|
106
|
+
xdb = XapianDb.new
|
107
|
+
xdb << "once upon a time"
|
108
|
+
xdb.flush
|
109
|
+
lambda { xdb.documents.find(10) }.should raise_error XapianFu::DocNotFound
|
110
|
+
end
|
111
|
+
|
112
|
+
it "should retrieve documents with the find method" do
|
113
|
+
xdb = XapianDb.new
|
114
|
+
xdb << "Once upon a time"
|
115
|
+
xdb.flush
|
116
|
+
xdb.documents.find(1).should be_a_kind_of(XapianDoc)
|
117
|
+
end
|
118
|
+
|
119
|
+
it "should retrieve documents like an array and return a XapianDoc" do
|
120
|
+
xdb = XapianDb.new
|
121
|
+
xdb << "once upon a time"
|
122
|
+
xdb.flush
|
123
|
+
xdb.documents[1].should be_a_kind_of(XapianDoc)
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should provide the id of retrieved documents" do
|
127
|
+
xdb = XapianDb.new
|
128
|
+
xdb << "once upon a time"
|
129
|
+
xdb.documents[1].id.should == 1
|
130
|
+
end
|
131
|
+
|
132
|
+
it "should store data in the database" do
|
133
|
+
xdb = XapianDb.new
|
134
|
+
xdb << XapianDoc.new({ :text => "once upon a time" }, :data => { :thing => 0xdeadbeef })
|
135
|
+
xdb.size.should == 1
|
136
|
+
doc = xdb.documents[1]
|
137
|
+
doc.data.should == { :thing => 0xdeadbeef }
|
138
|
+
end
|
139
|
+
|
140
|
+
it "should return a XapianDoc with an id after indexing" do
|
141
|
+
xdb = XapianDb.new
|
142
|
+
doc = XapianDoc.new("once upon a time")
|
143
|
+
doc.id.should == nil
|
144
|
+
new_doc = xdb << doc
|
145
|
+
new_doc.id.should == 1
|
146
|
+
end
|
147
|
+
|
148
|
+
it "should replace docs that already have an id when adding to the db" do
|
149
|
+
xdb = XapianDb.new
|
150
|
+
doc = xdb << XapianDoc.new("Once upon a time")
|
151
|
+
xdb.flush
|
152
|
+
xdb.size.should == 1
|
153
|
+
doc.id.should == 1
|
154
|
+
updated_doc = xdb << doc
|
155
|
+
xdb.flush
|
156
|
+
xdb.size.should == 1
|
157
|
+
updated_doc.id.should == doc.id
|
158
|
+
end
|
159
|
+
|
160
|
+
it "should delete docs by id" do
|
161
|
+
xdb = XapianDb.new
|
162
|
+
doc = xdb << XapianDoc.new("Once upon a time")
|
163
|
+
xdb.flush
|
164
|
+
xdb.size.should == 1
|
165
|
+
xdb.documents.delete(doc.id).should == 1
|
166
|
+
xdb.flush
|
167
|
+
xdb.size.should == 0
|
168
|
+
end
|
169
|
+
|
170
|
+
it "should handle being asked to delete docs that don't exist in the db" do
|
171
|
+
xdb = XapianDb.new
|
172
|
+
doc = xdb << XapianDoc.new("Once upon a time")
|
173
|
+
xdb.flush
|
174
|
+
xdb.documents.delete(100000).should == nil
|
175
|
+
end
|
176
|
+
|
177
|
+
it "should add new docs with the given id" do
|
178
|
+
xdb = XapianDb.new
|
179
|
+
doc = xdb << XapianDoc.new(:id => 0xbeef, :title => "Once upon a time")
|
180
|
+
xdb.flush
|
181
|
+
xdb.documents[0xbeef].id.should == 0xbeef
|
182
|
+
doc.id.should == 0xbeef
|
183
|
+
end
|
184
|
+
|
185
|
+
it "should tokenize strings" do
|
186
|
+
xdb = XapianDb.new
|
187
|
+
doc = xdb << XapianDoc.new("once upon a time")
|
188
|
+
doc.terms.should be_a_kind_of Array
|
189
|
+
doc.terms.last.should be_a_kind_of Xapian::Term
|
190
|
+
doc.terms.last.term.should == "upon"
|
191
|
+
end
|
192
|
+
|
193
|
+
it "should tokenize a hash" do
|
194
|
+
xdb = XapianDb.new
|
195
|
+
doc = xdb << XapianDoc.new(:title => 'once upon a time')
|
196
|
+
doc.terms.should be_a_kind_of Array
|
197
|
+
doc.terms.last.should be_a_kind_of Xapian::Term
|
198
|
+
doc.terms.last.term.should == "upon"
|
199
|
+
end
|
200
|
+
|
201
|
+
it "should return a list of XapianDocs with the weight and match set when returning search results" do
|
202
|
+
xdb = XapianDb.new
|
203
|
+
xdb << XapianDoc.new(:title => 'once upon a time')
|
204
|
+
xdb << XapianDoc.new(:title => 'three little pings')
|
205
|
+
results = xdb.search("pings")
|
206
|
+
results.should be_a_kind_of Array
|
207
|
+
results.size.should == 1
|
208
|
+
results.first.should be_a_kind_of XapianDoc
|
209
|
+
results.first.match.should be_a_kind_of Xapian::Match
|
210
|
+
results.first.weight.should be_a_kind_of Float
|
211
|
+
end
|
212
|
+
|
213
|
+
it "should support searching with :page and :per_page options" do
|
214
|
+
xdb = XapianDb.new
|
215
|
+
content = "word"
|
216
|
+
200.times { xdb << XapianDoc.new(content) }
|
217
|
+
xdb.size.should == 200
|
218
|
+
results = xdb.search(content, :page => 1, :per_page => 12)
|
219
|
+
results.first.id.should == 1
|
220
|
+
results.size.should == 12
|
221
|
+
results = xdb.search(content, :page => 5, :per_page => 18)
|
222
|
+
results.first.id.should == 18 * 4 + 1
|
223
|
+
results.size.should == 18
|
224
|
+
results = xdb.search(content, :page => 100, :per_page => 12)
|
225
|
+
results.size.should == 0
|
226
|
+
end
|
227
|
+
|
228
|
+
it "should store no fields by default" do
|
229
|
+
xdb = XapianDb.new
|
230
|
+
xdb << XapianDoc.new(:title => "Once upon a time")
|
231
|
+
xdb.flush
|
232
|
+
xdb.documents.find(1).fields[:title].should be_nil
|
233
|
+
end
|
234
|
+
|
235
|
+
it "should store fields declared as to be stored" do
|
236
|
+
xdb = XapianDb.new(:store => :title)
|
237
|
+
xdb << XapianDoc.new(:title => "Once upon a time", :author => "Jim Jones")
|
238
|
+
xdb.flush
|
239
|
+
doc = xdb.documents.find(1)
|
240
|
+
doc.fields[:title].should == "Once upon a time"
|
241
|
+
doc.fields[:author].should be_nil
|
242
|
+
end
|
243
|
+
|
244
|
+
it "should store values declared as to be sortable" do
|
245
|
+
xdb = XapianDb.new(:sortable => :created_at)
|
246
|
+
time = Time.now
|
247
|
+
xdb << XapianDoc.new(:created_at => time.to_i.to_s, :author => "Jim Jones")
|
248
|
+
xdb.flush
|
249
|
+
doc = xdb.documents.find(1)
|
250
|
+
doc.get_value(:created_at).should == time.to_i.to_s
|
251
|
+
end
|
252
|
+
|
253
|
+
it "should store values declared as to be collapsible" do
|
254
|
+
xdb = XapianDb.new(:collapsible => :group_id)
|
255
|
+
xdb << XapianDoc.new(:group_id => "666", :author => "Jim Jones")
|
256
|
+
xdb.flush
|
257
|
+
doc = xdb.documents.find(1)
|
258
|
+
doc.get_value(:group_id).should == "666"
|
259
|
+
end
|
260
|
+
|
261
|
+
describe "search results sort order" do
|
262
|
+
before(:each) do
|
263
|
+
@xdb = XapianDb.new(:sortable => :number)
|
264
|
+
@expected_results = []
|
265
|
+
@expected_results << (@xdb << XapianDoc.new(:words => "cow dog cat", :number => 1))
|
266
|
+
@expected_results << (@xdb << XapianDoc.new(:words => "cow dog", :number => 3))
|
267
|
+
@expected_results << (@xdb << XapianDoc.new(:words => "cow", :number => 2))
|
268
|
+
end
|
269
|
+
|
270
|
+
it "should be by relevance by default" do
|
271
|
+
results = @xdb.search("cow dog cat")
|
272
|
+
results.should == @expected_results
|
273
|
+
end
|
274
|
+
|
275
|
+
it "should be by the value specified in descending numerical order" do
|
276
|
+
results = @xdb.search("cow dog cat", :order => :number)
|
277
|
+
results.should == @expected_results.sort_by { |r| r.fields[:number] }
|
278
|
+
end
|
279
|
+
|
280
|
+
it "should be reversed when the reverse option is set to true" do
|
281
|
+
results = @xdb.search("cow dog cat", :order => :number, :reverse => true)
|
282
|
+
results.should == @expected_results.sort_by { |r| r.fields[:number] }.reverse
|
283
|
+
end
|
284
|
+
end
|
285
|
+
|
286
|
+
it "should collapse results by the value specified by the :collapse option" do
|
287
|
+
xdb = XapianDb.new(:collapsible => :group)
|
288
|
+
alpha1 = xdb << XapianDoc.new(:words => "cow dog cat", :group => "alpha")
|
289
|
+
alpha2 = xdb << XapianDoc.new(:words => "cow dog", :group => "alpha")
|
290
|
+
beta1 = xdb << XapianDoc.new(:words => "cow", :group => "beta")
|
291
|
+
results = xdb.search("cow dog cat", :collapse => :group)
|
292
|
+
results.should == [alpha1, beta1]
|
293
|
+
end
|
294
|
+
|
295
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
describe XapianDoc do
|
7
|
+
|
8
|
+
it "should be equal to other XapianDoc objects with the same id" do
|
9
|
+
XapianDoc.new(:id => 666).should == XapianDoc.new(:id => 666)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should not be equal to other XapianDoc objects with different ids" do
|
13
|
+
XapianDoc.new(:id => 666).should_not == XapianDoc.new(:id => 667)
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
metadata
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: xapian-fu
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.2"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John Leach
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-06-20 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: A library to provide a more Ruby-like interface to the Xapian search engine.
|
17
|
+
email: john@johnleach.co.uk
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README.rdoc
|
24
|
+
- LICENSE
|
25
|
+
files:
|
26
|
+
- lib/xapian_fu.rb
|
27
|
+
- lib/xapian_fu
|
28
|
+
- lib/xapian_fu/xapian_db.rb
|
29
|
+
- lib/xapian_fu/xapian_doc.rb
|
30
|
+
- examples/ar_spider.rb
|
31
|
+
- examples/query.rb
|
32
|
+
- examples/spider.rb
|
33
|
+
- examples/ar_query.rb
|
34
|
+
- README.rdoc
|
35
|
+
- LICENSE
|
36
|
+
has_rdoc: true
|
37
|
+
homepage: http://github.com/johnl/xapian-fu/tree/master
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options:
|
40
|
+
- --title
|
41
|
+
- Xapian Fu
|
42
|
+
- --main
|
43
|
+
- README.rdoc
|
44
|
+
- --line-numbers
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: "0"
|
52
|
+
version:
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: "0"
|
58
|
+
version:
|
59
|
+
requirements: []
|
60
|
+
|
61
|
+
rubyforge_project: xapian-fu
|
62
|
+
rubygems_version: 1.3.1
|
63
|
+
signing_key:
|
64
|
+
specification_version: 2
|
65
|
+
summary: A Ruby interface to the Xapian search engine
|
66
|
+
test_files:
|
67
|
+
- spec/xapian_doc_spec.rb
|
68
|
+
- spec/xapian_db_spec.rb
|
69
|
+
- spec/spec.opts
|