xapian-fu 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ xapian_fu is released under the MIT License.
2
+
3
+ Copyright (c) 2009 John Leach
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of the acts_as_xapian software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including without
8
+ limitation the rights to use, copy, modify, merge, publish, distribute,
9
+ sublicense, and/or sell copies of the Software, and to permit persons to whom
10
+ the Software is furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,39 @@
1
+ = Xapian Fu
2
+
3
+ XapianFu is a Ruby library for working with
4
+ {Xapian}[http://xapian.org/] databases. It builds on the GPL licensed
5
+ Xapian Ruby bindings but provides an interface more in-line with "The
6
+ Ruby Way"(tm).
7
+
8
+ == Example
9
+
10
+ Create a database, add 3 documents to it and then search and retrieve
11
+ them.
12
+
13
+ db = XapianDb.new(:dir => 'example.db', :create => true,
14
+ :store => [:title, :year])
15
+ db << { :title => 'Brokeback Mountain', :year => 2005 }
16
+ db << { :title => 'Cold Mountain', :year => 2004 }
17
+ db << { :title => 'Yes Man', :year => 2008 }
18
+ db.search("mountain").each do |match|
19
+ puts match.fields[:title]
20
+ end
21
+
22
+ == ActiveRecord Example
23
+
24
+ You could use it with something like ActiveRecord to index database
25
+ records:
26
+
27
+ db = XapianDb.new(:dir => 'posts.db', :create => true,
28
+ :store => :id)
29
+ Post.all.each { |p| db << p.attributes }
30
+ db.search("custard").collect do |doc|
31
+ Post.find(doc.id)
32
+ end
33
+
34
+ = More Info
35
+
36
+ Author:: John Leach (mailto:john@johnleach.co.uk)
37
+ Copyright:: Copyright (c) 2009 John Leach
38
+ License:: MIT (see LICENSE; the underlying Xapian Ruby bindings are GPL licensed)
39
+ Github:: http://github.com/johnl/xapian-fu/tree/master
@@ -0,0 +1,35 @@
1
#!/usr/bin/ruby
#
# Example script: search the 'ar_spider.db' Xapian index with the query given
# on the command line, then print the weight and title of every matching
# WordPress post loaded through ActiveRecord.

require 'rubygems'
require 'benchmark'
require 'lib/xapian_fu'
include XapianFu
require 'active_record'

ActiveRecord::Base.establish_connection(
  :adapter  => "mysql",
  :host     => "localhost",
  :username => "john",
  :password => "john",
  :database => "john_fametastic_dev" )

# Minimal model for the posts table; its primary key column is "ID".
class WpPost < ActiveRecord::Base
  set_primary_key :ID
end

#puts WpPost.new.attributes.keys.join(' ')
db = XapianDb.new(:dir => 'ar_spider.db')

# Only the Xapian search itself is timed, not the SQL lookup below.
results = nil
bm = Benchmark.measure do
  results = db.search(ARGV.join(' '))
end

# Index the loaded posts by id: the SQL lookup is not guaranteed to return
# rows in the order of the ids passed in, so pairing posts and search results
# by array position could print a weight next to the wrong title.
posts_by_id = {}
WpPost.find(results.collect { |r| r.id }).each do |post|
  posts_by_id[post.id] = post
end

puts "Weight\tTitle"
results.each do |result|
  post = posts_by_id[result.id]
  next unless post
  # The title is passed as a format argument (not interpolated into the
  # format string) so that titles containing '%' print correctly.
  puts "%.3f\t%s" % [result.weight, post.post_title]
end

puts "Search took %.5f seconds" % bm.total
@@ -0,0 +1,37 @@
1
#!/usr/bin/ruby
#
# Example script: index every row of the WordPress posts table into the
# 'ar_spider.db' Xapian database (overwriting any existing one) and report
# how long the indexing took.

require 'rubygems'
require 'benchmark'
require 'lib/xapian_fu'
include XapianFu
require 'active_record'

ActiveRecord::Base.establish_connection(
  :adapter  => "mysql",
  :host     => "localhost",
  :username => "john",
  :password => "john",
  :database => "john_fametastic_dev" )

# Minimal model for the posts table; its primary key column is "ID".
class WpPost < ActiveRecord::Base
  set_primary_key :ID
end

#puts WpPost.new.attributes.keys.join(' ')
db = XapianDb.new(:dir => 'ar_spider.db', :overwrite => true)

count = 0
indexing_time = 0.0

WpPost.find_in_batches do |batch|
  # Each batch is indexed inside its own Xapian transaction.
  db.transaction do
    first_in_batch = count
    count += batch.size
    puts "Indexing wp_posts #{first_in_batch} to #{count}"
    batch.each do |post|
      attributes = post.attributes.merge({ :id => post.id })
      timing = Benchmark.measure { db << XapianDoc.new(attributes) }
      indexing_time += timing.total
    end
  end
  # Flushing happens outside the transaction and is counted as indexing time.
  flush_timing = Benchmark.measure { db.flush }
  indexing_time += flush_timing.total
end

summary = [count, indexing_time, count / indexing_time]
puts "%i documents took %.4f seconds. %.2f per second" % summary
data/examples/query.rb ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/ruby
#
# Example script: run the command-line arguments as a query against the
# 'spider.db' Xapian database and list the weight and filename of each match.
require 'rubygems'
require 'benchmark'
require 'lib/xapian_fu'

query_string = ARGV.join(" ")
db = XapianFu::XapianDb.new(:dir => 'spider.db')

# Time only the search call; the results are captured for printing afterwards.
results = nil
bm = Benchmark.measure do
  results = db.search(query_string)
end

puts "Weight\tFilename"
results.each { |hit| puts format("%.2f\t%s", hit.weight, hit.fields[:filename]) }
puts format("Search took %.5f seconds", bm.total)
16
+
@@ -0,0 +1,28 @@
1
#!/usr/bin/ruby
#
# Example script: walk a directory tree (first argument, default '.'), index
# the first 10KB of each matching text-like file into the 'spider.db' Xapian
# database and report the indexing throughput.

require 'rubygems'
require 'benchmark'
require 'lib/xapian_fu'

db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => :filename,
                            :overwrite => true)

base_path = ARGV[0] || '.'

docs = 0
indexing_time = 0.0
Dir.glob(File.join(base_path, "/**/*")) do |path|
  # Skip directories and anything without one of the listed extensions.
  next unless File.file?(path) && path =~ /\.(txt|doc|README|c|h|rb|py|note|xml)$/i

  puts "Indexing #{path}"
  # Only the first 10KB of each file is read and indexed.
  text = File.open(path) { |io| io.read(10 * 1024) }
  timing = Benchmark.measure do
    db << XapianFu::XapianDoc.new({:text => text, :filename => path,
                                   :filesize => File.size(path) })
  end
  indexing_time += timing.total
  docs += 1
  # Stop after 10000 documents.
  break if docs == 10000
end

flush_timing = Benchmark.measure { db.flush }
indexing_time += flush_timing.total
puts "#{docs} docs indexed in #{indexing_time} seconds (#{docs / indexing_time} docs per second)"
data/lib/xapian_fu.rb ADDED
@@ -0,0 +1,3 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), 'xapian_fu')
2
+ require 'xapian_db'
3
+ require 'xapian_doc'
@@ -0,0 +1,193 @@
1
# Database side of XapianFu: the error classes, the XapianDb wrapper around a
# Xapian database and its nested XapianDocumentsAccessor.
module XapianFu
  # Base class for every error raised by XapianFu.
  class XapianFuError < StandardError ; end

  # These requires deliberately come after XapianFuError is defined:
  # xapian_doc declares subclasses of it when it is loaded.
  require 'xapian'
  require 'xapian_doc'
  require 'thread'

  # Raised by XapianDb#flush while a transaction is in progress.
  class ConcurrencyError < XapianFuError ; end
  # Raised by XapianDocumentsAccessor#find when the document doesn't exist.
  class DocNotFound < XapianFuError ; end

  class XapianDb
    attr_reader :dir, :db_flag
    attr_reader :store_fields, :store_values

    # Options:
    # <tt>:dir</tt>:: path of the on-disk database; an in-memory database
    #                 is used when omitted.
    # <tt>:create</tt>:: create the database if it does not exist.
    # <tt>:overwrite</tt>:: create the database, overwriting an existing one.
    # <tt>:store</tt>:: field name, or array of field names, to store with
    #                   each document.
    # <tt>:sortable</tt>, <tt>:collapsible</tt>:: field name, or array of
    #                   field names, to store as Xapian values.
    def initialize(options = {})
      @dir = options[:dir]
      @db_flag = Xapian::DB_OPEN
      @db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
      @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
      # Wrap and flatten so that both a single name and an array of names
      # end up as a flat list (a nested array would never match a field key).
      @store_fields = [options[:store]].flatten.compact
      @store_values = [options[:sortable], options[:collapsible]].flatten.compact
      @tx_mutex = Mutex.new
      # Flush straight away so a newly created database exists on disk
      # before the read-only handle is opened.
      rw.flush if options[:create]
    end

    # Return the writable Xapian database
    def rw
      @rw ||= setup_rw_db
    end

    # Return the read-only Xapian database
    def ro
      @ro ||= setup_ro_db
    end

    # Return the number of docs in the Xapian database
    def size
      ro.doccount
    end

    # Return the XapianDocumentsAccessor for this database
    def documents
      @documents_accessor ||= XapianDocumentsAccessor.new(self)
    end

    # Add a document to the index. A document can be just a hash, the
    # keys representing field names and their values the data to be
    # indexed. Or it can be a XapianDoc, or any object with a to_s method.
    #
    # The text of all fields is indexed. Which fields and values are also
    # stored in the database is decided by XapianDoc#to_xapian_document,
    # using this database's store_fields and store_values.
    #
    # If the document already has an id, the document with that id is
    # replaced; otherwise it is added and given a new id. Returns the
    # XapianDoc.
    def add_doc(doc)
      doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
      doc.db = self
      xdoc = doc.to_xapian_document
      tg = Xapian::TermGenerator.new
      tg.database = rw
      tg.document = xdoc
      tg.index_text( doc.text )
      if doc.id
        rw.replace_document(doc.id, xdoc)
      else
        doc.id = rw.add_document(xdoc)
      end
      doc
    end
    alias_method "<<", :add_doc

    # Conduct a search on the Xapian database, returning an array of
    # XapianDoc objects for the matches.
    #
    # Options:
    # <tt>:page</tt>:: 1-based page number (default 1).
    # <tt>:per_page</tt>:: number of results per page (default 10).
    # <tt>:order</tt>:: name of a stored value to sort by.
    # <tt>:reverse</tt>:: passed as the second argument of sort_by_value!.
    # <tt>:collapse</tt>:: name of a stored value to collapse results on.
    def search(q, options = {})
      defaults = { :page => 1, :per_page => 10, :reverse => false }
      options = defaults.merge(options)
      page = integer_option(options[:page], 1)
      page = page > 1 ? page - 1 : 0
      per_page = integer_option(options[:per_page], 10)
      offset = page * per_page
      # Flags are a bit mask and must be combined with a bitwise OR; a
      # logical && would evaluate to just the last flag.
      flags = Xapian::QueryParser::FLAG_WILDCARD | Xapian::QueryParser::FLAG_LOVEHATE
      query = query_parser.parse_query(q, flags)
      # Use a fresh Enquire for every search so that the sort order or
      # collapse key of one search cannot leak into the next.
      enq = Xapian::Enquire.new(ro)
      # NOTE(review): String#hash is not guaranteed to be stable across Ruby
      # processes, so these value numbers may differ between runs - verify.
      # XapianDoc derives its value numbers the same way.
      if options[:order]
        enq.sort_by_value!(options[:order].to_s.hash, options[:reverse])
      end
      if options[:collapse]
        enq.collapse_key = options[:collapse].to_s.hash
      end
      enq.query = query
      enq.mset(offset, per_page).matches.collect { |m| XapianDoc.new(m) }
    end

    # Run the given block in a XapianDB transaction. Any changes to the
    # Xapian database made in the block will be atomically committed at the end.
    #
    # If an exception is raised by the block, all changes are discarded and the
    # exception re-raised.
    #
    # Xapian does not support multiple concurrent transactions on the
    # same Xapian database. Any attempts at this will be serialized by
    # XapianFu, which is not perfect but probably better than just
    # kicking up an exception.
    #
    def transaction
      @tx_mutex.synchronize do
        # If beginning the transaction fails there is nothing to cancel, so
        # it is kept outside the rescue.
        rw.begin_transaction
        begin
          yield
          rw.commit_transaction
        rescue Exception
          # Cancel while still holding the mutex, so that another thread's
          # transaction can never be the one that gets cancelled. Exception
          # is rescued on purpose: the error is always re-raised.
          rw.cancel_transaction
          raise
        end
      end
    end

    # Flush any changes to disk and reopen the read-only database.
    # Raises ConcurrencyError if a transaction is in process
    def flush
      raise ConcurrencyError if @tx_mutex.locked?
      rw.flush
      ro.reopen
    end

    # Return the Xapian::QueryParser for this database, creating it on
    # first use.
    def query_parser
      unless @query_parser
        @query_parser = Xapian::QueryParser.new
        @query_parser.database = ro
      end
      @query_parser
    end

    # Return a memoized Xapian::Enquire for the read-only database. Kept
    # for callers that use it directly; #search builds its own.
    def enquiry
      @enquiry ||= Xapian::Enquire.new(ro)
    end

    private

    # Open the writable database: on disk when a dir was given, otherwise
    # in memory.
    def setup_rw_db
      if dir
        @rw = Xapian::WritableDatabase.new(dir, db_flag)
      else
        # In memory database
        @rw = Xapian::inmemory_open
      end
    end

    # Open the read-only database. An in-memory database has a single
    # handle, so the writable one is reused.
    def setup_ro_db
      if dir
        @ro = Xapian::Database.new(dir)
      else
        # In memory db
        @ro = rw
      end
    end

    # Coerce a paging option to an integer, falling back to the default
    # when the value cannot be converted.
    def integer_option(value, default)
      value.respond_to?(:to_i) ? value.to_i : default
    end

    # Gives access to the documents of a XapianDb by document id.
    class XapianDocumentsAccessor
      def initialize(xdb)
        @xdb = xdb
      end

      # Return the document with the given id from the
      # database. Raises a XapianFu::DocNotFound exception
      # if it doesn't exist.
      def find(doc_id)
        xdoc = @xdb.ro.document(doc_id)
        XapianDoc.new(xdoc)
      rescue RuntimeError => e
        # The error is recognised by the prefix of its message.
        raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
      end

      # Return the document with the given id from the database or nil
      # if it doesn't exist
      def [](doc_id)
        find(doc_id)
      rescue XapianFu::DocNotFound
        nil
      end

      # Delete the given document from the database and return the
      # document id, or nil if it doesn't exist
      def delete(doc)
        if doc.respond_to?(:to_i)
          @xdb.rw.delete_document(doc.to_i)
          doc.to_i
        end
      rescue RuntimeError => e
        # A missing document is not an error here: fall through and
        # return nil. Anything else is re-raised.
        raise e unless e.to_s =~ /^DocNotFoundError/
      end
    end
  end

end
@@ -0,0 +1,117 @@
1
# Document side of XapianFu: the XapianDoc class and its error classes.
# XapianFuError must already be defined when this file is loaded.
module XapianFu

  # Raised when an operation needs the db attribute and it is not set.
  class XapianDbNotSet < XapianFuError ; end
  # Raised when an operation needs the underlying Xapian::Document and
  # the XapianDoc was not built from one.
  class XapianDocNotSet < XapianFuError ; end
  # Raised when a XapianDoc is initialised from an unsupported object.
  class XapianTypeError < XapianFuError ; end

  class XapianDoc
    attr_reader :fields, :data, :weight, :match
    attr_reader :xapian_document
    attr_accessor :id, :db

    # Expects a Xapian::Match, a Xapian::Document, a Hash-like object, or
    # anything with a to_s method. Anything else raises a XapianTypeError.
    # Options can be <tt>:weight</tt> to set the search weight or
    # <tt>:data</tt> to set some additional data to be stored with the
    # record in the database.
    def initialize(doc, options = {})
      @fields = {}
      # A search match carries its document plus the match weight
      if doc.is_a? Xapian::Match
        @match = doc
        @weight = @match.weight
        doc = @match.document
      end

      # Handle initialisation from a Xapian::Document, which is
      # usually a search result from a Xapian database
      if doc.is_a?(Xapian::Document)
        @xapian_document = doc
        @id = doc.docid
        load_stored_data(doc.data)
      # Handle initialisation from a hash-like object
      elsif doc.respond_to?("[]") && doc.respond_to?(:has_key?)
        @fields = doc
        @id = doc[:id] if doc.has_key?(:id)
      # Handle initialisation from anything else that can be coerced
      # into a string
      elsif doc.respond_to? :to_s
        @fields = { :content => doc.to_s }
      else
        raise XapianTypeError, "Can't handle indexing a '#{doc.class}' object"
      end
      @weight = options[:weight] if options[:weight]
      @data = options[:data] if options[:data]
    end

    # Retrieve the given Xapianvalue from the XapianDb. <tt>vkey</tt>
    # can be a symbol or string, in which case it's hashed to get an
    # integer value number. Or you can give the integer value number
    # if you know it. Raises XapianDocNotSet if this doc was not built
    # from a Xapian::Document.
    def get_value(vkey)
      raise XapianDocNotSet unless @xapian_document
      # NOTE(review): String#hash is not guaranteed to be stable across Ruby
      # processes, so value numbers may differ between runs - verify.
      vkey = vkey.to_s.hash unless vkey.is_a? Integer
      @xapian_document.value(vkey)
    end

    # Return a list of terms that the db has for this document, or nil
    # if the document has no id. Raises XapianDbNotSet without a db.
    def terms
      raise XapianFu::XapianDbNotSet unless db
      db.ro.termlist(id) if db.respond_to?(:ro) && db.ro && id
    end

    # Return a Xapian::Document ready for putting into a Xapian
    # database. Requires that the db attribute has been set up.
    def to_xapian_document
      raise XapianFu::XapianDbNotSet unless db
      xdoc = Xapian::Document.new
      add_stored_fields_to_xapian_doc(xdoc)
      add_stored_values_to_xapian_doc(xdoc)
      xdoc
    end

    # Return text for indexing from the fields: every field value,
    # converted to a string and joined with spaces.
    def text
      fields.keys.collect { |key| fields[key].to_s }.join(' ')
    end

    # Two XapianDocs are equal when they have the same id. Docs that have
    # no id yet are only equal to themselves, so that two unrelated docs
    # do not compare equal just because both ids are nil.
    def ==(b)
      return super(b) unless b.is_a?(XapianDoc)
      id.nil? ? equal?(b) : id == b.id
    end

    def inspect
      "<#{self.class.to_s} id=#{id}>"
    end

    private

    # Populate @data and @fields from the marshalled payload of a
    # Xapian::Document. A payload that cannot be unmarshalled leaves
    # data as nil and fields empty.
    #
    # NOTE(review): Marshal.load must not be used on untrusted input;
    # this assumes the database contents are trusted - confirm.
    def load_stored_data(raw)
      begin
        payload = Marshal.load(raw) unless raw.empty?
      rescue ArgumentError, TypeError
        # Marshal raises TypeError as well as ArgumentError for data it
        # cannot read (e.g. data not written by Marshal.dump).
        payload = nil
      end
      if payload.is_a? Hash
        # Stored fields share the hash with the extra data, which lives
        # under the reserved :__data key.
        @data = payload.delete(:__data)
        @fields = payload
      else
        @data = payload
      end
    end

    # Marshal the fields listed in db.store_fields (plus the extra data,
    # if any) into the Xapian document's data.
    def add_stored_fields_to_xapian_doc(xdoc)
      stored_fields = fields.reject { |k,v| ! db.store_fields.include? k }
      stored_fields[:__data] = data if data
      xdoc.data = Marshal.dump(stored_fields) unless stored_fields.empty?
      xdoc
    end

    # Add the fields listed in db.store_values to the Xapian document as
    # values, as strings, numbered by the hash of the field name.
    def add_stored_values_to_xapian_doc(xdoc)
      stored_values = fields.reject { |k,v| ! db.store_values.include? k }
      stored_values.each do |k,v|
        xdoc.add_value(k.to_s.hash, v.to_s)
      end
      xdoc
    end
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1,5 @@
1
+ --colour
2
+ --format n
3
+ --loadby mtime
4
+ --reverse
5
+
@@ -0,0 +1,295 @@
1
+ require 'xapian'
2
+ require 'lib/xapian_fu.rb'
3
+ include XapianFu
4
+ require 'fileutils'
5
+
6
+ # Will be deleted
7
+ tmp_dir = '/tmp/xapian_fu_test.db'
8
+
9
+ describe XapianDb do
10
+ before do
11
+ FileUtils.rm_rf tmp_dir if File.exists?(tmp_dir)
12
+ end
13
+
14
+ it "should make an in-memory database by default" do
15
+ xdb = XapianDb.new
16
+ xdb.ro.should be_a_kind_of(Xapian::Database)
17
+ xdb.rw.should === xdb.ro
18
+ end
19
+
20
+ it "should make an on-disk database when given a :dir option" do
21
+ xdb = XapianDb.new(:dir => tmp_dir, :create => true)
22
+ File.exists?(tmp_dir).should be_true
23
+ xdb.should respond_to(:dir)
24
+ xdb.dir.should == tmp_dir
25
+ xdb.rw.should be_a_kind_of(Xapian::WritableDatabase)
26
+ xdb.ro.should be_a_kind_of(Xapian::Database)
27
+ end
28
+
29
+ it "should flush documents to the index when flush is called" do
30
+ xdb = XapianDb.new(:dir => tmp_dir, :create => true)
31
+ xdb.size.should == 0
32
+ xdb << "Once upon a time"
33
+ xdb.size.should == 0
34
+ xdb.flush
35
+ xdb.size.should == 1
36
+ end
37
+
38
+ it "should support transactions" do
39
+ xdb = XapianDb.new(:dir => tmp_dir, :create => true)
40
+ xdb << "Once upon a time"
41
+ xdb.transaction do
42
+ xdb << "Once upon a time"
43
+ xdb.size.should == 1
44
+ end
45
+ xdb.flush
46
+ xdb.size.should == 2
47
+ end
48
+
49
+ it "should serialize attempts at concurrent transactions" do
50
+ xdb = XapianDb.new(:dir => tmp_dir, :create => true)
51
+ thread = Thread.new do
52
+ xdb.transaction do
53
+ sleep 0.1
54
+ xdb << "Once upon a time"
55
+ sleep 0.1
56
+ xdb << "Once upon a time"
57
+ end
58
+ end
59
+ xdb.transaction do
60
+ xdb << "Once upon a time"
61
+ sleep 0.1
62
+ xdb << "Once upon a time"
63
+ end
64
+ thread.join
65
+ xdb.flush
66
+ xdb.size.should == 4
67
+ end
68
+
69
+ it "should abort a transaction on an exception" do
70
+ xdb = XapianDb.new(:dir => tmp_dir, :create => true)
71
+ xdb << "Once upon a time"
72
+ begin
73
+ xdb.transaction do
74
+ xdb << "Once upon a time"
75
+ raise StandardError
76
+ end
77
+ rescue StandardError
78
+ end
79
+ xdb.flush
80
+ xdb.size.should == 1
81
+ end
82
+
83
+ it "should index a XapianDoc" do
84
+ xdb = XapianDb.new
85
+ xdb << XapianDoc.new({ :text => "once upon a time", :title => "A story" })
86
+ xdb.flush
87
+ xdb.size.should == 1
88
+ end
89
+
90
+ it "should index a Hash" do
91
+ xdb = XapianDb.new
92
+ xdb << { :text => "once upon a time", :title => "A story" }
93
+ xdb.flush
94
+ xdb.size.should == 1
95
+ end
96
+
97
+ it "should index a string" do
98
+ xdb = XapianDb.new
99
+ xdb << "once upon a time"
100
+ xdb.size.should == 1
101
+ xdb << XapianDoc.new("once upon a time")
102
+ xdb.size.should == 2
103
+ end
104
+
105
+ it "should raise a XapianFu::DocNotFound error on find if the document doesn't exist" do
106
+ xdb = XapianDb.new
107
+ xdb << "once upon a time"
108
+ xdb.flush
109
+ lambda { xdb.documents.find(10) }.should raise_error XapianFu::DocNotFound
110
+ end
111
+
112
+ it "should retrieve documents with the find method" do
113
+ xdb = XapianDb.new
114
+ xdb << "Once upon a time"
115
+ xdb.flush
116
+ xdb.documents.find(1).should be_a_kind_of(XapianDoc)
117
+ end
118
+
119
+ it "should retrieve documents like an array and return a XapianDoc" do
120
+ xdb = XapianDb.new
121
+ xdb << "once upon a time"
122
+ xdb.flush
123
+ xdb.documents[1].should be_a_kind_of(XapianDoc)
124
+ end
125
+
126
+ it "should provide the id of retrieved documents" do
127
+ xdb = XapianDb.new
128
+ xdb << "once upon a time"
129
+ xdb.documents[1].id.should == 1
130
+ end
131
+
132
+ it "should store data in the database" do
133
+ xdb = XapianDb.new
134
+ xdb << XapianDoc.new({ :text => "once upon a time" }, :data => { :thing => 0xdeadbeef })
135
+ xdb.size.should == 1
136
+ doc = xdb.documents[1]
137
+ doc.data.should == { :thing => 0xdeadbeef }
138
+ end
139
+
140
+ it "should return a XapianDoc with an id after indexing" do
141
+ xdb = XapianDb.new
142
+ doc = XapianDoc.new("once upon a time")
143
+ doc.id.should == nil
144
+ new_doc = xdb << doc
145
+ new_doc.id.should == 1
146
+ end
147
+
148
+ it "should replace docs that already have an id when adding to the db" do
149
+ xdb = XapianDb.new
150
+ doc = xdb << XapianDoc.new("Once upon a time")
151
+ xdb.flush
152
+ xdb.size.should == 1
153
+ doc.id.should == 1
154
+ updated_doc = xdb << doc
155
+ xdb.flush
156
+ xdb.size.should == 1
157
+ updated_doc.id.should == doc.id
158
+ end
159
+
160
+ it "should delete docs by id" do
161
+ xdb = XapianDb.new
162
+ doc = xdb << XapianDoc.new("Once upon a time")
163
+ xdb.flush
164
+ xdb.size.should == 1
165
+ xdb.documents.delete(doc.id).should == 1
166
+ xdb.flush
167
+ xdb.size.should == 0
168
+ end
169
+
170
+ it "should handle being asked to delete docs that don't exist in the db" do
171
+ xdb = XapianDb.new
172
+ doc = xdb << XapianDoc.new("Once upon a time")
173
+ xdb.flush
174
+ xdb.documents.delete(100000).should == nil
175
+ end
176
+
177
+ it "should add new docs with the given id" do
178
+ xdb = XapianDb.new
179
+ doc = xdb << XapianDoc.new(:id => 0xbeef, :title => "Once upon a time")
180
+ xdb.flush
181
+ xdb.documents[0xbeef].id.should == 0xbeef
182
+ doc.id.should == 0xbeef
183
+ end
184
+
185
+ it "should tokenize strings" do
186
+ xdb = XapianDb.new
187
+ doc = xdb << XapianDoc.new("once upon a time")
188
+ doc.terms.should be_a_kind_of Array
189
+ doc.terms.last.should be_a_kind_of Xapian::Term
190
+ doc.terms.last.term.should == "upon"
191
+ end
192
+
193
+ it "should tokenize a hash" do
194
+ xdb = XapianDb.new
195
+ doc = xdb << XapianDoc.new(:title => 'once upon a time')
196
+ doc.terms.should be_a_kind_of Array
197
+ doc.terms.last.should be_a_kind_of Xapian::Term
198
+ doc.terms.last.term.should == "upon"
199
+ end
200
+
201
+ it "should return a list of XapianDocs with the weight and match set when returning search results" do
202
+ xdb = XapianDb.new
203
+ xdb << XapianDoc.new(:title => 'once upon a time')
204
+ xdb << XapianDoc.new(:title => 'three little pings')
205
+ results = xdb.search("pings")
206
+ results.should be_a_kind_of Array
207
+ results.size.should == 1
208
+ results.first.should be_a_kind_of XapianDoc
209
+ results.first.match.should be_a_kind_of Xapian::Match
210
+ results.first.weight.should be_a_kind_of Float
211
+ end
212
+
213
+ it "should support searching with :page and :per_page options" do
214
+ xdb = XapianDb.new
215
+ content = "word"
216
+ 200.times { xdb << XapianDoc.new(content) }
217
+ xdb.size.should == 200
218
+ results = xdb.search(content, :page => 1, :per_page => 12)
219
+ results.first.id.should == 1
220
+ results.size.should == 12
221
+ results = xdb.search(content, :page => 5, :per_page => 18)
222
+ results.first.id.should == 18 * 4 + 1
223
+ results.size.should == 18
224
+ results = xdb.search(content, :page => 100, :per_page => 12)
225
+ results.size.should == 0
226
+ end
227
+
228
+ it "should store no fields by default" do
229
+ xdb = XapianDb.new
230
+ xdb << XapianDoc.new(:title => "Once upon a time")
231
+ xdb.flush
232
+ xdb.documents.find(1).fields[:title].should be_nil
233
+ end
234
+
235
+ it "should store fields declared as to be stored" do
236
+ xdb = XapianDb.new(:store => :title)
237
+ xdb << XapianDoc.new(:title => "Once upon a time", :author => "Jim Jones")
238
+ xdb.flush
239
+ doc = xdb.documents.find(1)
240
+ doc.fields[:title].should == "Once upon a time"
241
+ doc.fields[:author].should be_nil
242
+ end
243
+
244
+ it "should store values declared as to be sortable" do
245
+ xdb = XapianDb.new(:sortable => :created_at)
246
+ time = Time.now
247
+ xdb << XapianDoc.new(:created_at => time.to_i.to_s, :author => "Jim Jones")
248
+ xdb.flush
249
+ doc = xdb.documents.find(1)
250
+ doc.get_value(:created_at).should == time.to_i.to_s
251
+ end
252
+
253
+ it "should store values declared as to be collapsible" do
254
+ xdb = XapianDb.new(:collapsible => :group_id)
255
+ xdb << XapianDoc.new(:group_id => "666", :author => "Jim Jones")
256
+ xdb.flush
257
+ doc = xdb.documents.find(1)
258
+ doc.get_value(:group_id).should == "666"
259
+ end
260
+
261
+ describe "search results sort order" do
262
+ before(:each) do
263
+ @xdb = XapianDb.new(:sortable => :number)
264
+ @expected_results = []
265
+ @expected_results << (@xdb << XapianDoc.new(:words => "cow dog cat", :number => 1))
266
+ @expected_results << (@xdb << XapianDoc.new(:words => "cow dog", :number => 3))
267
+ @expected_results << (@xdb << XapianDoc.new(:words => "cow", :number => 2))
268
+ end
269
+
270
+ it "should be by relevance by default" do
271
+ results = @xdb.search("cow dog cat")
272
+ results.should == @expected_results
273
+ end
274
+
275
+ it "should be by the value specified in descending numerical order" do
276
+ results = @xdb.search("cow dog cat", :order => :number)
277
+ results.should == @expected_results.sort_by { |r| r.fields[:number] }
278
+ end
279
+
280
+ it "should be reversed when the reverse option is set to true" do
281
+ results = @xdb.search("cow dog cat", :order => :number, :reverse => true)
282
+ results.should == @expected_results.sort_by { |r| r.fields[:number] }.reverse
283
+ end
284
+ end
285
+
286
+ it "should collapse results by the value specified by the :collapse option" do
287
+ xdb = XapianDb.new(:collapsible => :group)
288
+ alpha1 = xdb << XapianDoc.new(:words => "cow dog cat", :group => "alpha")
289
+ alpha2 = xdb << XapianDoc.new(:words => "cow dog", :group => "alpha")
290
+ beta1 = xdb << XapianDoc.new(:words => "cow", :group => "beta")
291
+ results = xdb.search("cow dog cat", :collapse => :group)
292
+ results.should == [alpha1, beta1]
293
+ end
294
+
295
+ end
@@ -0,0 +1,16 @@
1
+ require 'xapian'
2
+ require 'lib/xapian_fu.rb'
3
+ include XapianFu
4
+ require 'fileutils'
5
+
6
# Equality of XapianDoc objects is decided by their ids.
describe XapianDoc do

  it "should be equal to other XapianDoc objects with the same id" do
    first = XapianDoc.new(:id => 666)
    second = XapianDoc.new(:id => 666)
    first.should == second
  end

  it "should not be equal to other XapianDoc objects with different ids" do
    first = XapianDoc.new(:id => 666)
    other = XapianDoc.new(:id => 667)
    first.should_not == other
  end

end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xapian-fu
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.2"
5
+ platform: ruby
6
+ authors:
7
+ - John Leach
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-20 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: A library to provide a more Ruby-like interface to the Xapian search engine.
17
+ email: john@johnleach.co.uk
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README.rdoc
24
+ - LICENSE
25
+ files:
26
+ - lib/xapian_fu.rb
27
+ - lib/xapian_fu
28
+ - lib/xapian_fu/xapian_db.rb
29
+ - lib/xapian_fu/xapian_doc.rb
30
+ - examples/ar_spider.rb
31
+ - examples/query.rb
32
+ - examples/spider.rb
33
+ - examples/ar_query.rb
34
+ - README.rdoc
35
+ - LICENSE
36
+ has_rdoc: true
37
+ homepage: http://github.com/johnl/xapian-fu/tree/master
38
+ post_install_message:
39
+ rdoc_options:
40
+ - --title
41
+ - Xapian Fu
42
+ - --main
43
+ - README.rdoc
44
+ - --line-numbers
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: "0"
52
+ version:
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: "0"
58
+ version:
59
+ requirements: []
60
+
61
+ rubyforge_project: xapian-fu
62
+ rubygems_version: 1.3.1
63
+ signing_key:
64
+ specification_version: 2
65
+ summary: A Ruby interface to the Xapian search engine
66
+ test_files:
67
+ - spec/xapian_doc_spec.rb
68
+ - spec/xapian_db_spec.rb
69
+ - spec/spec.opts