xapian-fu 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ xapian_fu is released under the MIT License.
2
+
3
+ Copyright (c) 2009 John Leach
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of the xapian_fu software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including without
8
+ limitation the rights to use, copy, modify, merge, publish, distribute,
9
+ sublicense, and/or sell copies of the Software, and to permit persons to whom
10
+ the Software is furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,39 @@
1
+ = Xapian Fu
2
+
3
+ XapianFu is a Ruby library for working with
4
+ {Xapian}[http://xapian.org/] databases. It builds on the GPL licensed
5
+ Xapian Ruby bindings but provides an interface more in-line with "The
6
+ Ruby Way"(tm).
7
+
8
+ == Example
9
+
10
+ Create a database, add 3 documents to it and then search and retrieve
11
+ them.
12
+
13
+ db = XapianDb.new(:dir => 'example.db', :create => true,
14
+ :store => [:title, :year])
15
+ db << { :title => 'Brokeback Mountain', :year => 2005 }
16
+ db << { :title => 'Cold Mountain', :year => 2004 }
17
+ db << { :title => 'Yes Man', :year => 2008 }
18
+ db.search("mountain").each do |match|
19
+ puts match.fields[:title]
20
+ end
21
+
22
+ == ActiveRecord Example
23
+
24
+ You could use it with something like ActiveRecord to index database
25
+ records:
26
+
27
+ db = XapianDb.new(:dir => 'posts.db', :create => true,
28
+ :store => :id)
29
+ Post.all.each { |p| db << p.attributes }
30
+ db.search("custard").collect do |doc|
31
+ Post.find(doc.id)
32
+ end
33
+
34
+ = More Info
35
+
36
+ Author:: John Leach (mailto:john@johnleach.co.uk)
37
+ Copyright:: Copyright (c) 2009 John Leach
38
+ License:: MIT (the Xapian library and its Ruby bindings are GPL licensed)
39
+ Github:: http://github.com/johnl/xapian-fu/tree/master
@@ -0,0 +1,35 @@
1
#!/usr/bin/ruby
#
# Example: search the Xapian index in 'ar_spider.db' and print the title
# of each matching WordPress post alongside its search weight.
#
# Usage: ruby ar_query.rb <search terms...>

require 'rubygems'
require 'benchmark'
# Resolve the library relative to this file so the script runs from any
# working directory, and on Rubies where "." is not in the load path.
require File.expand_path('../lib/xapian_fu', File.dirname(__FILE__))
include XapianFu
require 'active_record'

ActiveRecord::Base.establish_connection(
  :adapter => "mysql",
  :host => "localhost",
  :username => "john",
  :password => "john",
  :database => "john_fametastic_dev" )

class WpPost < ActiveRecord::Base
  set_primary_key :ID
end

#puts WpPost.new.attributes.keys.join(' ')
db = XapianDb.new(:dir => 'ar_spider.db')

results = nil
bm = Benchmark.measure do
  results = db.search(ARGV.join(' '))
end

# Look posts up by id rather than by array position: find() with a list
# of ids may not return the rows in the order the ids were given, which
# would pair weights with the wrong titles.
posts_by_id = {}
WpPost.find(results.collect { |r| r.id }).each do |post|
  posts_by_id[post.id] = post
end

puts "Weight\tTitle"
results.each do |result|
  post = posts_by_id[result.id]
  next unless post
  # The title is passed as a format argument, not interpolated into the
  # format string, so a "%" in a title cannot break the formatting.
  puts "%.3f\t%s" % [result.weight, post.post_title]
end

puts "Search took %.5f seconds" % bm.total
@@ -0,0 +1,37 @@
1
#!/usr/bin/ruby
#
# Example: index every row of the wp_posts table into the Xapian
# database 'ar_spider.db', overwriting any existing database, and report
# the indexing throughput.

require 'rubygems'
require 'benchmark'
# Resolve the library relative to this file so the script runs from any
# working directory, and on Rubies where "." is not in the load path.
require File.expand_path('../lib/xapian_fu', File.dirname(__FILE__))
include XapianFu
require 'active_record'

ActiveRecord::Base.establish_connection(
  :adapter => "mysql",
  :host => "localhost",
  :username => "john",
  :password => "john",
  :database => "john_fametastic_dev" )

class WpPost < ActiveRecord::Base
  set_primary_key :ID
end

#puts WpPost.new.attributes.keys.join(' ')
db = XapianDb.new(:dir => 'ar_spider.db', :overwrite => true)

count = 0
indexing_time = 0.0
WpPost.find_in_batches do |posts|
  # One transaction per batch, so each batch is committed atomically.
  db.transaction do
    batch_start = count
    count += posts.size
    puts "Indexing wp_posts #{batch_start} to #{count}"
    posts.each do |post|
      bm = Benchmark.measure do
        # Pass the primary key as :id so the post id is used as the
        # Xapian document id.
        db << XapianDoc.new(post.attributes.merge({ :id => post.id }))
      end
      indexing_time += bm.total
    end
  end
  # flush must run outside the transaction block (it raises while a
  # transaction is in progress); its cost counts towards indexing time.
  indexing_time += Benchmark.measure { db.flush }.total
end
puts "%i documents took %.4f seconds. %.2f per second" % [count, indexing_time, count / indexing_time]
data/examples/query.rb ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/ruby
#
# Example: search the Xapian index in 'spider.db' (as built by
# spider.rb) and print each match's weight and stored filename.
#
# Usage: ruby query.rb <search terms...>

require 'rubygems'
require 'benchmark'
# Resolve the library relative to this file so the script runs from any
# working directory, and on Rubies where "." is not in the load path.
require File.expand_path('../lib/xapian_fu', File.dirname(__FILE__))

query_string = ARGV.join(" ")
db = XapianFu::XapianDb.new(:dir => 'spider.db')
results = nil
bm = Benchmark.measure { results = db.search(query_string) }
puts "Weight\tFilename"
results.each do |result|
  puts "%.2f\t%s" % [result.weight, result.fields[:filename]]
end
puts "Search took %.5f seconds" % bm.total
16
+
@@ -0,0 +1,28 @@
1
#!/usr/bin/ruby
#
# Example: walk a directory tree and index the first 10KB of every
# matching text-like file into the Xapian database 'spider.db',
# overwriting any existing database. Stops after 10000 documents.
#
# Usage: ruby spider.rb [base_path]   (base_path defaults to ".")

require 'rubygems'
require 'benchmark'
# Resolve the library relative to this file so the script runs from any
# working directory, and on Rubies where "." is not in the load path.
require File.expand_path('../lib/xapian_fu', File.dirname(__FILE__))

# Only :filename is stored with each document; :text and :filesize are
# indexed for searching but not stored.
db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => :filename,
                            :overwrite => true)

base_path = ARGV[0] || '.'

docs = 0
indexing_time = 0.0
Dir.glob(File.join(base_path, "/**/*")) do |filename|
  next unless File.file?(filename)
  next unless filename =~ /\.(txt|doc|README|c|h|rb|py|note|xml)$/i
  puts "Indexing #{filename}"
  # Read at most the first 10KB of each file.
  text = File.open(filename) { |f| f.read(10 * 1024) }
  bm = Benchmark.measure do
    db << XapianFu::XapianDoc.new({:text => text, :filename => filename,
                                   :filesize => File.size(filename) })
  end
  indexing_time += bm.total
  docs += 1
  break if docs == 10000
end
indexing_time += Benchmark.measure { db.flush }.total
puts "#{docs} docs indexed in #{indexing_time} seconds (#{docs / indexing_time} docs per second)"
data/lib/xapian_fu.rb ADDED
@@ -0,0 +1,3 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), 'xapian_fu')
2
+ require 'xapian_db'
3
+ require 'xapian_doc'
@@ -0,0 +1,193 @@
1
+ module XapianFu
2
+ class XapianFuError < StandardError ; end
3
+
4
+ require 'xapian'
5
+ require 'xapian_doc'
6
+ require 'thread'
7
+
8
+ class ConcurrencyError < XapianFuError ; end
9
+ class DocNotFound < XapianFuError ; end
10
+
11
# A wrapper around a Xapian database offering a more Ruby-like
# interface for adding documents, searching and transactions.
#
# Options accepted by the constructor:
#
# <tt>:dir</tt>:: path of the on-disk database. Without it an
#                 in-memory database is used.
# <tt>:create</tt>:: create the database if it doesn't exist.
# <tt>:overwrite</tt>:: create the database, replacing any existing one.
# <tt>:store</tt>:: a field name, or array of field names, whose values
#                   are stored with each document.
# <tt>:sortable</tt>:: a field name, or array of field names, stored as
#                      Xapian values so results can be ordered by them.
# <tt>:collapsible</tt>:: a field name, or array of field names, stored
#                         as Xapian values so results can be collapsed
#                         on them.
class XapianDb
  # query_parser is defined as an explicit method below, so it is not
  # listed here (listing it only caused a method redefinition).
  attr_reader :dir, :db_flag
  attr_reader :store_fields, :store_values

  def initialize( options = { } )
    @dir = options[:dir]
    @db_flag = Xapian::DB_OPEN
    @db_flag = Xapian::DB_CREATE_OR_OPEN if options[:create]
    @db_flag = Xapian::DB_CREATE_OR_OVERWRITE if options[:overwrite]
    # Accept either a single field name or an array of names. Wrapping
    # and flattening keeps an array argument flat instead of nesting it
    # as a single element, which would make every include? check fail.
    @store_fields = [options[:store]].flatten.compact
    @store_values = [options[:sortable], options[:collapsible]].flatten.compact
    @tx_mutex = Mutex.new
    # Opening and flushing the writable database makes sure a newly
    # created database exists before the read-only handle is opened.
    rw.flush if options[:create]
  end

  # Return the writable Xapian database
  def rw
    @rw ||= setup_rw_db
  end

  # Return the read-only Xapian database
  def ro
    @ro ||= setup_ro_db
  end

  # Return the number of docs in the Xapian database
  def size
    ro.doccount
  end

  # Return the XapianDocumentsAccessor for this database
  def documents
    @documents_accessor ||= XapianDocumentsAccessor.new(self)
  end

  # Add a document to the index. A document can be just a hash, the
  # keys representing field names and their values the data to be
  # indexed. Or it can be a XapianDoc, or any object with a to_s method.
  #
  # The text of the document is indexed, while the Xapian document
  # itself is built by XapianDoc#to_xapian_document using this
  # database's store_fields and store_values settings.
  #
  # If the document has an id, any existing document with that id is
  # replaced. Otherwise a new document is added and the id allocated by
  # Xapian is assigned to it. Returns the XapianDoc.
  def add_doc(doc)
    doc = XapianDoc.new(doc) unless doc.is_a? XapianDoc
    doc.db = self
    xdoc = doc.to_xapian_document
    tg = Xapian::TermGenerator.new
    tg.database = rw
    tg.document = xdoc
    tg.index_text( doc.text )
    if doc.id
      rw.replace_document(doc.id, xdoc)
    else
      doc.id = rw.add_document(xdoc)
    end
    doc
  end
  alias_method "<<", :add_doc

  # Conduct a search on the Xapian database, returning an array of
  # XapianDoc objects for the matches.
  #
  # Options:
  #
  # <tt>:page</tt>:: 1-based page of results to return (default 1)
  # <tt>:per_page</tt>:: number of results per page (default 10)
  # <tt>:order</tt>:: name of a sortable field to order the results by
  # <tt>:reverse</tt>:: passed to Xapian with <tt>:order</tt> (default false)
  # <tt>:collapse</tt>:: name of a collapsible field to collapse results on
  def search(q, options = {})
    defaults = { :page => 1, :per_page => 10, :reverse => false }
    options = defaults.merge(options)
    page = options[:page].respond_to?(:to_i) ? options[:page].to_i : 1
    page = page > 1 ? page - 1 : 0
    per_page = options[:per_page].respond_to?(:to_i) ? options[:per_page].to_i : 10
    offset = page * per_page
    # Parser flags form a bitmask and must be combined with bitwise OR.
    # (A logical && would evaluate to the second flag only.)
    flags = Xapian::QueryParser::FLAG_WILDCARD | Xapian::QueryParser::FLAG_LOVEHATE
    query = query_parser.parse_query(q, flags)
    # Start from a fresh Enquire for every search, so the sort order or
    # collapse key of an earlier search can't leak into this one. It is
    # kept in @enquiry so the enquiry method returns the object used
    # for the most recent search.
    enq = @enquiry = Xapian::Enquire.new(ro)
    if options[:order]
      enq.sort_by_value!(options[:order].to_s.hash, options[:reverse])
    end
    if options[:collapse]
      enq.collapse_key = options[:collapse].to_s.hash
    end
    enq.query = query
    enq.mset(offset, per_page).matches.collect { |m| XapianDoc.new(m) }
  end

  # Run the given block in a XapianDB transaction. Any changes to the
  # Xapian database made in the block will be atomically committed at the end.
  #
  # If an exception is raised by the block, all changes are discarded and the
  # exception re-raised.
  #
  # Xapian does not support multiple concurrent transactions on the
  # same Xapian database. Any attempts at this will be serialized by
  # XapianFu, which is not perfect but probably better than just
  # kicking up an exception.
  #
  def transaction
    @tx_mutex.synchronize do
      # If beginning the transaction fails there is nothing to cancel,
      # so this call stays outside the rescue below.
      rw.begin_transaction
      begin
        yield
        rw.commit_transaction
      rescue Exception
        # Cancel while still holding the mutex, so a transaction begun
        # by another thread can never be the one cancelled. Exception
        # (not just StandardError) is rescued deliberately: whatever
        # happens the changes are discarded, then the error is re-raised.
        rw.cancel_transaction
        raise
      end
    end
  end

  # Flush any changes to disk and reopen the read-only database.
  # Raises ConcurrencyError if a transaction is in process
  def flush
    raise ConcurrencyError if @tx_mutex.locked?
    rw.flush
    ro.reopen
  end

  # Return the Xapian::QueryParser for this database, creating it on
  # first use.
  def query_parser
    unless @query_parser
      @query_parser = Xapian::QueryParser.new
      @query_parser.database = ro
    end
    @query_parser
  end

  # Return the Xapian::Enquire used for the most recent search, or a
  # new one if no search has been made yet.
  def enquiry
    @enquiry ||= Xapian::Enquire.new(ro)
  end

  private

  # Open the writable database: on disk if a dir was given, otherwise
  # in memory.
  def setup_rw_db
    if dir
      @rw = Xapian::WritableDatabase.new(dir, db_flag)
    else
      # In memory database
      @rw = Xapian::inmemory_open
    end
  end

  # Open the read-only database. An in-memory database has only the
  # one handle, so the writable database is used for reading too.
  def setup_ro_db
    if dir
      @ro = Xapian::Database.new(dir)
    else
      # In memory db
      @ro = rw
    end
  end

  # Provides access to the documents of a XapianDb by document id.
  class XapianDocumentsAccessor
    def initialize(xdb)
      @xdb = xdb
    end

    # Return the document with the given id from the
    # database. Raises a XapianFu::DocNotFound exception
    # if it doesn't exist.
    def find(doc_id)
      xdoc = @xdb.ro.document(doc_id)
      XapianDoc.new(xdoc)
    rescue RuntimeError => e
      # A missing document is recognised by the message of the
      # RuntimeError; any other error is re-raised untouched.
      raise e.to_s =~ /^DocNotFoundError/ ? XapianFu::DocNotFound : e
    end

    # Return the document with the given id from the database or nil
    # if it doesn't exist
    def [](doc_id)
      find(doc_id)
    rescue XapianFu::DocNotFound
      nil
    end

    # Delete the document with the given id from the database and
    # return the id, or nil if it doesn't exist. The argument must
    # respond to to_i; anything else is ignored and nil is returned.
    def delete(doc)
      if doc.respond_to?(:to_i)
        @xdb.rw.delete_document(doc.to_i)
        doc.to_i
      end
    rescue RuntimeError => e
      raise e unless e.to_s =~ /^DocNotFoundError/
    end
  end
end
192
+
193
+ end
@@ -0,0 +1,117 @@
1
+ module XapianFu
2
+
3
+ class XapianDbNotSet < XapianFuError ; end
4
+ class XapianDocNotSet < XapianFuError ; end
5
+ class XapianTypeError < XapianFuError ; end
6
+
7
# A document that is to be added to, or has been retrieved from, a
# XapianDb.
class XapianDoc
  attr_reader :fields, :data, :weight, :match
  attr_reader :xapian_document
  attr_accessor :id, :db

  # Expects a Xapian::Match, a Xapian::Document, a Hash-like object, or
  # anything with a to_s method. Anything else raises a XapianTypeError.
  # Options can be <tt>:weight</tt> to set the search weight or
  # <tt>:data</tt> to set some additional data to be stored with the
  # record in the database.
  def initialize(doc, options = {})
    @fields = {}
    # A search match carries the document plus its weight
    if doc.is_a? Xapian::Match
      @match = doc
      @weight = @match.weight
      doc = @match.document
    end

    # Handle initialisation from a Xapian::Document, which is
    # usually a search result from a Xapian database
    if doc.is_a?(Xapian::Document)
      @xapian_document = doc
      @id = doc.docid
      stored = unmarshal_stored_data(doc.data)
      if stored.is_a? Hash
        # Stored fields and the additional data share one hash, with
        # the additional data kept under the :__data key
        @data = stored.delete(:__data)
        @fields = stored
      else
        @data = stored
      end
    # Handle initialisation from a hash-like object
    elsif doc.respond_to?("[]") and doc.respond_to?(:has_key?)
      @fields = doc
      @id = doc[:id] if doc.has_key?(:id)
    # Handle initialisation from anything else that can be coerced
    # into a string
    elsif doc.respond_to? :to_s
      @fields = { :content => doc.to_s }
    else
      raise XapianTypeError, "Can't handle indexing a '#{doc.class}' object"
    end
    @weight = options[:weight] if options[:weight]
    @data = options[:data] if options[:data]
  end

  # Retrieve the given Xapian value from the Xapian document.
  # <tt>vkey</tt> can be a symbol or string, in which case it's hashed
  # to get an integer value number. Or you can give the integer value
  # number if you know it. Raises XapianDocNotSet if this doc was not
  # built from a Xapian::Document.
  #
  # NOTE(review): String#hash is seeded per process on Ruby 1.9 and
  # later, so value numbers derived this way are presumably not stable
  # between runs against an on-disk database. Changing the scheme needs
  # the same change wherever values are written and searched - confirm.
  def get_value(vkey)
    raise XapianDocNotSet unless @xapian_document
    vkey = vkey.to_s.hash unless vkey.is_a? Integer
    @xapian_document.value(vkey)
  end

  # Return a list of terms that the db has for this document. Requires
  # that the db attribute has been set up; returns nil if the document
  # has no id yet.
  def terms
    raise XapianFu::XapianDbNotSet unless db
    db.ro.termlist(id) if db.respond_to?(:ro) and db.ro and id
  end

  # Return a Xapian::Document ready for putting into a Xapian
  # database. Requires that the db attribute has been set up.
  def to_xapian_document
    raise XapianFu::XapianDbNotSet unless db
    xdoc = Xapian::Document.new
    add_stored_fields_to_xapian_doc(xdoc)
    add_stored_values_to_xapian_doc(xdoc)
    xdoc
  end

  # Return text for indexing from the fields: every field value as a
  # string, joined with spaces.
  def text
    fields.keys.collect { |key| fields[key].to_s }.join(' ')
  end

  # Two XapianDocs are equal when they have the same id. Comparison
  # with any other kind of object falls back to the default.
  def ==(b)
    if b.is_a?(XapianDoc)
      id == b.id
    else
      super(b)
    end
  end

  def inspect
    "<#{self.class.to_s} id=#{id}>"
  end

  private

  # Return the object marshalled in the given document data, or nil if
  # the data is empty or is not valid marshalled data.
  #
  # Marshal.load raises TypeError for data in a different format (for
  # example plain text written by another application) and
  # ArgumentError for truncated data. Both mean "nothing usable is
  # stored here", so neither should prevent the document being built.
  #
  # NOTE(review): Marshal.load must only ever see trusted data; this
  # assumes the database is only written to by trusted code - confirm.
  def unmarshal_stored_data(raw)
    return nil if raw.nil? || raw.empty?
    Marshal::load(raw)
  rescue ArgumentError, TypeError
    nil
  end

  # Marshal the fields the db is set to store, plus any additional
  # data, into the data of the given Xapian document.
  def add_stored_fields_to_xapian_doc(xdoc)
    stored_fields = fields.reject { |k,v| ! db.store_fields.include? k }
    stored_fields[:__data] = data if data
    xdoc.data = Marshal.dump(stored_fields) unless stored_fields.empty?
    xdoc
  end

  # Add the fields the db is set to store as values (for sorting and
  # collapsing) to the given Xapian document. The value number is the
  # hash of the field name and the value is stored as a string.
  def add_stored_values_to_xapian_doc(xdoc)
    stored_values = fields.reject { |k,v| ! db.store_values.include? k }
    stored_values.each do |k,v|
      xdoc.add_value(k.to_s.hash, v.to_s)
    end
    xdoc
  end
end
117
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,5 @@
1
+ --colour
2
+ --format n
3
+ --loadby mtime
4
+ --reverse
5
+
@@ -0,0 +1,295 @@
1
# Specs for XapianFu::XapianDb: database creation, indexing, document
# retrieval and deletion, transactions and searching.
require 'xapian'
# Resolve the library relative to this file so the specs run from any
# working directory, and on Rubies where "." is not in the load path.
require File.expand_path('../lib/xapian_fu', File.dirname(__FILE__))
include XapianFu
require 'fileutils'

# Will be deleted
tmp_dir = '/tmp/xapian_fu_test.db'

describe XapianDb do
  before do
    # rm_rf ignores a missing path, so no existence check is needed
    # (File.exists? is no longer available on current Rubies).
    FileUtils.rm_rf tmp_dir
  end

  it "should make an in-memory database by default" do
    xdb = XapianDb.new
    xdb.ro.should be_a_kind_of(Xapian::Database)
    xdb.rw.should === xdb.ro
  end

  it "should make an on-disk database when given a :dir option" do
    xdb = XapianDb.new(:dir => tmp_dir, :create => true)
    File.exist?(tmp_dir).should be_true
    xdb.should respond_to(:dir)
    xdb.dir.should == tmp_dir
    xdb.rw.should be_a_kind_of(Xapian::WritableDatabase)
    xdb.ro.should be_a_kind_of(Xapian::Database)
  end

  it "should flush documents to the index when flush is called" do
    xdb = XapianDb.new(:dir => tmp_dir, :create => true)
    xdb.size.should == 0
    xdb << "Once upon a time"
    xdb.size.should == 0
    xdb.flush
    xdb.size.should == 1
  end

  it "should support transactions" do
    xdb = XapianDb.new(:dir => tmp_dir, :create => true)
    xdb << "Once upon a time"
    xdb.transaction do
      xdb << "Once upon a time"
      xdb.size.should == 1
    end
    xdb.flush
    xdb.size.should == 2
  end

  it "should serialize attempts at concurrent transactions" do
    xdb = XapianDb.new(:dir => tmp_dir, :create => true)
    thread = Thread.new do
      xdb.transaction do
        sleep 0.1
        xdb << "Once upon a time"
        sleep 0.1
        xdb << "Once upon a time"
      end
    end
    xdb.transaction do
      xdb << "Once upon a time"
      sleep 0.1
      xdb << "Once upon a time"
    end
    thread.join
    xdb.flush
    xdb.size.should == 4
  end

  it "should abort a transaction on an exception" do
    xdb = XapianDb.new(:dir => tmp_dir, :create => true)
    xdb << "Once upon a time"
    begin
      xdb.transaction do
        xdb << "Once upon a time"
        raise StandardError
      end
    rescue StandardError
    end
    xdb.flush
    xdb.size.should == 1
  end

  it "should index a XapianDoc" do
    xdb = XapianDb.new
    xdb << XapianDoc.new({ :text => "once upon a time", :title => "A story" })
    xdb.flush
    xdb.size.should == 1
  end

  it "should index a Hash" do
    xdb = XapianDb.new
    xdb << { :text => "once upon a time", :title => "A story" }
    xdb.flush
    xdb.size.should == 1
  end

  it "should index a string" do
    xdb = XapianDb.new
    xdb << "once upon a time"
    xdb.size.should == 1
    xdb << XapianDoc.new("once upon a time")
    xdb.size.should == 2
  end

  it "should raise a XapianFu::DocNotFound error on find if the document doesn't exist" do
    xdb = XapianDb.new
    xdb << "once upon a time"
    xdb.flush
    lambda { xdb.documents.find(10) }.should raise_error XapianFu::DocNotFound
  end

  it "should retrieve documents with the find method" do
    xdb = XapianDb.new
    xdb << "Once upon a time"
    xdb.flush
    xdb.documents.find(1).should be_a_kind_of(XapianDoc)
  end

  it "should retrieve documents like an array and return a XapianDoc" do
    xdb = XapianDb.new
    xdb << "once upon a time"
    xdb.flush
    xdb.documents[1].should be_a_kind_of(XapianDoc)
  end

  it "should provide the id of retrieved documents" do
    xdb = XapianDb.new
    xdb << "once upon a time"
    xdb.documents[1].id.should == 1
  end

  it "should store data in the database" do
    xdb = XapianDb.new
    xdb << XapianDoc.new({ :text => "once upon a time" }, :data => { :thing => 0xdeadbeef })
    xdb.size.should == 1
    doc = xdb.documents[1]
    doc.data.should == { :thing => 0xdeadbeef }
  end

  it "should return a XapianDoc with an id after indexing" do
    xdb = XapianDb.new
    doc = XapianDoc.new("once upon a time")
    doc.id.should == nil
    new_doc = xdb << doc
    new_doc.id.should == 1
  end

  it "should replace docs that already have an id when adding to the db" do
    xdb = XapianDb.new
    doc = xdb << XapianDoc.new("Once upon a time")
    xdb.flush
    xdb.size.should == 1
    doc.id.should == 1
    updated_doc = xdb << doc
    xdb.flush
    xdb.size.should == 1
    updated_doc.id.should == doc.id
  end

  it "should delete docs by id" do
    xdb = XapianDb.new
    doc = xdb << XapianDoc.new("Once upon a time")
    xdb.flush
    xdb.size.should == 1
    xdb.documents.delete(doc.id).should == 1
    xdb.flush
    xdb.size.should == 0
  end

  it "should handle being asked to delete docs that don't exist in the db" do
    xdb = XapianDb.new
    doc = xdb << XapianDoc.new("Once upon a time")
    xdb.flush
    xdb.documents.delete(100000).should == nil
  end

  it "should add new docs with the given id" do
    xdb = XapianDb.new
    doc = xdb << XapianDoc.new(:id => 0xbeef, :title => "Once upon a time")
    xdb.flush
    xdb.documents[0xbeef].id.should == 0xbeef
    doc.id.should == 0xbeef
  end

  it "should tokenize strings" do
    xdb = XapianDb.new
    doc = xdb << XapianDoc.new("once upon a time")
    doc.terms.should be_a_kind_of Array
    doc.terms.last.should be_a_kind_of Xapian::Term
    doc.terms.last.term.should == "upon"
  end

  it "should tokenize a hash" do
    xdb = XapianDb.new
    doc = xdb << XapianDoc.new(:title => 'once upon a time')
    doc.terms.should be_a_kind_of Array
    doc.terms.last.should be_a_kind_of Xapian::Term
    doc.terms.last.term.should == "upon"
  end

  it "should return a list of XapianDocs with the weight and match set when returning search results" do
    xdb = XapianDb.new
    xdb << XapianDoc.new(:title => 'once upon a time')
    xdb << XapianDoc.new(:title => 'three little pings')
    results = xdb.search("pings")
    results.should be_a_kind_of Array
    results.size.should == 1
    results.first.should be_a_kind_of XapianDoc
    results.first.match.should be_a_kind_of Xapian::Match
    results.first.weight.should be_a_kind_of Float
  end

  it "should support searching with :page and :per_page options" do
    xdb = XapianDb.new
    content = "word"
    200.times { xdb << XapianDoc.new(content) }
    xdb.size.should == 200
    results = xdb.search(content, :page => 1, :per_page => 12)
    results.first.id.should == 1
    results.size.should == 12
    results = xdb.search(content, :page => 5, :per_page => 18)
    results.first.id.should == 18 * 4 + 1
    results.size.should == 18
    results = xdb.search(content, :page => 100, :per_page => 12)
    results.size.should == 0
  end

  it "should store no fields by default" do
    xdb = XapianDb.new
    xdb << XapianDoc.new(:title => "Once upon a time")
    xdb.flush
    xdb.documents.find(1).fields[:title].should be_nil
  end

  it "should store fields declared as to be stored" do
    xdb = XapianDb.new(:store => :title)
    xdb << XapianDoc.new(:title => "Once upon a time", :author => "Jim Jones")
    xdb.flush
    doc = xdb.documents.find(1)
    doc.fields[:title].should == "Once upon a time"
    doc.fields[:author].should be_nil
  end

  it "should store values declared as to be sortable" do
    xdb = XapianDb.new(:sortable => :created_at)
    time = Time.now
    xdb << XapianDoc.new(:created_at => time.to_i.to_s, :author => "Jim Jones")
    xdb.flush
    doc = xdb.documents.find(1)
    doc.get_value(:created_at).should == time.to_i.to_s
  end

  it "should store values declared as to be collapsible" do
    xdb = XapianDb.new(:collapsible => :group_id)
    xdb << XapianDoc.new(:group_id => "666", :author => "Jim Jones")
    xdb.flush
    doc = xdb.documents.find(1)
    doc.get_value(:group_id).should == "666"
  end

  describe "search results sort order" do
    before(:each) do
      @xdb = XapianDb.new(:sortable => :number)
      # Documents are added in order of decreasing relevance to the
      # query "cow dog cat" used by the examples below.
      @expected_results = []
      @expected_results << (@xdb << XapianDoc.new(:words => "cow dog cat", :number => 1))
      @expected_results << (@xdb << XapianDoc.new(:words => "cow dog", :number => 3))
      @expected_results << (@xdb << XapianDoc.new(:words => "cow", :number => 2))
    end

    it "should be by relevance by default" do
      results = @xdb.search("cow dog cat")
      results.should == @expected_results
    end

    it "should be by the value specified in descending numerical order" do
      results = @xdb.search("cow dog cat", :order => :number)
      results.should == @expected_results.sort_by { |r| r.fields[:number] }
    end

    it "should be reversed when the reverse option is set to true" do
      results = @xdb.search("cow dog cat", :order => :number, :reverse => true)
      results.should == @expected_results.sort_by { |r| r.fields[:number] }.reverse
    end
  end

  it "should collapse results by the value specified by the :collapse option" do
    xdb = XapianDb.new(:collapsible => :group)
    alpha1 = xdb << XapianDoc.new(:words => "cow dog cat", :group => "alpha")
    alpha2 = xdb << XapianDoc.new(:words => "cow dog", :group => "alpha")
    beta1 = xdb << XapianDoc.new(:words => "cow", :group => "beta")
    results = xdb.search("cow dog cat", :collapse => :group)
    results.should == [alpha1, beta1]
  end

end
@@ -0,0 +1,16 @@
1
# Specs for XapianFu::XapianDoc equality.
require 'xapian'
# Resolve the library relative to this file so the specs run from any
# working directory, and on Rubies where "." is not in the load path.
require File.expand_path('../lib/xapian_fu', File.dirname(__FILE__))
include XapianFu
require 'fileutils'

describe XapianDoc do

  it "should be equal to other XapianDoc objects with the same id" do
    XapianDoc.new(:id => 666).should == XapianDoc.new(:id => 666)
  end

  it "should not be equal to other XapianDoc objects with different ids" do
    XapianDoc.new(:id => 666).should_not == XapianDoc.new(:id => 667)
  end

end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xapian-fu
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.2"
5
+ platform: ruby
6
+ authors:
7
+ - John Leach
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-20 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: A library to provide a more Ruby-like interface to the Xapian search engine.
17
+ email: john@johnleach.co.uk
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README.rdoc
24
+ - LICENSE
25
+ files:
26
+ - lib/xapian_fu.rb
27
+ - lib/xapian_fu
28
+ - lib/xapian_fu/xapian_db.rb
29
+ - lib/xapian_fu/xapian_doc.rb
30
+ - examples/ar_spider.rb
31
+ - examples/query.rb
32
+ - examples/spider.rb
33
+ - examples/ar_query.rb
34
+ - README.rdoc
35
+ - LICENSE
36
+ has_rdoc: true
37
+ homepage: http://github.com/johnl/xapian-fu/tree/master
38
+ post_install_message:
39
+ rdoc_options:
40
+ - --title
41
+ - Xapian Fu
42
+ - --main
43
+ - README.rdoc
44
+ - --line-numbers
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: "0"
52
+ version:
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: "0"
58
+ version:
59
+ requirements: []
60
+
61
+ rubyforge_project: xapian-fu
62
+ rubygems_version: 1.3.1
63
+ signing_key:
64
+ specification_version: 2
65
+ summary: A Ruby interface to the Xapian search engine
66
+ test_files:
67
+ - spec/xapian_doc_spec.rb
68
+ - spec/xapian_db_spec.rb
69
+ - spec/spec.opts