xapian-fu 0.2 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +152 -13
- data/examples/query.rb +34 -6
- data/examples/spider.rb +44 -15
- data/lib/xapian_fu/query_parser.rb +179 -0
- data/lib/xapian_fu/result_set.rb +52 -0
- data/lib/xapian_fu/stopper_factory.rb +40 -0
- data/lib/xapian_fu/stopwords/README +7 -0
- data/lib/xapian_fu/stopwords/danish.txt +102 -0
- data/lib/xapian_fu/stopwords/dutch.txt +113 -0
- data/lib/xapian_fu/stopwords/english.txt +312 -0
- data/lib/xapian_fu/stopwords/finnish.txt +89 -0
- data/lib/xapian_fu/stopwords/french.txt +168 -0
- data/lib/xapian_fu/stopwords/german.txt +286 -0
- data/lib/xapian_fu/stopwords/hungarian.txt +203 -0
- data/lib/xapian_fu/stopwords/italian.txt +295 -0
- data/lib/xapian_fu/stopwords/norwegian.txt +186 -0
- data/lib/xapian_fu/stopwords/portuguese.txt +245 -0
- data/lib/xapian_fu/stopwords/russian.txt +236 -0
- data/lib/xapian_fu/stopwords/spanish.txt +348 -0
- data/lib/xapian_fu/stopwords/swedish.txt +125 -0
- data/lib/xapian_fu/stopwords/update.rb +7 -0
- data/lib/xapian_fu/xapian_db.rb +215 -99
- data/lib/xapian_fu/xapian_doc.rb +229 -47
- data/lib/xapian_fu/xapian_doc_value_accessor.rb +125 -0
- data/lib/xapian_fu/xapian_documents_accessor.rb +82 -0
- data/lib/xapian_fu.rb +1 -0
- data/spec/query_parser_spec.rb +43 -0
- data/spec/stopper_factory_spec.rb +57 -0
- data/spec/xapian_db_spec.rb +458 -215
- data/spec/xapian_doc_spec.rb +180 -0
- data/spec/xapian_doc_value_accessor_spec.rb +92 -0
- metadata +29 -5
data/README.rdoc
CHANGED
@@ -3,11 +3,33 @@
|
|
3
3
|
XapianFu is a Ruby library for working with
|
4
4
|
{Xapian}[http://xapian.org/] databases. It builds on the GPL licensed
|
5
5
|
Xapian Ruby bindings but provides an interface more in-line with "The
|
6
|
-
Ruby Way"(tm).
|
6
|
+
Ruby Way"(tm) and is considerably easier to use.
|
7
7
|
|
8
|
-
|
8
|
+
For example, you can work almost entirely with Hash objects - XapianFu
|
9
|
+
will handle converting the Hash keys into Xapian term prefixes when
|
10
|
+
indexing and when parsing queries.
|
9
11
|
|
10
|
-
|
12
|
+
It also handles storing and retrieving hash entries as
|
13
|
+
Xapian::Document values. XapianFu basically gives you a persistent
|
14
|
+
Hash with full text indexing (and ACID transactions).
|
15
|
+
|
16
|
+
== Installation
|
17
|
+
|
18
|
+
sudo gem install xapian-fu
|
19
|
+
|
20
|
+
== Documentation
|
21
|
+
|
22
|
+
XapianFu::XapianDb is the corner-stone of XapianFu. A XapianDb
|
23
|
+
instance will handle setting up a XapianFu::XapianDocumentsAccessor
|
24
|
+
for reading and writing documents from and to a Xapian database. It
|
25
|
+
makes use of XapianFu::QueryParser for parsing and setting up a query.
|
26
|
+
|
27
|
+
XapianFu::XapianDoc represents a document retrieved from or to be
|
28
|
+
added to a Xapian database.
|
29
|
+
|
30
|
+
== Basic usage example
|
31
|
+
|
32
|
+
Create a database, add 3 documents to it and then search and retrieve
|
11
33
|
them.
|
12
34
|
|
13
35
|
db = XapianDb.new(:dir => 'example.db', :create => true,
|
@@ -15,25 +37,142 @@ them.
|
|
15
37
|
db << { :title => 'Brokeback Mountain', :year => 2005 }
|
16
38
|
db << { :title => 'Cold Mountain', :year => 2004 }
|
17
39
|
db << { :title => 'Yes Man', :year => 2008 }
|
40
|
+
db.flush
|
18
41
|
db.search("mountain").each do |match|
|
19
|
-
puts match.
|
42
|
+
puts match.values[:title]
|
43
|
+
end
|
44
|
+
|
45
|
+
== Ordering of results
|
46
|
+
|
47
|
+
Create an in-memory database, add 3 documents to it and then search and retrieve
|
48
|
+
them in year order.
|
49
|
+
|
50
|
+
db = XapianDb.new(:store => [:title], :sortable => [:year])
|
51
|
+
db << { :title => 'Brokeback Mountain', :year => 2005 }
|
52
|
+
db << { :title => 'Cold Mountain', :year => 2004 }
|
53
|
+
db << { :title => 'Yes Man', :year => 2008 }
|
54
|
+
db.search("mountain", :order => :year)
|
55
|
+
|
56
|
+
== will_paginate support
|
57
|
+
|
58
|
+
Simple integration with the will_paginate Rails helpers.
|
59
|
+
|
60
|
+
@results = db.search("mountain", :page => 1, :per_page => 5)
|
61
|
+
will_paginate @results
|
62
|
+
|
63
|
+
== Transactions support
|
64
|
+
|
65
|
+
Ensure that a group of documents are either entirely added to the
|
66
|
+
database or not at all - the transaction is aborted if an exception is
|
67
|
+
raised inside the block. The documents only become available to
|
68
|
+
searches at the end of the block, when the transaction is committed.
|
69
|
+
|
70
|
+
db = XapianDb.new(:store => [:title, :year], :sortable => [:year])
|
71
|
+
db.transaction do
|
72
|
+
db << { :title => 'Brokeback Mountain', :year => 2005 }
|
73
|
+
db << { :title => 'Cold Mountain', :year => 2004 }
|
74
|
+
db << { :title => 'Yes Man', :year => 2008 }
|
20
75
|
end
|
76
|
+
db.search("mountain")
|
77
|
+
|
78
|
+
== Complete field definition examples
|
79
|
+
|
80
|
+
Fields can be described in more detail using a hash. For example,
|
81
|
+
telling XapianFu that a particular field is a Date, Fixnum or Bignum
|
82
|
+
will allow very efficient on-disk storage and will ensure the same
|
83
|
+
type of object is instantiated when returning those stored values.
|
84
|
+
And in the case of Fixnum and Bignum, allows you to order search
|
85
|
+
results without worrying about leading zeros.
|
86
|
+
|
87
|
+
db = XapianDb.new(:fields => {
|
88
|
+
:title => { :store => true },
|
89
|
+
:released => { :type => Date, :store => true },
|
90
|
+
:votes => { :type => Fixnum, :store => true }
|
91
|
+
})
|
92
|
+
db << { :title => 'Brokeback Mountain', :released => Date.parse('13th January 2006'), :votes => 105302 }
|
93
|
+
db << { :title => 'Cold Mountain', :released => Date.parse('2nd January 2004'), :votes => 45895 }
|
94
|
+
db << { :title => 'Yes Man', :released => Date.parse('26th December 2008'), :votes => 44936 }
|
95
|
+
db.search("mountain", :order => :votes)
|
96
|
+
|
97
|
+
== Simple max value queries
|
98
|
+
|
99
|
+
Find the document with the highest :year value
|
100
|
+
|
101
|
+
db.documents.max(:year)
|
102
|
+
|
103
|
+
== Search examples
|
104
|
+
|
105
|
+
Search on particular fields
|
106
|
+
|
107
|
+
db.search("title:mountain year:2005")
|
108
|
+
|
109
|
+
Boolean AND (default)
|
110
|
+
|
111
|
+
db.search("ruby AND rails")
|
112
|
+
db.search("ruby rails")
|
113
|
+
|
114
|
+
Boolean OR
|
115
|
+
|
116
|
+
db.search("rails OR sinatra")
|
117
|
+
db.search("rails sinatra", :default_op => :or)
|
118
|
+
|
119
|
+
Exclude certain terms
|
120
|
+
|
121
|
+
db.search("ruby -rails")
|
122
|
+
|
123
|
+
Wildcards
|
124
|
+
|
125
|
+
db.search("xap*")
|
126
|
+
|
127
|
+
Phrase searches
|
128
|
+
|
129
|
+
db.search("'someone dropped a steamer in the gene pool'")
|
130
|
+
|
131
|
+
And any combinations of the above:
|
132
|
+
|
133
|
+
db.search("(ruby OR sinatra) -rails xap*")
|
134
|
+
|
135
|
+
== ActiveRecord Integration
|
136
|
+
|
137
|
+
XapianFu always stores the :id field, so you can easily use it with
|
138
|
+
something like ActiveRecord to index database records:
|
139
|
+
|
140
|
+
db = XapianDb.new(:dir => 'posts.db', :create => true)
|
141
|
+
Post.all.each { |p| db << p.attributes }
|
142
|
+
docs = db.search("custard")
|
143
|
+
docs.each_with_index { |doc,i| docs[i] = Post.find(doc.id) }
|
144
|
+
|
145
|
+
Combine it with the max value search to do batch delta updates by primary key:
|
146
|
+
|
147
|
+
db = XapianDb.new(:dir => 'posts.db')
|
148
|
+
latest_doc = db.documents.max(:id)
|
149
|
+
new_posts = Post.find(:all, :conditions => ['id > ?', latest_doc.id])
|
150
|
+
new_posts.each { |p| db << p.attributes }
|
151
|
+
|
152
|
+
Or by :updated_at field if you prefer:
|
21
153
|
|
22
|
-
|
154
|
+
db = XapianDb.new(:dir => 'posts.db', :fields => { :updated_at => { :type => Time, :store => true } })
|
155
|
+
last_updated_doc = db.documents.max(:updated_at)
|
156
|
+
new_posts = Post.find(:all, :conditions => ['updated_at >= ?', last_updated_doc.updated_at])
|
157
|
+
new_posts.each { |p| db << p.attributes }
|
23
158
|
|
24
|
-
|
25
|
-
|
159
|
+
Deleted records won't show up in results but can eventually put your
|
160
|
+
result pagination out of whack. So, you'll need to track deletions
|
161
|
+
yourself, either with a deleted_at field, some kind of delete log or
|
162
|
+
perhaps by reindexing once in a while.
|
26
163
|
|
27
|
-
db = XapianDb.new(:dir => 'posts.db'
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
164
|
+
db = XapianDb.new(:dir => 'posts.db')
|
165
|
+
deleted_posts = Post.find(:all, :conditions => 'deleted_at is not null')
|
166
|
+
deleted_posts.each do |post|
|
167
|
+
db.documents.delete(post.id)
|
168
|
+
post.destroy
|
32
169
|
end
|
33
170
|
|
34
171
|
= More Info
|
35
172
|
|
36
173
|
Author:: John Leach (mailto:john@johnleach.co.uk)
|
37
174
|
Copyright:: Copyright (c) 2009 John Leach
|
38
|
-
License:: GPL
|
175
|
+
License:: MIT (The Xapian library is GPL)
|
176
|
+
Mailing list:: http://rubyforge.org/mailman/listinfo/xapian-fu-discuss
|
177
|
+
Web page:: http://johnleach.co.uk/documents/xapian-fu
|
39
178
|
Github:: http://github.com/johnl/xapian-fu/tree/master
|
data/examples/query.rb
CHANGED
@@ -1,16 +1,44 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
#
|
3
|
+
# Example file spider index searcher using XapianFu. Conducts a search
|
4
|
+
# on ./spider.db created with spider.rb.
|
5
|
+
#
|
6
|
+
# --order-by-filesize sorts the results by the file size, largest
|
7
|
+
# first. Default is to sort by relevance.
|
8
|
+
#
|
9
|
+
# All other command line arguments are used as the search query:
|
10
|
+
#
|
11
|
+
# query.rb --order-by-filesize mammoth -woolley
|
12
|
+
#
|
13
|
+
# You can limit queries to particular fields:
|
14
|
+
#
|
15
|
+
# query.rb filename:LICENSE text:BSD
|
16
|
+
#
|
3
17
|
require 'rubygems'
|
4
18
|
require 'benchmark'
|
5
19
|
require 'lib/xapian_fu'
|
6
20
|
|
7
|
-
|
8
|
-
|
21
|
+
order = nil
|
22
|
+
reverse = false
|
23
|
+
if ARGV.delete('--order-by-filesize')
|
24
|
+
order = :filesize
|
25
|
+
reverse = true
|
26
|
+
end
|
27
|
+
query = ARGV.join(" ")
|
28
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db', :fields => [:text, :filesize, :filename])
|
29
|
+
puts "Xapian Database has #{db.size} docs in total"
|
30
|
+
puts "Largest filesize recorded is #{db.documents.max(:filesize).values[:filesize].to_i / 1024}k"
|
31
|
+
puts "Searching for '#{query}'"
|
9
32
|
results = nil
|
10
|
-
bm = Benchmark.measure
|
11
|
-
|
33
|
+
bm = Benchmark.measure do
|
34
|
+
results = db.search(query, :order => order, :reverse => reverse)
|
35
|
+
end
|
36
|
+
puts "Returned #{results.size} of #{results.total_entries} total hits"
|
37
|
+
puts "Weight\tFilename\tFilesize"
|
12
38
|
results.each do |result|
|
13
|
-
|
39
|
+
filename = result.values[:filename]
|
40
|
+
filesize = result.values[:filesize].to_i / 1024
|
41
|
+
puts "%.2f\t%s\t%ik" % [result.weight, filename, filesize]
|
14
42
|
end
|
15
|
-
puts "Search took %.5f seconds" % bm.
|
43
|
+
puts "Search took %.5f seconds" % bm.real
|
16
44
|
|
data/examples/spider.rb
CHANGED
@@ -1,28 +1,57 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
+
#
|
3
|
+
# Example file spider using XapianFu. Overwrites the index on each run (./spider.db)
|
4
|
+
#
|
5
|
+
# spider.rb /path/to/index
|
2
6
|
|
3
7
|
require 'rubygems'
|
4
8
|
require 'benchmark'
|
5
9
|
require 'lib/xapian_fu'
|
6
10
|
|
7
|
-
db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => :filename,
|
11
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => [:filename, :filesize],
|
8
12
|
:overwrite => true)
|
9
13
|
|
10
14
|
base_path = ARGV[0] || '.'
|
11
15
|
|
12
|
-
|
16
|
+
index_queue = [base_path]
|
17
|
+
total_file_count = 0
|
13
18
|
indexing_time = 0.0
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
19
|
+
STDERR.write "Indexing\n"
|
20
|
+
while dir = index_queue.shift
|
21
|
+
STDERR.write " - #{dir}: "
|
22
|
+
file_count = 0
|
23
|
+
file_data = 0
|
24
|
+
Dir.foreach(dir) do |filename|
|
25
|
+
# skip . and ..
|
26
|
+
next if filename =~ /^[.]{1,2}$/
|
27
|
+
filename = File.join(dir, filename)
|
28
|
+
# Put any directories we find onto the queue for indexing
|
29
|
+
if File.directory?(filename)
|
30
|
+
index_queue << filename
|
31
|
+
next
|
32
|
+
end
|
33
|
+
next unless File.file?(filename)
|
34
|
+
next unless filename =~ /(txt|doc|README|c|h|pl|sh|rb|py|note|xml)$/i
|
35
|
+
file_count += 1
|
36
|
+
|
37
|
+
# Read the first 10k of data
|
38
|
+
text = File.open(filename) { |f| f.read(10 * 1024) }
|
39
|
+
file_data += text.size
|
40
|
+
# Index the data, filename and filesize
|
41
|
+
bm = Benchmark.measure do
|
42
|
+
db << {
|
43
|
+
:text => text,
|
44
|
+
:filename => filename,
|
45
|
+
:filesize => File.size(filename)
|
46
|
+
}
|
47
|
+
end
|
48
|
+
indexing_time += bm.real
|
22
49
|
end
|
23
|
-
|
24
|
-
|
25
|
-
break if docs == 10000
|
50
|
+
STDERR.write("#{file_data / 1024}k in #{file_count} files\n")
|
51
|
+
total_file_count += file_count
|
26
52
|
end
|
27
|
-
|
28
|
-
|
53
|
+
|
54
|
+
files_per_second = (total_file_count / indexing_time).round
|
55
|
+
puts "#{total_file_count} files indexed in #{indexing_time.round} seconds (#{files_per_second} per second)"
|
56
|
+
flush_time = Benchmark.measure { db.flush }.real
|
57
|
+
puts "Flush to disk took #{flush_time.round} seconds"
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module XapianFu #:nodoc:
|
2
|
+
|
3
|
+
# The XapianFu::QueryParser is responsible for building useful
|
4
|
+
# Xapian::QueryParser objects.
|
5
|
+
#
|
6
|
+
# The <tt>:fields</tt> option specifies the fields allowed in the
|
7
|
+
# query. Setting <tt>:fields => [:name, :city]</tt> would allow
|
8
|
+
# searches such as <tt>"name:john city:Leeds"</tt> (assuming those
|
9
|
+
# fields were in the document when it was added to the database.)
|
10
|
+
# This option takes an array of symbols or strings representing the
|
11
|
+
# field names.
|
12
|
+
#
|
13
|
+
# The <tt>:database</tt> option specifies the XapianFu::XapianDb,
|
14
|
+
# necessary for calculating spelling corrections. The database's
|
15
|
+
# stemmer, stopper and field list will also be used.
|
16
|
+
#
|
17
|
+
# The <tt>:default_op</tt> option specifies the search operator to
|
18
|
+
# be used when not specified. It takes the operations <tt>:or</tt>,
|
19
|
+
# <tt>:phrase</tt>, <tt>:and</tt> and <tt>:and_maybe</tt>. The
|
20
|
+
# default is <tt>:and</tt>. So for example, with the <tt>:or</tt>
|
21
|
+
# operation, a query <tt>"dog cat rabbit"</tt> will be parsed as
|
22
|
+
# <tt>"dog OR cat OR rabbit"</tt>.
|
23
|
+
#
|
24
|
+
# The <tt>:stemming_strategy</tt> option specifies how terms in the
|
25
|
+
# query should be stemmed. It accepts <tt>:some</tt>, <tt>:all</tt>
|
26
|
+
# or <tt>:none</tt>. The default is <tt>:some</tt> which is best
|
27
|
+
# for most situations. See the Xapian documentation for more
|
28
|
+
# details.
|
29
|
+
#
|
30
|
+
# The <tt>:boolean</tt> option enables or disables boolean
|
31
|
+
# queries. Set to true or false.
|
32
|
+
#
|
33
|
+
# The <tt>:boolean_anycase</tt> option enables or disables
|
34
|
+
# case-insensitive boolean queries. Set to true or false.
|
35
|
+
#
|
36
|
+
# The <tt>:wildcards</tt> option enables or disables the use of
|
37
|
+
# wildcard terms in queries, such as <tt>"york*"</tt>. Set to true or false.
|
38
|
+
#
|
39
|
+
# The <tt>:lovehate</tt> option enables or disables the use of +/-
|
40
|
+
# operators in queries, such as <tt>"+mickey -mouse"</tt>. Set to true or
|
41
|
+
# false.
|
42
|
+
#
|
43
|
+
# The <tt>:spelling</tt> option enables or disables spelling
|
44
|
+
# correction on queries. Set to true or false. Requires the
|
45
|
+
# <tt>:database</tt> option.
|
46
|
+
#
|
47
|
+
# The <tt>:pure_not</tt> option enables or disables the use of
|
48
|
+
# queries that only exclude terms, such as <tt>"NOT apples"</tt>. Set to true
|
49
|
+
# or false.
|
50
|
+
#
|
51
|
+
class QueryParser #:notnew:
|
52
|
+
|
53
|
+
# The stemming strategy to use when generating terms from a query.
|
54
|
+
# Defaults to <tt>:some</tt>
|
55
|
+
attr_accessor :stemming_strategy
|
56
|
+
|
57
|
+
# The default operation when combining search terms. Defaults to
|
58
|
+
# <tt>:and</tt>
|
59
|
+
attr_accessor :default_op
|
60
|
+
|
61
|
+
# The database that this query is against, used for setting up
|
62
|
+
# fields, stemming, stopping and spelling.
|
63
|
+
attr_accessor :database
|
64
|
+
|
65
|
+
def initialize(options = { })
|
66
|
+
@options = {
|
67
|
+
:stemming_strategy => :some,
|
68
|
+
:default_op => :and
|
69
|
+
}.merge(options)
|
70
|
+
self.stemming_strategy = @options[:stemming_strategy]
|
71
|
+
self.default_op = @options[:default_op]
|
72
|
+
self.database = @options[:database]
|
73
|
+
end
|
74
|
+
|
75
|
+
# Parse the given query string and return a Xapian::Query object
|
76
|
+
def parse_query(q)
|
77
|
+
query_parser.parse_query(q, xapian_flags)
|
78
|
+
end
|
79
|
+
|
80
|
+
# Return the query string with any spelling corrections made
|
81
|
+
def corrected_query
|
82
|
+
query_parser.get_corrected_query_string
|
83
|
+
end
|
84
|
+
|
85
|
+
# The current Xapian::QueryParser object
|
86
|
+
def query_parser
|
87
|
+
if @query_parser
|
88
|
+
@query_parser
|
89
|
+
else
|
90
|
+
qp = Xapian::QueryParser.new
|
91
|
+
qp.database = xapian_database if xapian_database
|
92
|
+
qp.stopper = database.stopper if database
|
93
|
+
qp.stemmer = database.stemmer if database
|
94
|
+
qp.default_op = xapian_default_op
|
95
|
+
qp.stemming_strategy = xapian_stemming_strategy
|
96
|
+
fields.each do |name, type|
|
97
|
+
qp.add_prefix(name.to_s.downcase, "X" + name.to_s.upcase)
|
98
|
+
end
|
99
|
+
@query_parser = qp
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
# The Xapian::QueryParser constant for this parser's stemming strategy
|
104
|
+
def xapian_stemming_strategy
|
105
|
+
case stemming_strategy
|
106
|
+
when :all
|
107
|
+
Xapian::QueryParser::STEM_ALL
|
108
|
+
when :some
|
109
|
+
Xapian::QueryParser::STEM_SOME
|
110
|
+
when :none
|
111
|
+
when false
|
112
|
+
when nil
|
113
|
+
Xapian::QueryParser::STEM_NONE
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
# Return an array of symbols representing the flags set for this
|
118
|
+
# query parser
|
119
|
+
def flags
|
120
|
+
if @flags
|
121
|
+
@flags
|
122
|
+
else
|
123
|
+
valid_flags = [:boolean, :boolean_anycase, :wildcards, :lovehate, :spelling, :pure_not]
|
124
|
+
@flags = valid_flags.delete_if { |vf| not @options[vf] }
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
# Return a Xapian::QueryParser flag mask representing the flags
|
129
|
+
# set for this query parser
|
130
|
+
def xapian_flags
|
131
|
+
qflags = 0
|
132
|
+
qflags |= Xapian::QueryParser::FLAG_BOOLEAN if flags.include?(:boolean)
|
133
|
+
qflags |= Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE if flags.include?(:boolean_anycase)
|
134
|
+
qflags |= Xapian::QueryParser::FLAG_WILDCARD if flags.include?(:wildcards)
|
135
|
+
qflags |= Xapian::QueryParser::FLAG_LOVEHATE if flags.include?(:lovehate)
|
136
|
+
qflags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION if flags.include?(:spelling)
|
137
|
+
qflags |= Xapian::QueryParser::FLAG_PURE_NOT if flags.include?(:pure_not)
|
138
|
+
qflags
|
139
|
+
end
|
140
|
+
|
141
|
+
# Return a Xapian::Query constant for this query parser's default
|
142
|
+
# operation
|
143
|
+
def xapian_default_op
|
144
|
+
case default_op
|
145
|
+
when :and_maybe
|
146
|
+
Xapian::Query::OP_AND_MAYBE
|
147
|
+
when :or
|
148
|
+
Xapian::Query::OP_OR
|
149
|
+
when :phrase
|
150
|
+
Xapian::Query::OP_PHRASE
|
151
|
+
when :and
|
152
|
+
Xapian::Query::OP_AND
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
# Return the available Xapian::Database for use in the query
|
157
|
+
# parser
|
158
|
+
def xapian_database
|
159
|
+
if database.is_a? XapianFu::XapianDb
|
160
|
+
database.ro
|
161
|
+
elsif database.is_a? Xapian::Database
|
162
|
+
database
|
163
|
+
else
|
164
|
+
nil
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# An array of field names that will be recognised in this query
|
169
|
+
def fields
|
170
|
+
if @options[:fields].is_a? Array
|
171
|
+
@options[:fields]
|
172
|
+
elsif database.is_a? XapianFu::XapianDb
|
173
|
+
database.fields
|
174
|
+
else
|
175
|
+
[]
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module XapianFu
|
2
|
+
# A XapianFu::ResultSet holds the XapianDoc objects returned from a search.
|
3
|
+
# It acts just like an array but is decorated with useful attributes.
|
4
|
+
class ResultSet < Array
|
5
|
+
|
6
|
+
# The Xapian match set for this search
|
7
|
+
attr_reader :mset
|
8
|
+
attr_reader :current_page, :per_page
|
9
|
+
# The total number of pages of results available for this search
|
10
|
+
attr_reader :total_pages
|
11
|
+
# If any spelling corrections were detected, the full corrected query is provided
|
12
|
+
# by :corrected_query, otherwise this is empty.
|
13
|
+
attr_reader :corrected_query
|
14
|
+
|
15
|
+
# nodoc
|
16
|
+
def initialize(options = { })
|
17
|
+
@mset = options[:mset]
|
18
|
+
@current_page = options[:current_page]
|
19
|
+
@per_page = options[:per_page]
|
20
|
+
@corrected_query = options[:corrected_query]
|
21
|
+
concat mset.matches.collect { |m| XapianDoc.new(m) }
|
22
|
+
end
|
23
|
+
|
24
|
+
# The estimated total number of matches this search could return
|
25
|
+
def total_entries
|
26
|
+
mset.matches_estimated
|
27
|
+
end
|
28
|
+
|
29
|
+
# The estimated total number of pages of results this search could return
|
30
|
+
def total_pages
|
31
|
+
(total_entries / per_page.to_f).round
|
32
|
+
end
|
33
|
+
|
34
|
+
# The previous page number, or nil if there are no previous pages available
|
35
|
+
def previous_page
|
36
|
+
p = current_page - 1
|
37
|
+
p == 0 ? nil : p
|
38
|
+
end
|
39
|
+
|
40
|
+
# The next page number, or nil if there are no more pages available
|
41
|
+
def next_page
|
42
|
+
p = current_page + 1
|
43
|
+
p > total_pages ? nil : p
|
44
|
+
end
|
45
|
+
|
46
|
+
# The offset within the total results of the first result in this page
|
47
|
+
def offset
|
48
|
+
(current_page - 1) * per_page
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module XapianFu
|
2
|
+
class UnsupportedStopperLanguage < XapianFuError ; end
|
3
|
+
|
4
|
+
class StopperFactory
|
5
|
+
@stoppers = { }
|
6
|
+
|
7
|
+
# Return a SimpleStopper loaded with stop words for the given language
|
8
|
+
def self.stopper_for(lang)
|
9
|
+
if lang.is_a? Xapian::Stopper
|
10
|
+
lang
|
11
|
+
else
|
12
|
+
lang = lang.to_s.downcase.strip
|
13
|
+
if @stoppers[lang]
|
14
|
+
@stoppers[lang]
|
15
|
+
else
|
16
|
+
stopper = Xapian::SimpleStopper.new
|
17
|
+
stop_words_for(lang).each { |word| stopper.add(word) }
|
18
|
+
@stoppers[lang] = stopper
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
# Return the full path to the stop words file for the given language
|
24
|
+
def self.stop_words_filename(lang)
|
25
|
+
File.join(File.dirname(__FILE__), 'stopwords', lang.to_s.downcase + '.txt')
|
26
|
+
end
|
27
|
+
|
28
|
+
# Read and parse the stop words file for the given language, returning an array of words
|
29
|
+
def self.stop_words_for(lang)
|
30
|
+
raise UnsupportedStopperLanguage, lang.to_s unless File.exists?(stop_words_filename(lang))
|
31
|
+
words = []
|
32
|
+
open(stop_words_filename(lang), "r") do |f|
|
33
|
+
while line = f.readline rescue nil
|
34
|
+
words << line.split(" ", 2).first.downcase.strip unless line =~ /^ +|^$|^\|/
|
35
|
+
end
|
36
|
+
end
|
37
|
+
words
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
These stopword lists are from the Snowball library which is covered by the BSD License, with Copyright (c) 2001, Dr Martin Porter, and (for the Java developments) Copyright (c) 2002, Richard Boulton.
|
2
|
+
|
3
|
+
http://snowball.tartarus.org/
|
4
|
+
|
5
|
+
Some have been converted to utf8
|
6
|
+
|
7
|
+
curl http://snowball.tartarus.org/algorithms/russian/stop.txt | iconv -f "KOI8-R" -t utf8 > russian.txt
|