xapian-fu 1.1.2 → 1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +6 -0
- data/examples/spider.rb +3 -3
- data/lib/xapian_fu/query_parser.rb +2 -1
- data/lib/xapian_fu/stopper_factory.rb +4 -1
- data/lib/xapian_fu/xapian_db.rb +13 -0
- data/spec/query_parser_spec.rb +1 -1
- data/spec/stopper_factory_spec.rb +3 -3
- data/spec/xapian_db_spec.rb +34 -12
- data/spec/xapian_doc_spec.rb +13 -13
- data/spec/xapian_doc_value_accessor_spec.rb +11 -11
- metadata +23 -10
data/CHANGELOG.rdoc
CHANGED
data/examples/spider.rb
CHANGED
@@ -8,7 +8,7 @@ require 'rubygems'
|
|
8
8
|
require 'benchmark'
|
9
9
|
require 'lib/xapian_fu'
|
10
10
|
|
11
|
-
db = XapianFu::XapianDb.new(:store => [:filename, :filesize],
|
11
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => [:filename, :filesize],
|
12
12
|
:overwrite => true)
|
13
13
|
|
14
14
|
base_path = ARGV[0] || '.'
|
@@ -31,12 +31,12 @@ while dir = index_queue.shift
|
|
31
31
|
next
|
32
32
|
end
|
33
33
|
next unless File.file?(filename)
|
34
|
-
next unless filename =~ /(txt|doc|README|c|
|
34
|
+
next unless filename =~ /(txt|doc|README|c|h|pl|sh|rb|py|note|xml)$/i
|
35
35
|
file_count += 1
|
36
36
|
|
37
37
|
# Read the first 10k of data
|
38
38
|
text = File.open(filename) { |f| f.read(10 * 1024) }
|
39
|
-
file_data += text.
|
39
|
+
file_data += text.size
|
40
40
|
# Index the data, filename and filesize
|
41
41
|
bm = Benchmark.measure do
|
42
42
|
db << {
|
data/lib/xapian_fu/query_parser.rb
CHANGED
@@ -120,7 +120,7 @@ module XapianFu #:nodoc:
|
|
120
120
|
if @flags
|
121
121
|
@flags
|
122
122
|
else
|
123
|
-
valid_flags = [:boolean, :boolean_anycase, :wildcards, :lovehate, :spelling, :pure_not]
|
123
|
+
valid_flags = [:boolean, :boolean_anycase, :wildcards, :lovehate, :spelling, :pure_not, :synonyms]
|
124
124
|
@flags = valid_flags.delete_if { |vf| not @options[vf] }
|
125
125
|
end
|
126
126
|
end
|
@@ -135,6 +135,7 @@ module XapianFu #:nodoc:
|
|
135
135
|
qflags |= Xapian::QueryParser::FLAG_LOVEHATE if flags.include?(:lovehate)
|
136
136
|
qflags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION if flags.include?(:spelling)
|
137
137
|
qflags |= Xapian::QueryParser::FLAG_PURE_NOT if flags.include?(:pure_not)
|
138
|
+
qflags |= Xapian::QueryParser::FLAG_AUTO_SYNONYMS if flags.include?(:synonyms)
|
138
139
|
qflags
|
139
140
|
end
|
140
141
|
|
data/lib/xapian_fu/stopper_factory.rb
CHANGED
@@ -32,7 +32,10 @@ module XapianFu
|
|
32
32
|
def self.stop_words_for(lang)
|
33
33
|
raise UnsupportedStopperLanguage, lang.to_s unless File.exists?(stop_words_filename(lang))
|
34
34
|
words = []
|
35
|
-
|
35
|
+
# Open files with correct encoding in Ruby 1.9
|
36
|
+
open_args = [stop_words_filename(lang), "r"]
|
37
|
+
open_args << { :encoding => "UTF-8" } if String.new.respond_to? :encoding
|
38
|
+
open(*open_args) do |f|
|
36
39
|
while line = f.readline rescue nil
|
37
40
|
words << line.split(" ", 2).first.downcase.strip unless line =~ /^ +|^$|^\|/
|
38
41
|
end
|
data/lib/xapian_fu/xapian_db.rb
CHANGED
@@ -170,6 +170,19 @@ module XapianFu #:nodoc:
|
|
170
170
|
end
|
171
171
|
alias_method "<<", :add_doc
|
172
172
|
|
173
|
+
# Add a synonym to the database.
|
174
|
+
#
|
175
|
+
# If you want to search with synonym support, remember to add
|
176
|
+
# the option:
|
177
|
+
#
|
178
|
+
# db.search("foo", :synonyms => true)
|
179
|
+
#
|
180
|
+
# Note that in-memory databases don't support synonyms.
|
181
|
+
#
|
182
|
+
def add_synonym(term, synonym)
|
183
|
+
rw.add_synonym(term, synonym)
|
184
|
+
end
|
185
|
+
|
173
186
|
# Conduct a search on the Xapian database, returning an array of
|
174
187
|
# XapianFu::XapianDoc objects for the matches wrapped in a
|
175
188
|
# XapianFu::ResultSet.
|
data/spec/query_parser_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'xapian'
|
3
|
-
require 'lib/xapian_fu.rb'
|
3
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
4
4
|
include XapianFu
|
5
5
|
require 'fileutils'
|
6
6
|
|
data/spec/stopper_factory_spec.rb
CHANGED
@@ -33,7 +33,7 @@ describe StopperFactory do
|
|
33
33
|
words.should include 'and'
|
34
34
|
words.should include "they're"
|
35
35
|
end
|
36
|
-
|
36
|
+
|
37
37
|
%w(danish dutch english finnish french german hungarian italian norwegian portuguese russian spanish swedish).each do |lang|
|
38
38
|
describe lang do
|
39
39
|
it "should return an array of words" do
|
@@ -49,7 +49,7 @@ describe StopperFactory do
|
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
|
-
|
52
|
+
|
53
53
|
it "should raise a UnsupportedStopperLanguage error if there is no data for the given language" do
|
54
54
|
Proc.new { StopperFactory.stop_words_for(:no_existy) }.should raise_error UnsupportedStopperLanguage
|
55
55
|
end
|
data/spec/xapian_db_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'xapian'
|
2
|
-
require 'lib/xapian_fu.rb'
|
2
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
3
3
|
include XapianFu
|
4
4
|
require 'fileutils'
|
5
5
|
require 'date'
|
@@ -28,14 +28,14 @@ describe XapianDb do
|
|
28
28
|
xdb.rw.should be_a_kind_of(Xapian::WritableDatabase)
|
29
29
|
xdb.ro.should be_a_kind_of(Xapian::Database)
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should lazily create the on-disk database when rw is used" do
|
35
35
|
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
36
36
|
File.exists?(tmp_dir).should be_false
|
37
37
|
xdb.rw
|
38
|
-
File.exists?(tmp_dir).should be_true
|
38
|
+
File.exists?(tmp_dir).should be_true
|
39
39
|
end
|
40
40
|
|
41
41
|
it "should flush documents to the index when flush is called" do
|
@@ -201,7 +201,7 @@ describe XapianDb do
|
|
201
201
|
xdb << XapianDoc.new("once upon a time")
|
202
202
|
xdb.size.should == 2
|
203
203
|
end
|
204
|
-
|
204
|
+
|
205
205
|
end
|
206
206
|
|
207
207
|
describe "search" do
|
@@ -277,10 +277,10 @@ describe XapianDb do
|
|
277
277
|
it "should provide a corrected spelling string by default" do
|
278
278
|
xdb = XapianDb.new(:dir => tmp_dir + 'corrected_spelling', :create => true,
|
279
279
|
:overwrite => true)
|
280
|
-
xdb << "there is a mouse in this
|
280
|
+
xdb << "there is a mouse in this building"
|
281
281
|
xdb.flush
|
282
|
-
results = xdb.search("there was a moose at our
|
283
|
-
results.corrected_query.should == "there was a mouse at our
|
282
|
+
results = xdb.search("there was a moose at our building")
|
283
|
+
results.corrected_query.should == "there was a mouse at our building"
|
284
284
|
end
|
285
285
|
|
286
286
|
it "should not provide corrected spellings when disabled" do
|
@@ -293,11 +293,17 @@ describe XapianDb do
|
|
293
293
|
end
|
294
294
|
|
295
295
|
|
296
|
-
it "should do phrase matching by default when then :default_op option is :phrase"
|
296
|
+
it "should do phrase matching by default when then :default_op option is :phrase" do
|
297
|
+
pending
|
298
|
+
end
|
297
299
|
|
298
|
-
it "should do AND_MAYBE matching by default when the :default_op option is :and_maybe"
|
300
|
+
it "should do AND_MAYBE matching by default when the :default_op option is :and_maybe" do
|
301
|
+
pending
|
302
|
+
end
|
299
303
|
|
300
|
-
it "should do PURE_NOT matching by default when the :default_op option is :pure_not"
|
304
|
+
it "should do PURE_NOT matching by default when the :default_op option is :pure_not" do
|
305
|
+
pending
|
306
|
+
end
|
301
307
|
|
302
308
|
it "should page results when given the :page and :per_page options" do
|
303
309
|
xdb = XapianDb.new
|
@@ -356,6 +362,22 @@ describe XapianDb do
|
|
356
362
|
xdb.search("john").should == [john,katherine,louisa]
|
357
363
|
xdb.search("john -name:john").should == [katherine,louisa]
|
358
364
|
end
|
365
|
+
|
366
|
+
it "should recognize synonyms" do
|
367
|
+
xdb = XapianDb.new(:dir => tmp_dir + 'synonyms', :create => true,
|
368
|
+
:fields => [:name], :overwrite => true)
|
369
|
+
|
370
|
+
xdb << {:name => "john"}
|
371
|
+
xdb.flush
|
372
|
+
|
373
|
+
xdb.search("jon", :synonyms => true).should be_empty
|
374
|
+
|
375
|
+
xdb.add_synonym("jon", "john")
|
376
|
+
xdb.flush
|
377
|
+
|
378
|
+
xdb.search("jon").should be_empty
|
379
|
+
xdb.search("jon", :synonyms => true).should_not be_empty
|
380
|
+
end
|
359
381
|
end
|
360
382
|
|
361
383
|
describe "add_doc" do
|
@@ -533,9 +555,9 @@ describe XapianDb do
|
|
533
555
|
xdb = XapianDb.new(:fields => { :name => String, :title => String })
|
534
556
|
xdb.unindexed_fields.should == []
|
535
557
|
end
|
536
|
-
|
558
|
+
|
537
559
|
it "should return fields defined as not indexed in the fields option" do
|
538
|
-
xdb = XapianDb.new(:fields => {
|
560
|
+
xdb = XapianDb.new(:fields => {
|
539
561
|
:name => { :type => String, :index => false },
|
540
562
|
:title => String })
|
541
563
|
xdb.unindexed_fields.should include :name
|
data/spec/xapian_doc_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'xapian'
|
3
|
-
require 'lib/xapian_fu.rb'
|
3
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
4
4
|
include XapianFu
|
5
5
|
require 'fileutils'
|
6
6
|
|
@@ -9,7 +9,7 @@ describe XapianDoc do
|
|
9
9
|
it "should be equal to other XapianDoc objects with the same id" do
|
10
10
|
XapianDoc.new(:id => 666).should == XapianDoc.new(:id => 666)
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
it "should not be equal to other XapianDoc objects with different ids" do
|
14
14
|
XapianDoc.new(:id => 666).should_not == XapianDoc.new(:id => 667)
|
15
15
|
end
|
@@ -44,7 +44,7 @@ describe XapianDoc do
|
|
44
44
|
xdoc.terms.last.should be_a_kind_of Xapian::Term
|
45
45
|
xdoc.terms.last.term.should == "upon"
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
it "should tokenize the fields of a hash separately" do
|
49
49
|
xdb = XapianDb.new
|
50
50
|
xdoc = xdb.documents.new({ :text => "once upon a time", :title => "A story" }).to_xapian_document
|
@@ -73,7 +73,7 @@ describe XapianDoc do
|
|
73
73
|
terms = xdoc.terms.collect { |t| t.term }
|
74
74
|
terms.should include time.utc.strftime("%Y%m%d%H%M%S")
|
75
75
|
end
|
76
|
-
|
76
|
+
|
77
77
|
it "should convert DateTime instances to a useful format when tokenizing" do
|
78
78
|
datetime = DateTime.now
|
79
79
|
xdb = XapianDb.new
|
@@ -118,8 +118,8 @@ describe XapianDoc do
|
|
118
118
|
it "should stem #{lang.to_s.capitalize} words when the :stemmer option is set to :#{lang}" do
|
119
119
|
xdb = XapianDb.new
|
120
120
|
xdoc = xdb.documents.new(word, :stemmer => lang).to_xapian_document
|
121
|
-
terms = xdoc.terms.collect { |t| t.term }
|
122
|
-
terms.should include 'Z'+stem
|
121
|
+
terms = xdoc.terms.collect { |t| t.term.respond_to?(:force_encoding) ? t.term.force_encoding("UTF-8") : t.term }
|
122
|
+
terms.should include 'Z' + stem
|
123
123
|
end
|
124
124
|
end
|
125
125
|
end
|
@@ -130,7 +130,7 @@ describe XapianDoc do
|
|
130
130
|
terms = xdoc.terms.collect { |t| t.term if t.term =~ /^Z/ }.compact
|
131
131
|
terms.should be_empty
|
132
132
|
end
|
133
|
-
|
133
|
+
|
134
134
|
it "should not stem english stop words by default" do
|
135
135
|
xdb = XapianDb.new
|
136
136
|
xdoc = xdb.documents.new("And they made a cake", :stemmer => :english).to_xapian_document
|
@@ -139,14 +139,14 @@ describe XapianDoc do
|
|
139
139
|
terms.should_not include 'Za'
|
140
140
|
terms.should include 'Zcake'
|
141
141
|
end
|
142
|
-
|
142
|
+
|
143
143
|
it "should allow setting the stopper on initialisation" do
|
144
144
|
xdb = XapianDb.new(:stopper => :english)
|
145
145
|
xdoc = xdb.documents.new("And they made a cake", :stopper => :french)
|
146
146
|
xdoc.stopper.call("ayantes").should == true
|
147
147
|
xdoc.stopper.call("and").should == false
|
148
148
|
end
|
149
|
-
|
149
|
+
|
150
150
|
it "should not stop words when stopper is set to false" do
|
151
151
|
xdb = XapianDb.new
|
152
152
|
xdoc = xdb.documents.new("And they made a cake", :stopper => false).to_xapian_document
|
@@ -160,7 +160,7 @@ describe XapianDoc do
|
|
160
160
|
terms = xdoc.terms.collect { |t| t.term }
|
161
161
|
terms.should_not include 'Zи'
|
162
162
|
terms.should_not include 'Zони'
|
163
|
-
terms.should include 'Zcake'
|
163
|
+
terms.should include 'Zcake'
|
164
164
|
end
|
165
165
|
end
|
166
166
|
|
@@ -180,7 +180,7 @@ describe XapianDoc do
|
|
180
180
|
xdoc = xdb.documents.new("stink and bones", :language => :english, :stemmer => :french)
|
181
181
|
xdoc.stemmer.call("contournait").should == "contourn"
|
182
182
|
end
|
183
|
-
|
183
|
+
|
184
184
|
end
|
185
185
|
|
186
186
|
describe "stopper" do
|
@@ -188,7 +188,7 @@ describe XapianDoc do
|
|
188
188
|
xdb = XapianDb.new(:language => :french)
|
189
189
|
xdoc = xdb.documents.new("stink and bones")
|
190
190
|
xdoc.stopper.call("avec").should == true
|
191
|
-
end
|
191
|
+
end
|
192
192
|
it "should return a stopper for the document language, overriding the db" do
|
193
193
|
xdb = XapianDb.new(:language => :english)
|
194
194
|
xdoc = xdb.documents.new("stink and bones", :language => :french)
|
@@ -198,7 +198,7 @@ describe XapianDoc do
|
|
198
198
|
xdb = XapianDb.new(:language => :german)
|
199
199
|
xdoc = xdb.documents.new("stink and bones", :language => :english, :stopper => :french)
|
200
200
|
xdoc.stopper.call("avec").should == true
|
201
|
-
end
|
201
|
+
end
|
202
202
|
end
|
203
203
|
|
204
204
|
end
|
data/spec/xapian_doc_value_accessor_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'xapian'
|
2
|
-
require 'lib/xapian_fu.rb'
|
2
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
3
3
|
include XapianFu
|
4
4
|
require 'fileutils'
|
5
5
|
require 'fixtures/film_data'
|
@@ -34,12 +34,12 @@ describe XapianDocValueAccessor do
|
|
34
34
|
doc.values.fetch(:number, Fixnum).should == number
|
35
35
|
doc.to_xapian_document.values.first.value.should == [number].pack("G")
|
36
36
|
end
|
37
|
-
end
|
37
|
+
end
|
38
38
|
|
39
39
|
it "should store fields defined as Bignum as packed double-precision float, network byte order" do
|
40
40
|
xdb = XapianDb.new(:fields => { :number => { :type => Bignum, :store => true } })
|
41
41
|
[
|
42
|
-
(-0x1fffffffffffff..-0x1fffffffffffff + 10).to_a,
|
42
|
+
(-0x1fffffffffffff..-0x1fffffffffffff + 10).to_a,
|
43
43
|
(0x1fffffffffffff-10..0x1fffffffffffff).to_a
|
44
44
|
].flatten.each do |number|
|
45
45
|
doc = xdb.documents.new(:number => number)
|
@@ -56,7 +56,7 @@ describe XapianDocValueAccessor do
|
|
56
56
|
lambda { doc.values.store(:number, number, Bignum) }.should raise_error XapianFu::ValueOutOfBounds
|
57
57
|
end
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
it "should store fields defined as Float as packed double-precision float, network byte order" do
|
61
61
|
xdb = XapianDb.new(:fields => { :number => { :type => Float, :store => true } })
|
62
62
|
[-0.303393984588383833, 8.448488388488384, 1.0].each do |number|
|
@@ -64,9 +64,9 @@ describe XapianDocValueAccessor do
|
|
64
64
|
doc.values.store(:number, number).should == number
|
65
65
|
doc.values.fetch(:number).should == number
|
66
66
|
doc.to_xapian_document.values.first.value.should == [number].pack("G")
|
67
|
-
end
|
67
|
+
end
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
it "should store fields defined as Time in UTC as packed double-precision float, network byte order" do
|
71
71
|
xdb = XapianDb.new(:fields => { :created_at => { :type => Time, :store => true }})
|
72
72
|
time = Time.now
|
@@ -110,22 +110,22 @@ describe XapianDocValueAccessor do
|
|
110
110
|
film_data_path = File.join(File.dirname(__FILE__), "fixtures/film_data")
|
111
111
|
Dir.foreach(film_data_path) do |db_path|
|
112
112
|
next unless db_path =~ /.+~.+/
|
113
|
-
it "should read stored values from databases created by #{db_path}" do
|
113
|
+
it "should read stored values from databases created by #{db_path}" do
|
114
114
|
db = XapianDb.new(:dir => File.join(film_data_path, db_path),
|
115
|
-
:fields => {
|
115
|
+
:fields => {
|
116
116
|
:title => { :type => String, :store => true },
|
117
|
-
:released_on => { :type => Date, :store => true },
|
117
|
+
:released_on => { :type => Date, :store => true },
|
118
118
|
:revenue => { :type => Integer, :store => true }
|
119
119
|
})
|
120
120
|
FILM_DATA.size.times do |i|
|
121
121
|
doc = db.documents[i+1]
|
122
122
|
[:title, :released_on, :revenue].each do |field|
|
123
|
-
doc.values[field].should === FILM_DATA[i][field]
|
123
|
+
doc.values[field].should === FILM_DATA[i][field]
|
124
124
|
end
|
125
125
|
end
|
126
126
|
end
|
127
127
|
end
|
128
|
-
|
128
|
+
|
129
129
|
end
|
130
130
|
|
131
131
|
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xapian-fu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 11
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 1
|
9
8
|
- 2
|
10
|
-
version: 1.
|
9
|
+
version: "1.2"
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- John Leach
|
@@ -15,10 +14,24 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date:
|
19
|
-
|
20
|
-
|
21
|
-
|
17
|
+
date: 2011-07-05 00:00:00 Z
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: rspec
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - "="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
hash: 27
|
28
|
+
segments:
|
29
|
+
- 1
|
30
|
+
- 3
|
31
|
+
- 0
|
32
|
+
version: 1.3.0
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
22
35
|
description: A library to provide a more Ruby-like interface to the Xapian search engine.
|
23
36
|
email: john@johnleach.co.uk
|
24
37
|
executables: []
|
@@ -120,7 +133,6 @@ files:
|
|
120
133
|
- spec/build_db_for_value_testing.rb
|
121
134
|
- spec/query_parser_spec.rb
|
122
135
|
- spec/spec.opts
|
123
|
-
has_rdoc: true
|
124
136
|
homepage: http://github.com/johnl/xapian-fu
|
125
137
|
licenses: []
|
126
138
|
|
@@ -154,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
166
|
requirements: []
|
155
167
|
|
156
168
|
rubyforge_project: xapian-fu
|
157
|
-
rubygems_version: 1.
|
169
|
+
rubygems_version: 1.7.2
|
158
170
|
signing_key:
|
159
171
|
specification_version: 3
|
160
172
|
summary: A Ruby interface to the Xapian search engine
|
@@ -218,3 +230,4 @@ test_files:
|
|
218
230
|
- spec/build_db_for_value_testing.rb
|
219
231
|
- spec/query_parser_spec.rb
|
220
232
|
- spec/spec.opts
|
233
|
+
has_rdoc:
|