xapian-fu 1.1.2 → 1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +6 -0
- data/examples/spider.rb +3 -3
- data/lib/xapian_fu/query_parser.rb +2 -1
- data/lib/xapian_fu/stopper_factory.rb +4 -1
- data/lib/xapian_fu/xapian_db.rb +13 -0
- data/spec/query_parser_spec.rb +1 -1
- data/spec/stopper_factory_spec.rb +3 -3
- data/spec/xapian_db_spec.rb +34 -12
- data/spec/xapian_doc_spec.rb +13 -13
- data/spec/xapian_doc_value_accessor_spec.rb +11 -11
- metadata +23 -10
data/CHANGELOG.rdoc
CHANGED
data/examples/spider.rb
CHANGED
@@ -8,7 +8,7 @@ require 'rubygems'
|
|
8
8
|
require 'benchmark'
|
9
9
|
require 'lib/xapian_fu'
|
10
10
|
|
11
|
-
db = XapianFu::XapianDb.new(:store => [:filename, :filesize],
|
11
|
+
db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => [:filename, :filesize],
|
12
12
|
:overwrite => true)
|
13
13
|
|
14
14
|
base_path = ARGV[0] || '.'
|
@@ -31,12 +31,12 @@ while dir = index_queue.shift
|
|
31
31
|
next
|
32
32
|
end
|
33
33
|
next unless File.file?(filename)
|
34
|
-
next unless filename =~ /(txt|doc|README|c|
|
34
|
+
next unless filename =~ /(txt|doc|README|c|h|pl|sh|rb|py|note|xml)$/i
|
35
35
|
file_count += 1
|
36
36
|
|
37
37
|
# Read the first 10k of data
|
38
38
|
text = File.open(filename) { |f| f.read(10 * 1024) }
|
39
|
-
file_data += text.
|
39
|
+
file_data += text.size
|
40
40
|
# Index the data, filename and filesize
|
41
41
|
bm = Benchmark.measure do
|
42
42
|
db << {
|
@@ -120,7 +120,7 @@ module XapianFu #:nodoc:
|
|
120
120
|
if @flags
|
121
121
|
@flags
|
122
122
|
else
|
123
|
-
valid_flags = [:boolean, :boolean_anycase, :wildcards, :lovehate, :spelling, :pure_not]
|
123
|
+
valid_flags = [:boolean, :boolean_anycase, :wildcards, :lovehate, :spelling, :pure_not, :synonyms]
|
124
124
|
@flags = valid_flags.delete_if { |vf| not @options[vf] }
|
125
125
|
end
|
126
126
|
end
|
@@ -135,6 +135,7 @@ module XapianFu #:nodoc:
|
|
135
135
|
qflags |= Xapian::QueryParser::FLAG_LOVEHATE if flags.include?(:lovehate)
|
136
136
|
qflags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION if flags.include?(:spelling)
|
137
137
|
qflags |= Xapian::QueryParser::FLAG_PURE_NOT if flags.include?(:pure_not)
|
138
|
+
qflags |= Xapian::QueryParser::FLAG_AUTO_SYNONYMS if flags.include?(:synonyms)
|
138
139
|
qflags
|
139
140
|
end
|
140
141
|
|
@@ -32,7 +32,10 @@ module XapianFu
|
|
32
32
|
def self.stop_words_for(lang)
|
33
33
|
raise UnsupportedStopperLanguage, lang.to_s unless File.exists?(stop_words_filename(lang))
|
34
34
|
words = []
|
35
|
-
|
35
|
+
# Open files with correct encoding in Ruby 1.9
|
36
|
+
open_args = [stop_words_filename(lang), "r"]
|
37
|
+
open_args << { :encoding => "UTF-8" } if String.new.respond_to? :encoding
|
38
|
+
open(*open_args) do |f|
|
36
39
|
while line = f.readline rescue nil
|
37
40
|
words << line.split(" ", 2).first.downcase.strip unless line =~ /^ +|^$|^\|/
|
38
41
|
end
|
data/lib/xapian_fu/xapian_db.rb
CHANGED
@@ -170,6 +170,19 @@ module XapianFu #:nodoc:
|
|
170
170
|
end
|
171
171
|
alias_method "<<", :add_doc
|
172
172
|
|
173
|
+
# Add a synonym to the database.
|
174
|
+
#
|
175
|
+
# If you want to search with synonym support, remember to add
|
176
|
+
# the option:
|
177
|
+
#
|
178
|
+
# db.search("foo", :synonyms => true)
|
179
|
+
#
|
180
|
+
# Note that in-memory databases don't support synonyms.
|
181
|
+
#
|
182
|
+
def add_synonym(term, synonym)
|
183
|
+
rw.add_synonym(term, synonym)
|
184
|
+
end
|
185
|
+
|
173
186
|
# Conduct a search on the Xapian database, returning an array of
|
174
187
|
# XapianFu::XapianDoc objects for the matches wrapped in a
|
175
188
|
# XapianFu::ResultSet.
|
data/spec/query_parser_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'xapian'
|
3
|
-
require 'lib/xapian_fu.rb'
|
3
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
4
4
|
include XapianFu
|
5
5
|
require 'fileutils'
|
6
6
|
|
@@ -33,7 +33,7 @@ describe StopperFactory do
|
|
33
33
|
words.should include 'and'
|
34
34
|
words.should include "they're"
|
35
35
|
end
|
36
|
-
|
36
|
+
|
37
37
|
%w(danish dutch english finnish french german hungarian italian norwegian portuguese russian spanish swedish).each do |lang|
|
38
38
|
describe lang do
|
39
39
|
it "should return an array of words" do
|
@@ -49,7 +49,7 @@ describe StopperFactory do
|
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
|
-
|
52
|
+
|
53
53
|
it "should raise a UnsupportedStopperLanguage error if there is no data for the given language" do
|
54
54
|
Proc.new { StopperFactory.stop_words_for(:no_existy) }.should raise_error UnsupportedStopperLanguage
|
55
55
|
end
|
data/spec/xapian_db_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'xapian'
|
2
|
-
require 'lib/xapian_fu.rb'
|
2
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
3
3
|
include XapianFu
|
4
4
|
require 'fileutils'
|
5
5
|
require 'date'
|
@@ -28,14 +28,14 @@ describe XapianDb do
|
|
28
28
|
xdb.rw.should be_a_kind_of(Xapian::WritableDatabase)
|
29
29
|
xdb.ro.should be_a_kind_of(Xapian::Database)
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should lazily create the on-disk database when rw is used" do
|
35
35
|
xdb = XapianDb.new(:dir => tmp_dir, :create => true)
|
36
36
|
File.exists?(tmp_dir).should be_false
|
37
37
|
xdb.rw
|
38
|
-
File.exists?(tmp_dir).should be_true
|
38
|
+
File.exists?(tmp_dir).should be_true
|
39
39
|
end
|
40
40
|
|
41
41
|
it "should flush documents to the index when flush is called" do
|
@@ -201,7 +201,7 @@ describe XapianDb do
|
|
201
201
|
xdb << XapianDoc.new("once upon a time")
|
202
202
|
xdb.size.should == 2
|
203
203
|
end
|
204
|
-
|
204
|
+
|
205
205
|
end
|
206
206
|
|
207
207
|
describe "search" do
|
@@ -277,10 +277,10 @@ describe XapianDb do
|
|
277
277
|
it "should provide a corrected spelling string by default" do
|
278
278
|
xdb = XapianDb.new(:dir => tmp_dir + 'corrected_spelling', :create => true,
|
279
279
|
:overwrite => true)
|
280
|
-
xdb << "there is a mouse in this
|
280
|
+
xdb << "there is a mouse in this building"
|
281
281
|
xdb.flush
|
282
|
-
results = xdb.search("there was a moose at our
|
283
|
-
results.corrected_query.should == "there was a mouse at our
|
282
|
+
results = xdb.search("there was a moose at our building")
|
283
|
+
results.corrected_query.should == "there was a mouse at our building"
|
284
284
|
end
|
285
285
|
|
286
286
|
it "should not provide corrected spellings when disabled" do
|
@@ -293,11 +293,17 @@ describe XapianDb do
|
|
293
293
|
end
|
294
294
|
|
295
295
|
|
296
|
-
it "should do phrase matching by default when then :default_op option is :phrase"
|
296
|
+
it "should do phrase matching by default when then :default_op option is :phrase" do
|
297
|
+
pending
|
298
|
+
end
|
297
299
|
|
298
|
-
it "should do AND_MAYBE matching by default when the :default_op option is :and_maybe"
|
300
|
+
it "should do AND_MAYBE matching by default when the :default_op option is :and_maybe" do
|
301
|
+
pending
|
302
|
+
end
|
299
303
|
|
300
|
-
it "should do PURE_NOT matching by default when the :default_op option is :pure_not"
|
304
|
+
it "should do PURE_NOT matching by default when the :default_op option is :pure_not" do
|
305
|
+
pending
|
306
|
+
end
|
301
307
|
|
302
308
|
it "should page results when given the :page and :per_page options" do
|
303
309
|
xdb = XapianDb.new
|
@@ -356,6 +362,22 @@ describe XapianDb do
|
|
356
362
|
xdb.search("john").should == [john,katherine,louisa]
|
357
363
|
xdb.search("john -name:john").should == [katherine,louisa]
|
358
364
|
end
|
365
|
+
|
366
|
+
it "should recognize synonyms" do
|
367
|
+
xdb = XapianDb.new(:dir => tmp_dir + 'synonyms', :create => true,
|
368
|
+
:fields => [:name], :overwrite => true)
|
369
|
+
|
370
|
+
xdb << {:name => "john"}
|
371
|
+
xdb.flush
|
372
|
+
|
373
|
+
xdb.search("jon", :synonyms => true).should be_empty
|
374
|
+
|
375
|
+
xdb.add_synonym("jon", "john")
|
376
|
+
xdb.flush
|
377
|
+
|
378
|
+
xdb.search("jon").should be_empty
|
379
|
+
xdb.search("jon", :synonyms => true).should_not be_empty
|
380
|
+
end
|
359
381
|
end
|
360
382
|
|
361
383
|
describe "add_doc" do
|
@@ -533,9 +555,9 @@ describe XapianDb do
|
|
533
555
|
xdb = XapianDb.new(:fields => { :name => String, :title => String })
|
534
556
|
xdb.unindexed_fields.should == []
|
535
557
|
end
|
536
|
-
|
558
|
+
|
537
559
|
it "should return fields defined as not indexed in the fields option" do
|
538
|
-
xdb = XapianDb.new(:fields => {
|
560
|
+
xdb = XapianDb.new(:fields => {
|
539
561
|
:name => { :type => String, :index => false },
|
540
562
|
:title => String })
|
541
563
|
xdb.unindexed_fields.should include :name
|
data/spec/xapian_doc_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'xapian'
|
3
|
-
require 'lib/xapian_fu.rb'
|
3
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
4
4
|
include XapianFu
|
5
5
|
require 'fileutils'
|
6
6
|
|
@@ -9,7 +9,7 @@ describe XapianDoc do
|
|
9
9
|
it "should be equal to other XapianDoc objects with the same id" do
|
10
10
|
XapianDoc.new(:id => 666).should == XapianDoc.new(:id => 666)
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
it "should not be equal to other XapianDoc objects with different ids" do
|
14
14
|
XapianDoc.new(:id => 666).should_not == XapianDoc.new(:id => 667)
|
15
15
|
end
|
@@ -44,7 +44,7 @@ describe XapianDoc do
|
|
44
44
|
xdoc.terms.last.should be_a_kind_of Xapian::Term
|
45
45
|
xdoc.terms.last.term.should == "upon"
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
it "should tokenize the fields of a hash separately" do
|
49
49
|
xdb = XapianDb.new
|
50
50
|
xdoc = xdb.documents.new({ :text => "once upon a time", :title => "A story" }).to_xapian_document
|
@@ -73,7 +73,7 @@ describe XapianDoc do
|
|
73
73
|
terms = xdoc.terms.collect { |t| t.term }
|
74
74
|
terms.should include time.utc.strftime("%Y%m%d%H%M%S")
|
75
75
|
end
|
76
|
-
|
76
|
+
|
77
77
|
it "should convert DateTime instances to a useful format when tokenizing" do
|
78
78
|
datetime = DateTime.now
|
79
79
|
xdb = XapianDb.new
|
@@ -118,8 +118,8 @@ describe XapianDoc do
|
|
118
118
|
it "should stem #{lang.to_s.capitalize} words when the :stemmer option is set to :#{lang}" do
|
119
119
|
xdb = XapianDb.new
|
120
120
|
xdoc = xdb.documents.new(word, :stemmer => lang).to_xapian_document
|
121
|
-
terms = xdoc.terms.collect { |t| t.term }
|
122
|
-
terms.should include 'Z'+stem
|
121
|
+
terms = xdoc.terms.collect { |t| t.term.respond_to?(:force_encoding) ? t.term.force_encoding("UTF-8") : t.term }
|
122
|
+
terms.should include 'Z' + stem
|
123
123
|
end
|
124
124
|
end
|
125
125
|
end
|
@@ -130,7 +130,7 @@ describe XapianDoc do
|
|
130
130
|
terms = xdoc.terms.collect { |t| t.term if t.term =~ /^Z/ }.compact
|
131
131
|
terms.should be_empty
|
132
132
|
end
|
133
|
-
|
133
|
+
|
134
134
|
it "should not stem english stop words by default" do
|
135
135
|
xdb = XapianDb.new
|
136
136
|
xdoc = xdb.documents.new("And they made a cake", :stemmer => :english).to_xapian_document
|
@@ -139,14 +139,14 @@ describe XapianDoc do
|
|
139
139
|
terms.should_not include 'Za'
|
140
140
|
terms.should include 'Zcake'
|
141
141
|
end
|
142
|
-
|
142
|
+
|
143
143
|
it "should allow setting the stopper on initialisation" do
|
144
144
|
xdb = XapianDb.new(:stopper => :english)
|
145
145
|
xdoc = xdb.documents.new("And they made a cake", :stopper => :french)
|
146
146
|
xdoc.stopper.call("ayantes").should == true
|
147
147
|
xdoc.stopper.call("and").should == false
|
148
148
|
end
|
149
|
-
|
149
|
+
|
150
150
|
it "should not stop words when stopper is set to false" do
|
151
151
|
xdb = XapianDb.new
|
152
152
|
xdoc = xdb.documents.new("And they made a cake", :stopper => false).to_xapian_document
|
@@ -160,7 +160,7 @@ describe XapianDoc do
|
|
160
160
|
terms = xdoc.terms.collect { |t| t.term }
|
161
161
|
terms.should_not include 'Zи'
|
162
162
|
terms.should_not include 'Zони'
|
163
|
-
terms.should include 'Zcake'
|
163
|
+
terms.should include 'Zcake'
|
164
164
|
end
|
165
165
|
end
|
166
166
|
|
@@ -180,7 +180,7 @@ describe XapianDoc do
|
|
180
180
|
xdoc = xdb.documents.new("stink and bones", :language => :english, :stemmer => :french)
|
181
181
|
xdoc.stemmer.call("contournait").should == "contourn"
|
182
182
|
end
|
183
|
-
|
183
|
+
|
184
184
|
end
|
185
185
|
|
186
186
|
describe "stopper" do
|
@@ -188,7 +188,7 @@ describe XapianDoc do
|
|
188
188
|
xdb = XapianDb.new(:language => :french)
|
189
189
|
xdoc = xdb.documents.new("stink and bones")
|
190
190
|
xdoc.stopper.call("avec").should == true
|
191
|
-
end
|
191
|
+
end
|
192
192
|
it "should return a stopper for the document language, overriding the db" do
|
193
193
|
xdb = XapianDb.new(:language => :english)
|
194
194
|
xdoc = xdb.documents.new("stink and bones", :language => :french)
|
@@ -198,7 +198,7 @@ describe XapianDoc do
|
|
198
198
|
xdb = XapianDb.new(:language => :german)
|
199
199
|
xdoc = xdb.documents.new("stink and bones", :language => :english, :stopper => :french)
|
200
200
|
xdoc.stopper.call("avec").should == true
|
201
|
-
end
|
201
|
+
end
|
202
202
|
end
|
203
203
|
|
204
204
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'xapian'
|
2
|
-
require 'lib/xapian_fu.rb'
|
2
|
+
require File.expand_path('../lib/xapian_fu.rb', File.dirname(__FILE__))
|
3
3
|
include XapianFu
|
4
4
|
require 'fileutils'
|
5
5
|
require 'fixtures/film_data'
|
@@ -34,12 +34,12 @@ describe XapianDocValueAccessor do
|
|
34
34
|
doc.values.fetch(:number, Fixnum).should == number
|
35
35
|
doc.to_xapian_document.values.first.value.should == [number].pack("G")
|
36
36
|
end
|
37
|
-
end
|
37
|
+
end
|
38
38
|
|
39
39
|
it "should store fields defined as Bignum as packed double-precision float, network byte order" do
|
40
40
|
xdb = XapianDb.new(:fields => { :number => { :type => Bignum, :store => true } })
|
41
41
|
[
|
42
|
-
(-0x1fffffffffffff..-0x1fffffffffffff + 10).to_a,
|
42
|
+
(-0x1fffffffffffff..-0x1fffffffffffff + 10).to_a,
|
43
43
|
(0x1fffffffffffff-10..0x1fffffffffffff).to_a
|
44
44
|
].flatten.each do |number|
|
45
45
|
doc = xdb.documents.new(:number => number)
|
@@ -56,7 +56,7 @@ describe XapianDocValueAccessor do
|
|
56
56
|
lambda { doc.values.store(:number, number, Bignum) }.should raise_error XapianFu::ValueOutOfBounds
|
57
57
|
end
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
it "should store fields defined as Float as packed double-precision float, network byte order" do
|
61
61
|
xdb = XapianDb.new(:fields => { :number => { :type => Float, :store => true } })
|
62
62
|
[-0.303393984588383833, 8.448488388488384, 1.0].each do |number|
|
@@ -64,9 +64,9 @@ describe XapianDocValueAccessor do
|
|
64
64
|
doc.values.store(:number, number).should == number
|
65
65
|
doc.values.fetch(:number).should == number
|
66
66
|
doc.to_xapian_document.values.first.value.should == [number].pack("G")
|
67
|
-
end
|
67
|
+
end
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
it "should store fields defined as Time in UTC as packed double-precision float, network byte order" do
|
71
71
|
xdb = XapianDb.new(:fields => { :created_at => { :type => Time, :store => true }})
|
72
72
|
time = Time.now
|
@@ -110,22 +110,22 @@ describe XapianDocValueAccessor do
|
|
110
110
|
film_data_path = File.join(File.dirname(__FILE__), "fixtures/film_data")
|
111
111
|
Dir.foreach(film_data_path) do |db_path|
|
112
112
|
next unless db_path =~ /.+~.+/
|
113
|
-
it "should read stored values from databases created by #{db_path}" do
|
113
|
+
it "should read stored values from databases created by #{db_path}" do
|
114
114
|
db = XapianDb.new(:dir => File.join(film_data_path, db_path),
|
115
|
-
:fields => {
|
115
|
+
:fields => {
|
116
116
|
:title => { :type => String, :store => true },
|
117
|
-
:released_on => { :type => Date, :store => true },
|
117
|
+
:released_on => { :type => Date, :store => true },
|
118
118
|
:revenue => { :type => Integer, :store => true }
|
119
119
|
})
|
120
120
|
FILM_DATA.size.times do |i|
|
121
121
|
doc = db.documents[i+1]
|
122
122
|
[:title, :released_on, :revenue].each do |field|
|
123
|
-
doc.values[field].should === FILM_DATA[i][field]
|
123
|
+
doc.values[field].should === FILM_DATA[i][field]
|
124
124
|
end
|
125
125
|
end
|
126
126
|
end
|
127
127
|
end
|
128
|
-
|
128
|
+
|
129
129
|
end
|
130
130
|
|
131
131
|
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xapian-fu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 11
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 1
|
9
8
|
- 2
|
10
|
-
version: 1.
|
9
|
+
version: "1.2"
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- John Leach
|
@@ -15,10 +14,24 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date:
|
19
|
-
|
20
|
-
|
21
|
-
|
17
|
+
date: 2011-07-05 00:00:00 Z
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: rspec
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - "="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
hash: 27
|
28
|
+
segments:
|
29
|
+
- 1
|
30
|
+
- 3
|
31
|
+
- 0
|
32
|
+
version: 1.3.0
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
22
35
|
description: A library to provide a more Ruby-like interface to the Xapian search engine.
|
23
36
|
email: john@johnleach.co.uk
|
24
37
|
executables: []
|
@@ -120,7 +133,6 @@ files:
|
|
120
133
|
- spec/build_db_for_value_testing.rb
|
121
134
|
- spec/query_parser_spec.rb
|
122
135
|
- spec/spec.opts
|
123
|
-
has_rdoc: true
|
124
136
|
homepage: http://github.com/johnl/xapian-fu
|
125
137
|
licenses: []
|
126
138
|
|
@@ -154,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
166
|
requirements: []
|
155
167
|
|
156
168
|
rubyforge_project: xapian-fu
|
157
|
-
rubygems_version: 1.
|
169
|
+
rubygems_version: 1.7.2
|
158
170
|
signing_key:
|
159
171
|
specification_version: 3
|
160
172
|
summary: A Ruby interface to the Xapian search engine
|
@@ -218,3 +230,4 @@ test_files:
|
|
218
230
|
- spec/build_db_for_value_testing.rb
|
219
231
|
- spec/query_parser_spec.rb
|
220
232
|
- spec/spec.opts
|
233
|
+
has_rdoc:
|