xapian-fu 0.2 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +152 -13
- data/examples/query.rb +34 -6
- data/examples/spider.rb +44 -15
- data/lib/xapian_fu/query_parser.rb +179 -0
- data/lib/xapian_fu/result_set.rb +52 -0
- data/lib/xapian_fu/stopper_factory.rb +40 -0
- data/lib/xapian_fu/stopwords/README +7 -0
- data/lib/xapian_fu/stopwords/danish.txt +102 -0
- data/lib/xapian_fu/stopwords/dutch.txt +113 -0
- data/lib/xapian_fu/stopwords/english.txt +312 -0
- data/lib/xapian_fu/stopwords/finnish.txt +89 -0
- data/lib/xapian_fu/stopwords/french.txt +168 -0
- data/lib/xapian_fu/stopwords/german.txt +286 -0
- data/lib/xapian_fu/stopwords/hungarian.txt +203 -0
- data/lib/xapian_fu/stopwords/italian.txt +295 -0
- data/lib/xapian_fu/stopwords/norwegian.txt +186 -0
- data/lib/xapian_fu/stopwords/portuguese.txt +245 -0
- data/lib/xapian_fu/stopwords/russian.txt +236 -0
- data/lib/xapian_fu/stopwords/spanish.txt +348 -0
- data/lib/xapian_fu/stopwords/swedish.txt +125 -0
- data/lib/xapian_fu/stopwords/update.rb +7 -0
- data/lib/xapian_fu/xapian_db.rb +215 -99
- data/lib/xapian_fu/xapian_doc.rb +229 -47
- data/lib/xapian_fu/xapian_doc_value_accessor.rb +125 -0
- data/lib/xapian_fu/xapian_documents_accessor.rb +82 -0
- data/lib/xapian_fu.rb +1 -0
- data/spec/query_parser_spec.rb +43 -0
- data/spec/stopper_factory_spec.rb +57 -0
- data/spec/xapian_db_spec.rb +458 -215
- data/spec/xapian_doc_spec.rb +180 -0
- data/spec/xapian_doc_value_accessor_spec.rb +92 -0
- metadata +29 -5
data/spec/xapian_doc_spec.rb
CHANGED
@@ -13,4 +13,184 @@ describe XapianDoc do
|
|
13
13
|
XapianDoc.new(:id => 666).should_not == XapianDoc.new(:id => 667)
|
14
14
|
end
|
15
15
|
|
16
|
+
describe "to_xapian_document" do
|
17
|
+
it "should tokenize strings" do
|
18
|
+
xdb = XapianDb.new
|
19
|
+
xdoc = xdb.documents.new("once upon a time").to_xapian_document
|
20
|
+
xdoc.terms.should be_a_kind_of Array
|
21
|
+
xdoc.terms.last.should be_a_kind_of Xapian::Term
|
22
|
+
xdoc.terms.last.term.should == "upon"
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should store positions by default when tokenizing" do
|
26
|
+
xdb = XapianDb.new
|
27
|
+
doc = xdb.documents.new("once upon a time")
|
28
|
+
doc.save
|
29
|
+
xdb.ro.positionlist(doc.id, "time").first.should_not == nil
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should not store positions when tokenizing when :index_positions is set to false" do
|
33
|
+
xdb = XapianDb.new(:index_positions => false)
|
34
|
+
doc = xdb.documents.new("once upon a time")
|
35
|
+
doc.save
|
36
|
+
xdb.ro.positionlist(doc.id, "once").first.should == nil
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should tokenize a hash" do
|
40
|
+
xdb = XapianDb.new
|
41
|
+
xdoc = xdb.documents.new(:title => 'once upon a time').to_xapian_document
|
42
|
+
xdoc.terms.should be_a_kind_of Array
|
43
|
+
xdoc.terms.last.should be_a_kind_of Xapian::Term
|
44
|
+
xdoc.terms.last.term.should == "upon"
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should tokenize the fields of a hash separately" do
|
48
|
+
xdb = XapianDb.new
|
49
|
+
xdoc = xdb.documents.new({ :text => "once upon a time", :title => "A story" }).to_xapian_document
|
50
|
+
terms = xdoc.terms.collect { |t| t.term }
|
51
|
+
terms.should include "XTEXTonce"
|
52
|
+
terms.should include "XTITLEstory"
|
53
|
+
terms.should_not include "XTEXTstory"
|
54
|
+
end
|
55
|
+
|
56
|
+
it "should not tokenize fields declared as not to be indexed" do
|
57
|
+
xdb = XapianDb.new(:fields => { :name => { :index => false } })
|
58
|
+
xdoc = xdb.documents.new({ :name => 'John Leach', :quote => 'Xapian Rocks' }).to_xapian_document
|
59
|
+
terms = xdoc.terms.collect { |t| t.term }
|
60
|
+
terms.should_not include 'XNAMEjohn'
|
61
|
+
terms.should_not include 'XNAMEleach'
|
62
|
+
terms.should_not include 'Zjohn'
|
63
|
+
terms.should_not include 'Zleach'
|
64
|
+
terms.should_not include 'john'
|
65
|
+
terms.should_not include 'leach'
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should convert Time instances to a useful format when tokenizing" do
|
69
|
+
time = Time.now
|
70
|
+
xdb = XapianDb.new
|
71
|
+
xdoc = xdb.documents.new(:created_at => time).to_xapian_document
|
72
|
+
terms = xdoc.terms.collect { |t| t.term }
|
73
|
+
terms.should include time.utc.strftime("%Y%m%d%H%M%S")
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should convert DateTime instances to a useful format when tokenizing" do
|
77
|
+
datetime = DateTime.now
|
78
|
+
xdb = XapianDb.new
|
79
|
+
xdoc = xdb.documents.new(:created_at => datetime).to_xapian_document
|
80
|
+
terms = xdoc.terms.collect { |t| t.term }
|
81
|
+
terms.should include datetime.strftime("%Y%m%d%H%M%S")
|
82
|
+
end
|
83
|
+
|
84
|
+
it "should convert Date instances to a useful format when tokenizing" do
|
85
|
+
date = Date.today
|
86
|
+
xdb = XapianDb.new
|
87
|
+
xdoc = xdb.documents.new(:created_on => date).to_xapian_document
|
88
|
+
terms = xdoc.terms.collect { |t| t.term }
|
89
|
+
terms.should include date.strftime("%Y%m%d")
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should stem English words by default" do
|
93
|
+
xdb = XapianDb.new
|
94
|
+
xdoc = xdb.documents.new("She fished for fish").to_xapian_document
|
95
|
+
terms = xdoc.terms.collect { |t| t.term }
|
96
|
+
terms.should_not include "Zfished"
|
97
|
+
terms.should include "Zfish"
|
98
|
+
end
|
99
|
+
|
100
|
+
it "should inherit the databases stemmer by default" do
|
101
|
+
xdb = XapianDb.new(:stemmer => :french)
|
102
|
+
xdoc = xdb.documents.new("majestueusement").to_xapian_document
|
103
|
+
terms = xdoc.terms.collect { |t| t.term }
|
104
|
+
terms.should include 'Zmajestu'
|
105
|
+
end
|
106
|
+
|
107
|
+
stems = {
|
108
|
+
:german => { "aufeinander" => "aufeinand" },
|
109
|
+
:french => { "majestueusement" => "majestu" },
|
110
|
+
:swedish => { "kloekornas" => "kloek" },
|
111
|
+
:danish => { "indvendingerne" => "indvending" },
|
112
|
+
:russian => { "падшую" => "падш" },
|
113
|
+
:italian => { "propagamento" => "propag" }
|
114
|
+
}
|
115
|
+
stems.keys.each do |lang|
|
116
|
+
stems[lang].each do |word, stem|
|
117
|
+
it "should stem #{lang.to_s.capitalize} words when the :stemmer option is set to :#{lang}" do
|
118
|
+
xdb = XapianDb.new
|
119
|
+
xdoc = xdb.documents.new(word, :stemmer => lang).to_xapian_document
|
120
|
+
terms = xdoc.terms.collect { |t| t.term }
|
121
|
+
terms.should include 'Z'+stem
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should not stem words when stemmer is set to false" do
|
127
|
+
xdb = XapianDb.new
|
128
|
+
xdoc = xdb.documents.new("She fished for fish", :stemmer => false).to_xapian_document
|
129
|
+
terms = xdoc.terms.collect { |t| t.term if t.term =~ /^Z/ }.compact
|
130
|
+
terms.should be_empty
|
131
|
+
end
|
132
|
+
|
133
|
+
it "should not stem english stop words by default" do
|
134
|
+
xdb = XapianDb.new
|
135
|
+
xdoc = xdb.documents.new("And they made a cake", :stemmer => :english).to_xapian_document
|
136
|
+
terms = xdoc.terms.collect { |t| t.term }
|
137
|
+
terms.should_not include 'Zand'
|
138
|
+
terms.should_not include 'Za'
|
139
|
+
terms.should include 'Zcake'
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should allow setting the stopper on initialisation" do
|
143
|
+
xdb = XapianDb.new(:stopper => :english)
|
144
|
+
xdoc = xdb.documents.new("And they made a cake", :stopper => :french)
|
145
|
+
xdoc.stopper.call("ayantes").should == true
|
146
|
+
xdoc.stopper.call("and").should == false
|
147
|
+
end
|
148
|
+
|
149
|
+
it "should support stop words encoded in utf8" do
|
150
|
+
xdb = XapianDb.new
|
151
|
+
xdoc = xdb.documents.new("и они made a cake", :stemmer => :russian, :stopper => :russian).to_xapian_document
|
152
|
+
terms = xdoc.terms.collect { |t| t.term }
|
153
|
+
terms.should_not include 'Zи'
|
154
|
+
terms.should_not include 'Zони'
|
155
|
+
terms.should include 'Zcake'
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
describe "stemmer" do
|
160
|
+
it "should return the same stemmer as the database by default" do
|
161
|
+
xdb = XapianDb.new(:language => :french)
|
162
|
+
xdoc = xdb.documents.new("stink and bones")
|
163
|
+
xdoc.stemmer.call("contournait").should == "contourn"
|
164
|
+
end
|
165
|
+
it "should return a stemmer for the document language, overriding the db" do
|
166
|
+
xdb = XapianDb.new(:language => :english)
|
167
|
+
xdoc = xdb.documents.new("stink and bones", :language => :french)
|
168
|
+
xdoc.stemmer.call("contournait").should == "contourn"
|
169
|
+
end
|
170
|
+
it "should return a stemmer set by the :stemmer option, overriding the :language option and the db stemmer" do
|
171
|
+
xdb = XapianDb.new(:language => :german)
|
172
|
+
xdoc = xdb.documents.new("stink and bones", :language => :english, :stemmer => :french)
|
173
|
+
xdoc.stemmer.call("contournait").should == "contourn"
|
174
|
+
end
|
175
|
+
|
176
|
+
end
|
177
|
+
|
178
|
+
describe "stopper" do
|
179
|
+
it "should return the same stopper as the database by default" do
|
180
|
+
xdb = XapianDb.new(:language => :french)
|
181
|
+
xdoc = xdb.documents.new("stink and bones")
|
182
|
+
xdoc.stopper.call("avec").should == true
|
183
|
+
end
|
184
|
+
it "should return a stopper for the document language, overriding the db" do
|
185
|
+
xdb = XapianDb.new(:language => :english)
|
186
|
+
xdoc = xdb.documents.new("stink and bones", :language => :french)
|
187
|
+
xdoc.stopper.call("avec").should == true
|
188
|
+
end
|
189
|
+
it "should return a stopper set by the :stopper option, overriding the :language option and the db stopper" do
|
190
|
+
xdb = XapianDb.new(:language => :german)
|
191
|
+
xdoc = xdb.documents.new("stink and bones", :language => :english, :stopper => :french)
|
192
|
+
xdoc.stopper.call("avec").should == true
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
16
196
|
end
|
data/spec/xapian_doc_value_accessor_spec.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'xapian'
|
2
|
+
require 'lib/xapian_fu.rb'
|
3
|
+
include XapianFu
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
describe XapianDocValueAccessor do
|
7
|
+
|
8
|
+
it "should store and fetch values like a hash" do
|
9
|
+
values = XapianDocValueAccessor.new(XapianDoc.new(nil))
|
10
|
+
values.store(:city, "Leeds").should == "Leeds"
|
11
|
+
values.fetch(:city).should == "Leeds"
|
12
|
+
values[:city] = "London"
|
13
|
+
values[:city].should == "London"
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should add and retrieve values from the Xapian::Document" do
|
17
|
+
doc = XapianDoc.new(nil)
|
18
|
+
values = XapianDocValueAccessor.new(doc)
|
19
|
+
lambda { values[:city] = "London" }.should change(doc.xapian_document, :values_count).by(1)
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should store fields defined as Fixnum as packed Long" do
|
23
|
+
xdb = XapianDb.new(:fields => { :number => { :type => Fixnum, :store => true } })
|
24
|
+
[-83883, 256532, 0, 0x3fffffff].each do |number|
|
25
|
+
doc = xdb.documents.new(:number => number)
|
26
|
+
doc.values.store(:number, number, Fixnum).should == number
|
27
|
+
doc.values.fetch(:number, Fixnum).should == number
|
28
|
+
doc.to_xapian_document.values.first.value.should == [number].pack("l")
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should store fields defined as Bignum as packed Double-precision float, big-endian byte order" do
|
33
|
+
xdb = XapianDb.new(:fields => { :number => { :type => Bignum, :store => true } })
|
34
|
+
doc = xdb.documents.new(:number => 0x3fffffffffffff)
|
35
|
+
doc.values.store(:number, 0x3fffffffffffff).should == 0x3fffffffffffff
|
36
|
+
doc.values.fetch(:number).should == 0x3fffffffffffff
|
37
|
+
doc.to_xapian_document.values.first.value.should == [0x3fffffffffffff].pack("G")
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should store fields defined as Float as packed Double-precision float, big-endian byte order" do
|
41
|
+
xdb = XapianDb.new(:fields => { :number => { :type => Float, :store => true } })
|
42
|
+
[-0.303393984588383833, 8.448488388488384, 1.0].each do |number|
|
43
|
+
doc = xdb.documents.new(:number => number)
|
44
|
+
doc.values.store(:number, number).should == number
|
45
|
+
doc.values.fetch(:number).should == number
|
46
|
+
doc.to_xapian_document.values.first.value.should == [number].pack("G")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should store fields defined as Time in UTC as packed Double-precision float, big-endian byte order" do
|
51
|
+
xdb = XapianDb.new(:fields => { :created_at => { :type => Time, :store => true }})
|
52
|
+
time = Time.now
|
53
|
+
doc = xdb.documents.new(:created_at => time)
|
54
|
+
doc.values.store(:created_at, time).should == time
|
55
|
+
doc.values.fetch(:created_at).should == time
|
56
|
+
doc.to_xapian_document.values.first.value.should == [time.utc.to_f].pack("G")
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should store fields defined as DateTime as a string" do
|
60
|
+
xdb = XapianDb.new(:fields => { :created_at => { :type => DateTime, :store => true }})
|
61
|
+
datetime = DateTime.now
|
62
|
+
doc = xdb.documents.new(:created_at => datetime)
|
63
|
+
doc.values.store(:created_at, datetime).should == datetime
|
64
|
+
doc.values.fetch(:created_at).should be_close(datetime, 0.0001) # milliseconds are not stored
|
65
|
+
doc.to_xapian_document.values.first.value.should == datetime.to_s
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should store fields defined as Date as a string" do
|
69
|
+
xdb = XapianDb.new(:fields => { :created_on => { :type => Date, :store => true }})
|
70
|
+
date = Date.today
|
71
|
+
doc = xdb.documents.new(:created_on => date)
|
72
|
+
doc.values.store(:created_on, date).should == date
|
73
|
+
doc.values.fetch(:created_on).should == date
|
74
|
+
doc.to_xapian_document.values.first.value.should == date.to_s
|
75
|
+
end
|
76
|
+
|
77
|
+
it "should count the stored values when size is called" do
|
78
|
+
doc = XapianDoc.new(nil)
|
79
|
+
lambda { doc.values[:city] = "London" }.should change(doc.values, :size).by(1)
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should delete values from the Xapian::Document" do
|
83
|
+
doc = XapianDoc.new(nil)
|
84
|
+
doc.values[:city] = "Leeds"
|
85
|
+
lambda { doc.values.delete(:city) }.should change(doc.values, :size).by(-1)
|
86
|
+
doc.values[:city] = "London"
|
87
|
+
doc.values.delete(:city).should == "London"
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xapian-fu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Leach
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-09-09 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -24,9 +24,28 @@ extra_rdoc_files:
|
|
24
24
|
- LICENSE
|
25
25
|
files:
|
26
26
|
- lib/xapian_fu.rb
|
27
|
-
- lib/xapian_fu
|
27
|
+
- lib/xapian_fu/result_set.rb
|
28
|
+
- lib/xapian_fu/stopper_factory.rb
|
29
|
+
- lib/xapian_fu/stopwords/swedish.txt
|
30
|
+
- lib/xapian_fu/stopwords/french.txt
|
31
|
+
- lib/xapian_fu/stopwords/english.txt
|
32
|
+
- lib/xapian_fu/stopwords/dutch.txt
|
33
|
+
- lib/xapian_fu/stopwords/italian.txt
|
34
|
+
- lib/xapian_fu/stopwords/norwegian.txt
|
35
|
+
- lib/xapian_fu/stopwords/finnish.txt
|
36
|
+
- lib/xapian_fu/stopwords/hungarian.txt
|
37
|
+
- lib/xapian_fu/stopwords/README
|
38
|
+
- lib/xapian_fu/stopwords/update.rb
|
39
|
+
- lib/xapian_fu/stopwords/portuguese.txt
|
40
|
+
- lib/xapian_fu/stopwords/german.txt
|
41
|
+
- lib/xapian_fu/stopwords/danish.txt
|
42
|
+
- lib/xapian_fu/stopwords/spanish.txt
|
43
|
+
- lib/xapian_fu/stopwords/russian.txt
|
28
44
|
- lib/xapian_fu/xapian_db.rb
|
29
45
|
- lib/xapian_fu/xapian_doc.rb
|
46
|
+
- lib/xapian_fu/xapian_documents_accessor.rb
|
47
|
+
- lib/xapian_fu/query_parser.rb
|
48
|
+
- lib/xapian_fu/xapian_doc_value_accessor.rb
|
30
49
|
- examples/ar_spider.rb
|
31
50
|
- examples/query.rb
|
32
51
|
- examples/spider.rb
|
@@ -35,6 +54,8 @@ files:
|
|
35
54
|
- LICENSE
|
36
55
|
has_rdoc: true
|
37
56
|
homepage: http://github.com/johnl/xapian-fu/tree/master
|
57
|
+
licenses: []
|
58
|
+
|
38
59
|
post_install_message:
|
39
60
|
rdoc_options:
|
40
61
|
- --title
|
@@ -59,11 +80,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
59
80
|
requirements: []
|
60
81
|
|
61
82
|
rubyforge_project: xapian-fu
|
62
|
-
rubygems_version: 1.3.
|
83
|
+
rubygems_version: 1.3.5
|
63
84
|
signing_key:
|
64
|
-
specification_version:
|
85
|
+
specification_version: 3
|
65
86
|
summary: A Ruby interface to the Xapian search engine
|
66
87
|
test_files:
|
88
|
+
- spec/stopper_factory_spec.rb
|
67
89
|
- spec/xapian_doc_spec.rb
|
68
90
|
- spec/xapian_db_spec.rb
|
91
|
+
- spec/query_parser_spec.rb
|
92
|
+
- spec/xapian_doc_value_accessor_spec.rb
|
69
93
|
- spec/spec.opts
|