xapian-fu 0.2 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,4 +13,184 @@ describe XapianDoc do
13
13
  XapianDoc.new(:id => 666).should_not == XapianDoc.new(:id => 667)
14
14
  end
15
15
 
16
+ describe "to_xapian_document" do
17
+ it "should tokenize strings" do
18
+ xdb = XapianDb.new
19
+ xdoc = xdb.documents.new("once upon a time").to_xapian_document
20
+ xdoc.terms.should be_a_kind_of Array
21
+ xdoc.terms.last.should be_a_kind_of Xapian::Term
22
+ xdoc.terms.last.term.should == "upon"
23
+ end
24
+
25
+ it "should store positions by default when tokenizing" do
26
+ xdb = XapianDb.new
27
+ doc = xdb.documents.new("once upon a time")
28
+ doc.save
29
+ xdb.ro.positionlist(doc.id, "time").first.should_not == nil
30
+ end
31
+
32
+ it "should not store positions when tokenizing when :index_positions is set to false" do
33
+ xdb = XapianDb.new(:index_positions => false)
34
+ doc = xdb.documents.new("once upon a time")
35
+ doc.save
36
+ xdb.ro.positionlist(doc.id, "once").first.should == nil
37
+ end
38
+
39
+ it "should tokenize a hash" do
40
+ xdb = XapianDb.new
41
+ xdoc = xdb.documents.new(:title => 'once upon a time').to_xapian_document
42
+ xdoc.terms.should be_a_kind_of Array
43
+ xdoc.terms.last.should be_a_kind_of Xapian::Term
44
+ xdoc.terms.last.term.should == "upon"
45
+ end
46
+
47
+ it "should tokenize the fields of a hash separately" do
48
+ xdb = XapianDb.new
49
+ xdoc = xdb.documents.new({ :text => "once upon a time", :title => "A story" }).to_xapian_document
50
+ terms = xdoc.terms.collect { |t| t.term }
51
+ terms.should include "XTEXTonce"
52
+ terms.should include "XTITLEstory"
53
+ terms.should_not include "XTEXTstory"
54
+ end
55
+
56
+ it "should not tokenize fields declared as not to be indexed" do
57
+ xdb = XapianDb.new(:fields => { :name => { :index => false } })
58
+ xdoc = xdb.documents.new({ :name => 'John Leach', :quote => 'Xapian Rocks' }).to_xapian_document
59
+ terms = xdoc.terms.collect { |t| t.term }
60
+ terms.should_not include 'XNAMEjohn'
61
+ terms.should_not include 'XNAMEleach'
62
+ terms.should_not include 'Zjohn'
63
+ terms.should_not include 'Zleach'
64
+ terms.should_not include 'john'
65
+ terms.should_not include 'leach'
66
+ end
67
+
68
+ it "should convert Time instances to a useful format when tokenizing" do
69
+ time = Time.now
70
+ xdb = XapianDb.new
71
+ xdoc = xdb.documents.new(:created_at => time).to_xapian_document
72
+ terms = xdoc.terms.collect { |t| t.term }
73
+ terms.should include time.utc.strftime("%Y%m%d%H%M%S")
74
+ end
75
+
76
+ it "should convert DateTime instances to a useful format when tokenizing" do
77
+ datetime = DateTime.now
78
+ xdb = XapianDb.new
79
+ xdoc = xdb.documents.new(:created_at => datetime).to_xapian_document
80
+ terms = xdoc.terms.collect { |t| t.term }
81
+ terms.should include datetime.strftime("%Y%m%d%H%M%S")
82
+ end
83
+
84
+ it "should convert Date instances to a useful format when tokenizing" do
85
+ date = Date.today
86
+ xdb = XapianDb.new
87
+ xdoc = xdb.documents.new(:created_on => date).to_xapian_document
88
+ terms = xdoc.terms.collect { |t| t.term }
89
+ terms.should include date.strftime("%Y%m%d")
90
+ end
91
+
92
+ it "should stem English words by default" do
93
+ xdb = XapianDb.new
94
+ xdoc = xdb.documents.new("She fished for fish").to_xapian_document
95
+ terms = xdoc.terms.collect { |t| t.term }
96
+ terms.should_not include "Zfished"
97
+ terms.should include "Zfish"
98
+ end
99
+
100
+ it "should inherit the database's stemmer by default" do
101
+ xdb = XapianDb.new(:stemmer => :french)
102
+ xdoc = xdb.documents.new("majestueusement").to_xapian_document
103
+ terms = xdoc.terms.collect { |t| t.term }
104
+ terms.should include 'Zmajestu'
105
+ end
106
+
107
+ stems = {
108
+ :german => { "aufeinander" => "aufeinand" },
109
+ :french => { "majestueusement" => "majestu" },
110
+ :swedish => { "kloekornas" => "kloek" },
111
+ :danish => { "indvendingerne" => "indvending" },
112
+ :russian => { "падшую" => "падш" },
113
+ :italian => { "propagamento" => "propag" }
114
+ }
115
+ stems.keys.each do |lang|
116
+ stems[lang].each do |word, stem|
117
+ it "should stem #{lang.to_s.capitalize} words when the :stemmer option is set to :#{lang}" do
118
+ xdb = XapianDb.new
119
+ xdoc = xdb.documents.new(word, :stemmer => lang).to_xapian_document
120
+ terms = xdoc.terms.collect { |t| t.term }
121
+ terms.should include 'Z'+stem
122
+ end
123
+ end
124
+ end
125
+
126
+ it "should not stem words when stemmer is set to false" do
127
+ xdb = XapianDb.new
128
+ xdoc = xdb.documents.new("She fished for fish", :stemmer => false).to_xapian_document
129
+ terms = xdoc.terms.collect { |t| t.term if t.term =~ /^Z/ }.compact
130
+ terms.should be_empty
131
+ end
132
+
133
+ it "should not stem english stop words by default" do
134
+ xdb = XapianDb.new
135
+ xdoc = xdb.documents.new("And they made a cake", :stemmer => :english).to_xapian_document
136
+ terms = xdoc.terms.collect { |t| t.term }
137
+ terms.should_not include 'Zand'
138
+ terms.should_not include 'Za'
139
+ terms.should include 'Zcake'
140
+ end
141
+
142
+ it "should allow setting the stopper on initialisation" do
143
+ xdb = XapianDb.new(:stopper => :english)
144
+ xdoc = xdb.documents.new("And they made a cake", :stopper => :french)
145
+ xdoc.stopper.call("ayantes").should == true
146
+ xdoc.stopper.call("and").should == false
147
+ end
148
+
149
+ it "should support stop words encoded in utf8" do
150
+ xdb = XapianDb.new
151
+ xdoc = xdb.documents.new("и они made a cake", :stemmer => :russian, :stopper => :russian).to_xapian_document
152
+ terms = xdoc.terms.collect { |t| t.term }
153
+ terms.should_not include 'Zи'
154
+ terms.should_not include 'Zони'
155
+ terms.should include 'Zcake'
156
+ end
157
+ end
158
+
159
+ describe "stemmer" do
160
+ it "should return the same stemmer as the database by default" do
161
+ xdb = XapianDb.new(:language => :french)
162
+ xdoc = xdb.documents.new("stink and bones")
163
+ xdoc.stemmer.call("contournait").should == "contourn"
164
+ end
165
+ it "should return a stemmer for the document language, overriding the db" do
166
+ xdb = XapianDb.new(:language => :english)
167
+ xdoc = xdb.documents.new("stink and bones", :language => :french)
168
+ xdoc.stemmer.call("contournait").should == "contourn"
169
+ end
170
+ it "should return a stemmer set by the :stemmer option, overriding the :language option and the db stemmer" do
171
+ xdb = XapianDb.new(:language => :german)
172
+ xdoc = xdb.documents.new("stink and bones", :language => :english, :stemmer => :french)
173
+ xdoc.stemmer.call("contournait").should == "contourn"
174
+ end
175
+
176
+ end
177
+
178
+ describe "stopper" do
179
+ it "should return the same stopper as the database by default" do
180
+ xdb = XapianDb.new(:language => :french)
181
+ xdoc = xdb.documents.new("stink and bones")
182
+ xdoc.stopper.call("avec").should == true
183
+ end
184
+ it "should return a stopper for the document language, overriding the db" do
185
+ xdb = XapianDb.new(:language => :english)
186
+ xdoc = xdb.documents.new("stink and bones", :language => :french)
187
+ xdoc.stopper.call("avec").should == true
188
+ end
189
+ it "should return a stopper set by the :stopper option, overriding the :language option and the db stopper" do
190
+ xdb = XapianDb.new(:language => :german)
191
+ xdoc = xdb.documents.new("stink and bones", :language => :english, :stopper => :french)
192
+ xdoc.stopper.call("avec").should == true
193
+ end
194
+ end
195
+
16
196
  end
@@ -0,0 +1,92 @@
1
+ require 'xapian'
2
+ require 'lib/xapian_fu.rb'
3
+ include XapianFu
4
+ require 'fileutils'
5
+
6
+ describe XapianDocValueAccessor do
7
+
8
+ it "should store and fetch values like a hash" do
9
+ values = XapianDocValueAccessor.new(XapianDoc.new(nil))
10
+ values.store(:city, "Leeds").should == "Leeds"
11
+ values.fetch(:city).should == "Leeds"
12
+ values[:city] = "London"
13
+ values[:city].should == "London"
14
+ end
15
+
16
+ it "should add and retrieve values from the Xapian::Document" do
17
+ doc = XapianDoc.new(nil)
18
+ values = XapianDocValueAccessor.new(doc)
19
+ lambda { values[:city] = "London" }.should change(doc.xapian_document, :values_count).by(1)
20
+ end
21
+
22
+ it "should store fields defined as Fixnum as packed Long" do
23
+ xdb = XapianDb.new(:fields => { :number => { :type => Fixnum, :store => true } })
24
+ [-83883, 256532, 0, 0x3fffffff].each do |number|
25
+ doc = xdb.documents.new(:number => number)
26
+ doc.values.store(:number, number, Fixnum).should == number
27
+ doc.values.fetch(:number, Fixnum).should == number
28
+ doc.to_xapian_document.values.first.value.should == [number].pack("l")
29
+ end
30
+ end
31
+
32
+ it "should store fields defined as Bignum as packed Double-precision float, big-endian byte order" do
33
+ xdb = XapianDb.new(:fields => { :number => { :type => Bignum, :store => true } })
34
+ doc = xdb.documents.new(:number => 0x3fffffffffffff)
35
+ doc.values.store(:number, 0x3fffffffffffff).should == 0x3fffffffffffff
36
+ doc.values.fetch(:number).should == 0x3fffffffffffff
37
+ doc.to_xapian_document.values.first.value.should == [0x3fffffffffffff].pack("G")
38
+ end
39
+
40
+ it "should store fields defined as Float as packed Double-precision float, big-endian byte order" do
41
+ xdb = XapianDb.new(:fields => { :number => { :type => Float, :store => true } })
42
+ [-0.303393984588383833, 8.448488388488384, 1.0].each do |number|
43
+ doc = xdb.documents.new(:number => number)
44
+ doc.values.store(:number, number).should == number
45
+ doc.values.fetch(:number).should == number
46
+ doc.to_xapian_document.values.first.value.should == [number].pack("G")
47
+ end
48
+ end
49
+
50
+ it "should store fields defined as Time in UTC as packed Double-precision float, big-endian byte order" do
51
+ xdb = XapianDb.new(:fields => { :created_at => { :type => Time, :store => true }})
52
+ time = Time.now
53
+ doc = xdb.documents.new(:created_at => time)
54
+ doc.values.store(:created_at, time).should == time
55
+ doc.values.fetch(:created_at).should == time
56
+ doc.to_xapian_document.values.first.value.should == [time.utc.to_f].pack("G")
57
+ end
58
+
59
+ it "should store fields defined as DateTime as a string" do
60
+ xdb = XapianDb.new(:fields => { :created_at => { :type => DateTime, :store => true }})
61
+ datetime = DateTime.now
62
+ doc = xdb.documents.new(:created_at => datetime)
63
+ doc.values.store(:created_at, datetime).should == datetime
64
+ doc.values.fetch(:created_at).should be_close(datetime, 0.0001) # milliseconds are not stored
65
+ doc.to_xapian_document.values.first.value.should == datetime.to_s
66
+ end
67
+
68
+ it "should store fields defined as Date as a string" do
69
+ xdb = XapianDb.new(:fields => { :created_on => { :type => Date, :store => true }})
70
+ date = Date.today
71
+ doc = xdb.documents.new(:created_on => date)
72
+ doc.values.store(:created_on, date).should == date
73
+ doc.values.fetch(:created_on).should == date
74
+ doc.to_xapian_document.values.first.value.should == date.to_s
75
+ end
76
+
77
+ it "should count the stored values when size is called" do
78
+ doc = XapianDoc.new(nil)
79
+ lambda { doc.values[:city] = "London" }.should change(doc.values, :size).by(1)
80
+ end
81
+
82
+ it "should delete values from the Xapian::Document" do
83
+ doc = XapianDoc.new(nil)
84
+ doc.values[:city] = "Leeds"
85
+ lambda { doc.values.delete(:city) }.should change(doc.values, :size).by(-1)
86
+ doc.values[:city] = "London"
87
+ doc.values.delete(:city).should == "London"
88
+ end
89
+
90
+ end
91
+
92
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xapian-fu
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.2"
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Leach
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-06-20 00:00:00 +01:00
12
+ date: 2009-09-09 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -24,9 +24,28 @@ extra_rdoc_files:
24
24
  - LICENSE
25
25
  files:
26
26
  - lib/xapian_fu.rb
27
- - lib/xapian_fu
27
+ - lib/xapian_fu/result_set.rb
28
+ - lib/xapian_fu/stopper_factory.rb
29
+ - lib/xapian_fu/stopwords/swedish.txt
30
+ - lib/xapian_fu/stopwords/french.txt
31
+ - lib/xapian_fu/stopwords/english.txt
32
+ - lib/xapian_fu/stopwords/dutch.txt
33
+ - lib/xapian_fu/stopwords/italian.txt
34
+ - lib/xapian_fu/stopwords/norwegian.txt
35
+ - lib/xapian_fu/stopwords/finnish.txt
36
+ - lib/xapian_fu/stopwords/hungarian.txt
37
+ - lib/xapian_fu/stopwords/README
38
+ - lib/xapian_fu/stopwords/update.rb
39
+ - lib/xapian_fu/stopwords/portuguese.txt
40
+ - lib/xapian_fu/stopwords/german.txt
41
+ - lib/xapian_fu/stopwords/danish.txt
42
+ - lib/xapian_fu/stopwords/spanish.txt
43
+ - lib/xapian_fu/stopwords/russian.txt
28
44
  - lib/xapian_fu/xapian_db.rb
29
45
  - lib/xapian_fu/xapian_doc.rb
46
+ - lib/xapian_fu/xapian_documents_accessor.rb
47
+ - lib/xapian_fu/query_parser.rb
48
+ - lib/xapian_fu/xapian_doc_value_accessor.rb
30
49
  - examples/ar_spider.rb
31
50
  - examples/query.rb
32
51
  - examples/spider.rb
@@ -35,6 +54,8 @@ files:
35
54
  - LICENSE
36
55
  has_rdoc: true
37
56
  homepage: http://github.com/johnl/xapian-fu/tree/master
57
+ licenses: []
58
+
38
59
  post_install_message:
39
60
  rdoc_options:
40
61
  - --title
@@ -59,11 +80,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
59
80
  requirements: []
60
81
 
61
82
  rubyforge_project: xapian-fu
62
- rubygems_version: 1.3.1
83
+ rubygems_version: 1.3.5
63
84
  signing_key:
64
- specification_version: 2
85
+ specification_version: 3
65
86
  summary: A Ruby interface to the Xapian search engine
66
87
  test_files:
88
+ - spec/stopper_factory_spec.rb
67
89
  - spec/xapian_doc_spec.rb
68
90
  - spec/xapian_db_spec.rb
91
+ - spec/query_parser_spec.rb
92
+ - spec/xapian_doc_value_accessor_spec.rb
69
93
  - spec/spec.opts