acts_as_indexed 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/CHANGELOG +90 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +137 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/acts_as_indexed.gemspec +67 -0
- data/lib/acts_as_indexed.rb +248 -0
- data/lib/acts_as_indexed/configuration.rb +41 -0
- data/lib/acts_as_indexed/search_atom.rb +104 -0
- data/lib/acts_as_indexed/search_index.rb +325 -0
- data/lib/will_paginate_search.rb +29 -0
- data/rails/init.rb +2 -0
- data/test/abstract_unit.rb +52 -0
- data/test/acts_as_indexed_test.rb +133 -0
- data/test/configuration_test.rb +57 -0
- data/test/database.yml +10 -0
- data/test/fixtures/post.rb +5 -0
- data/test/fixtures/posts.yml +31 -0
- data/test/schema.rb +6 -0
- data/test/search_atom_test.rb +98 -0
- data/test/search_index_test.rb +50 -0
- metadata +94 -0
@@ -0,0 +1,41 @@
|
|
1
|
+
# ActsAsIndexed
|
2
|
+
# Copyright (c) 2007 - 2010 Douglas F Shearer.
|
3
|
+
# http://douglasfshearer.com
|
4
|
+
# Distributed under the MIT license as included with this plugin.
|
5
|
+
|
6
|
+
module ActsAsIndexed
  # Used to set up and modify settings for acts_as_indexed.
  #
  # Holds three settings: the index location, the index partitioning depth
  # and the minimum query word size. The two numeric settings are validated
  # on assignment and must be numbers no smaller than one.
  class Configuration

    # Sets the location for the index. Specify as an array. Heroku, for
    # example would use RAILS_ROOT/tmp/index, which would be set as
    # [Rails.root,'tmp','index']
    attr_accessor :index_file

    # Tuning value for the index partitioning. Larger values result in quicker
    # searches, but slower indexing. Default is 3.
    attr_reader :index_file_depth

    # Sets the minimum length for a word in a query. Words shorter than this
    # value are ignored in searches unless preceded by the '+' operator.
    # Default is 3.
    attr_reader :min_word_size

    # Builds a configuration with the defaults: index stored under
    # Rails.root/index, a partitioning depth of 3 and a minimum word size
    # of 3.
    def initialize
      @index_file = [Rails.root, 'index']
      @index_file_depth = 3
      @min_word_size = 3
    end

    # Sets the index partitioning depth.
    #
    # Raises ArgumentError if +val+ is not a number or is less than one.
    def index_file_depth=(val)
      @index_file_depth = validated_minimum(val, 'index_file_depth')
    end

    # Sets the minimum query word size.
    #
    # Raises ArgumentError if +val+ is not a number or is less than one.
    def min_word_size=(val)
      @min_word_size = validated_minimum(val, 'min_word_size')
    end

    private

    # Returns +val+ unchanged when it is a number of at least one, otherwise
    # raises ArgumentError naming the offending +setting+. Checking the type
    # first means nil (or any other non-numeric value) is reported as an
    # ArgumentError rather than surfacing as a NoMethodError from the
    # comparison.
    def validated_minimum(val, setting)
      unless val.is_a?(Numeric) && val >= 1
        raise(ArgumentError, "#{setting} cannot be less than one (1)")
      end
      val
    end

  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# ActsAsIndexed
|
2
|
+
# Copyright (c) 2007 - 2010 Douglas F Shearer.
|
3
|
+
# http://douglasfshearer.com
|
4
|
+
# Distributed under the MIT license as included with this plugin.
|
5
|
+
|
6
|
+
module ActsAsIndexed #:nodoc:
  # Holds, for a single word (atom), the records containing it and the
  # positions at which it occurs in each of those records.
  class SearchAtom

    # Contains a hash of records.
    # { 'record_id' => [pos1, pos2, pos] }
    #--
    # Weighting:
    # http://www.perlmonks.com/index.pl?node_id=27509
    # W(T, D) = tf(T, D) * log ( DN / df(T))
    # weighting = frequency_in_this_record * log (total_number_of_records / number_of_matching_records)

    def initialize
      @records = {}
    end

    # Returns true if the given record is present.
    def include_record?(record_id)
      @records.include?(record_id)
    end

    # Adds +record_id+ to the stored records. Positions already stored for
    # the record are left untouched.
    def add_record(record_id)
      @records[record_id] = [] if !include_record?(record_id)
    end

    # Adds +pos+ to the array of positions for +record_id+, registering the
    # record first if it is not yet known.
    def add_position(record_id, pos)
      add_record(record_id)
      @records[record_id] << pos
    end

    # Returns all record IDs stored in this Atom.
    def record_ids
      @records.keys
    end

    # Returns an array of positions for +record_id+ stored in this Atom, or
    # nil if the record is not present.
    def positions(record_id)
      @records[record_id]
    end

    # Removes +record_id+ from this Atom.
    def remove_record(record_id)
      @records.delete(record_id)
    end

    # Returns an atom containing the records and positions of +self+ preceded by +former+
    # "former latter" or "big dog" where "big" is the former and "dog" is the latter.
    def preceded_by(former)
      matches = SearchAtom.new
      # Only records present in both atoms can contain the phrase.
      former.record_ids.each do |record_id|
        own_positions = @records[record_id]
        next unless own_positions

        own_positions.each do |pos|
          # Keep the position when +former+ occurs immediately before it.
          # add_position registers the record itself when needed.
          matches.add_position(record_id, pos) if former.include_position?(record_id, pos - 1)
        end
      end
      matches
    end

    # Returns a hash of record_ids and weightings for each record in the
    # atom.
    #
    # weighting = frequency * log (records.size / records_with_atom)
    def weightings(records_size)
      out = {}
      return out if @records.empty?

      # Fixes a bug when the records_size is zero. i.e. The only record
      # containing the word has been deleted.
      if records_size < 1
        @records.each_key { |r_id| out[r_id] = 0.0 }
        return out
      end

      ## parndt 2010/05/03 changed to records_size.to_f to avoid -Infinity Errno::ERANGE exceptions
      ## which would happen for example Math.log(1 / 20) == -Infinity but Math.log(1.0 / 20) == -2.99573227355399
      # The logarithm is the same for every record, so compute it once.
      inverse_frequency = Math.log(records_size.to_f / @records.size)
      @records.each do |r_id, pos|
        out[r_id] = pos.size * inverse_frequency
      end
      out
    end

    protected

    # Returns true if +pos+ is stored for +record_id+. An unknown record has
    # no positions, so the answer is false rather than an error.
    def include_position?(record_id,pos)
      stored = @records[record_id]
      stored ? stored.include?(pos) : false
    end

  end
end
|
@@ -0,0 +1,325 @@
|
|
1
|
+
# ActsAsIndexed
|
2
|
+
# Copyright (c) 2007 - 2010 Douglas F Shearer.
|
3
|
+
# http://douglasfshearer.com
|
4
|
+
# Distributed under the MIT license as included with this plugin.
|
5
|
+
|
6
|
+
module ActsAsIndexed #:nodoc:
  # A partitioned index stored on the filesystem. Atoms (words) are grouped
  # into partition files named after the encoded first characters of the
  # atom; a separate 'size' file stores the number of indexed records.
  #
  # The index root is handled as an array of path segments throughout and is
  # joined with File.join wherever a filesystem path is needed.
  class SearchIndex

    # root:: Location of index on filesystem, as an array of path segments.
    # index_depth:: Degree of index partitioning.
    # fields:: Fields or instance methods of ActiveRecord model to be indexed.
    # min_word_size:: Smallest query term that will be run through search.
    def initialize(root, index_depth, fields, min_word_size)
      @root = root
      @fields = fields
      @index_depth = index_depth
      @atoms = {}
      @min_word_size = min_word_size
      @records_size = exists? ? load_record_size : 0
    end

    # Adds +record+ to the index.
    def add_record(record)
      condensed_record = condense_record(record)
      load_atoms(condensed_record)
      add_occurences(condensed_record, record.id)
      @records_size += 1
    end

    # Adds multiple records to the index. Accepts an array of +records+.
    def add_records(records)
      records.each { |record| add_record(record) }
    end

    # Removes +record+ from the index.
    def remove_record(record)
      atoms = condense_record(record)
      load_atoms(atoms)
      atoms.each do |a|
        @atoms[a].remove_record(record.id) if @atoms.has_key?(a)
      end
      # One record was removed, so the count drops by exactly one no matter
      # how many atoms the record contained.
      @records_size -= 1
    end

    # Replaces the indexed content of +record_old+ with that of +record_new+.
    # The id of +record_new+ is used for both the removal and the addition.
    def update_record(record_new, record_old)
      # Work out which atoms have modifications.
      # Minimises loading and saving of partitions.
      old_atoms = condense_record(record_old)
      new_atoms = condense_record(record_new)

      # Remove the old version from the appropriate atoms.
      load_atoms(old_atoms)
      old_atoms.each do |a|
        @atoms[a].remove_record(record_new.id) if @atoms.has_key?(a)
      end

      # Add the new version to the appropriate atoms.
      load_atoms(new_atoms)
      # TODO: Make a version of this method that takes the
      # atomised version of the record.
      add_occurences(new_atoms, record_new.id)
    end

    # Saves the current index partitions to the filesystem.
    def save
      prepare
      # Group the loaded atoms by the partition file they belong to.
      partitions = {}
      @atoms.each do |atom_name, records|
        e_p = encoded_prefix(atom_name)
        (partitions[e_p] ||= {})[atom_name] = records
      end
      partitions.each do |e_p, atoms|
        File.open(File.join(@root + [e_p.to_s]), 'w+') do |f|
          Marshal.dump(atoms, f)
        end
      end
      save_record_size
    end

    # Deletes the current model's index from the filesystem.
    #--
    # TODO: Write a public method that will delete all indexes.
    def destroy
      # @root is an array of path segments. FileUtils.rm_rf treats an array
      # as a list of separate paths, so the segments must be joined first;
      # otherwise every segment - including the first one, the application
      # root - would be removed on its own.
      FileUtils.rm_rf(File.join(@root))
      true
    end

    # Returns a hash of record id => weighting for records matching +query+.
    # Returns an empty array if +query+ is nil.
    def search(query)
      return [] if query.nil?
      load_atoms(cleanup_atoms(query))
      queries = parse_query(query.dup)
      positive = run_queries(queries[:positive])
      positive_quoted = run_quoted_queries(queries[:positive_quoted])
      negative = run_queries(queries[:negative])
      negative_quoted = run_quoted_queries(queries[:negative_quoted])

      if !queries[:positive].empty? && !queries[:positive_quoted].empty?
        # Both kinds of positive term were given: keep only the records
        # matched by both and add their weightings together.
        plain = positive.delete_if { |r_id, w| !positive_quoted.include?(r_id) }
        quoted = positive_quoted.delete_if { |r_id, w| !positive.include?(r_id) }
        results = plain.merge(quoted) { |r_id, old_val, new_val| old_val + new_val }
      elsif !queries[:positive].empty?
        results = positive
      else
        results = positive_quoted
      end

      # Drop every record matched by a negative term or phrase.
      results.delete_if do |r_id, w|
        negative.has_key?(r_id) || negative_quoted.has_key?(r_id)
      end
      results
    end

    # Returns true if the index size file exists on the FS.
    #--
    # TODO: Make a private method called 'root_exists?' which checks for the root directory.
    def exists?
      # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
      File.exist?(File.join(@root + ['size']))
    end

    private

    # Gets the size file from the index.
    def load_record_size
      File.open(File.join(@root + ['size'])) do |f|
        Marshal.load(f)
      end
    end

    # Saves the size to the size file.
    def save_record_size
      File.open(File.join(@root + ['size']), 'w+') do |f|
        Marshal.dump(@records_size, f)
      end
    end

    # Returns true if the given atom is present.
    def include_atom?(atom)
      @atoms.has_key?(atom)
    end

    # Returns true if all the given atoms are present.
    def include_atoms?(atoms_arr)
      atoms_arr.all? { |a| include_atom?(a) }
    end

    # Returns true if the given record is present in any loaded atom,
    # false otherwise.
    def include_record?(record_id)
      @atoms.any? { |atom_name, atom| atom.include_record?(record_id) }
    end

    # Creates an empty SearchAtom for +atom+ unless one is already loaded.
    def add_atom(atom)
      @atoms[atom] = SearchAtom.new if !include_atom?(atom)
    end

    # Records, for every atom in +condensed_record+, its position within the
    # record under +record_id+.
    def add_occurences(condensed_record, record_id)
      condensed_record.each_with_index do |atom, i|
        add_atom(atom)
        @atoms[atom].add_position(record_id, i)
      end
    end

    # Returns the partition name for +atom+: the character codes of its first
    # @index_depth characters joined with underscores. Results are cached per
    # prefix.
    def encoded_prefix(atom)
      prefix = atom[0, @index_depth]
      @prefix_cache ||= {}
      if !@prefix_cache.has_key?(prefix)
        if atom.length > 1
          @prefix_cache[prefix] = prefix.split(//).map { |c| encode_character(c) }.join('_')
        else
          @prefix_cache[prefix] = encode_character(atom)
        end
      end
      @prefix_cache[prefix]
    end

    # Allows compatibility with 1.8.6 which has no ord method.
    def encode_character(char)
      char.respond_to?(:ord) ? char.ord.to_s : char[0]
    end

    # Splits the query string +s+ into its four kinds of terms. +s+ is
    # consumed in the process, so callers pass a copy.
    #
    # Returns a hash with the keys :negative_quoted, :positive_quoted (arrays
    # of atom arrays) and :negative, :positive (arrays of atoms).
    #
    # A lone operator or an empty pair of quotes carries no atoms and is
    # ignored, so it can neither cancel other terms nor reach the quoted
    # query runner as an empty phrase.
    #--
    # NOTE(review): the operator patterns are not anchored to the start of a
    # word, so the "-known" in "well-known" is read as a negative term.
    def parse_query(s)

      # Find -"foo bar".
      negative_quoted = []
      while neg_quoted = s.slice!(/-\"[^\"]*\"/)
        phrase = cleanup_atoms(neg_quoted)
        negative_quoted << phrase if !phrase.empty?
      end

      # Find "foo bar".
      positive_quoted = []
      while pos_quoted = s.slice!(/\"[^\"]*\"/)
        phrase = cleanup_atoms(pos_quoted)
        positive_quoted << phrase if !phrase.empty?
      end

      # Find -foo.
      negative = []
      while neg = s.slice!(/-[\S]*/)
        atom = cleanup_atoms(neg).first
        negative << atom if atom
      end

      # Find +foo
      positive = []
      while pos = s.slice!(/\+[\S]*/)
        atom = cleanup_atoms(pos).first
        positive << atom if atom
      end

      # Find all other terms.
      positive += cleanup_atoms(s, true)

      {:negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :negative => negative, :positive => positive}
    end

    # Returns a hash of record id => summed weighting for the records
    # matching the given +atoms+. Atoms missing from the index contribute no
    # records.
    def run_queries(atoms)
      results = {}
      atoms.uniq.each do |atom|
        # Collect all the weightings for the current atom.
        interim_results = include_atom?(atom) ? @atoms[atom].weightings(@records_size) : {}
        results = combine_weightings(results, interim_results)
      end
      results
    end

    # Returns a hash of record id => summed weighting for the records
    # matching the given phrases. Each element of +quoted_atoms+ is an array
    # of atoms that must appear consecutively. Phrases that are empty or
    # contain an atom missing from the index are skipped.
    def run_quoted_queries(quoted_atoms)
      results = {}
      quoted_atoms.each do |quoted_atom|
        next if quoted_atom.empty? || !include_atoms?(quoted_atom)

        # Start from the first word, then narrow down to the positions where
        # each following word is directly preceded by the match so far.
        matches = @atoms[quoted_atom.first]
        quoted_atom[1..-1].each do |atom_name|
          matches = @atoms[atom_name].preceded_by(matches)
        end

        results = combine_weightings(results, matches.weightings(@records_size))
      end
      results
    end

    # Merges +interim_results+ into the running +results+. While +results+ is
    # empty the interim results are taken as they are; afterwards only
    # records present in both are kept, with their weightings added.
    #--
    # NOTE(review): an empty +results+ is treated as "nothing collected yet",
    # so a term without matches followed by a term with matches yields the
    # latter's records, while the reverse order yields none. Confirm whether
    # this order dependence is intended before changing it.
    def combine_weightings(results, interim_results)
      return interim_results if results.empty?

      combined = {}
      interim_results.each do |r_id, weight|
        combined[r_id] = weight + results[r_id] if results[r_id]
      end
      combined
    end

    # Loads from disk the partitions holding +atoms+ and merges them into the
    # atoms held in memory.
    def load_atoms(atoms)
      # Remove duplicate atoms.
      # Remove atoms already in index.
      # Calculate prefixes.
      # Remove duplicate prefixes.
      wanted = atoms.uniq.reject { |a| include_atom?(a) }
      wanted.collect { |a| encoded_prefix(a) }.uniq.each do |name|
        path = File.join(@root + [name.to_s])
        next if !File.exist?(path)

        File.open(path) do |f|
          @atoms.merge!(Marshal.load(f))
        end
      end
    end

    # Makes the index directory, e.g. RAILS_ROOT/index/ENVIRONMENT/CLASS,
    # together with any missing parent directories, however many segments
    # the configured root has.
    def prepare
      FileUtils.mkdir_p(File.join(@root))
    end

    # Splits +s+ into lower-case atoms, treating every non-word character as
    # a separator. When +limit_size+ is true, atoms shorter than +min_size+
    # are dropped.
    def cleanup_atoms(s, limit_size=false, min_size = @min_word_size || 3)
      atoms = s.downcase.gsub(/\W/,' ').squeeze(' ').split
      return atoms if !limit_size
      atoms.reject { |w| w.size < min_size }
    end

    # Returns the atoms of all indexed fields of +record+, in field order.
    # Fields whose value is nil or false are skipped.
    def condense_record(record)
      record_condensed = ''
      @fields.each do |f|
        value = record.send(f)
        record_condensed += ' ' + value.to_s if value
      end
      cleanup_atoms(record_condensed)
    end

  end
end
|