acts_as_indexed 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/CHANGELOG +90 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +137 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/acts_as_indexed.gemspec +67 -0
- data/lib/acts_as_indexed.rb +248 -0
- data/lib/acts_as_indexed/configuration.rb +41 -0
- data/lib/acts_as_indexed/search_atom.rb +104 -0
- data/lib/acts_as_indexed/search_index.rb +325 -0
- data/lib/will_paginate_search.rb +29 -0
- data/rails/init.rb +2 -0
- data/test/abstract_unit.rb +52 -0
- data/test/acts_as_indexed_test.rb +133 -0
- data/test/configuration_test.rb +57 -0
- data/test/database.yml +10 -0
- data/test/fixtures/post.rb +5 -0
- data/test/fixtures/posts.yml +31 -0
- data/test/schema.rb +6 -0
- data/test/search_atom_test.rb +98 -0
- data/test/search_index_test.rb +50 -0
- metadata +94 -0
@@ -0,0 +1,41 @@
|
|
1
|
+
# ActsAsIndexed
|
2
|
+
# Copyright (c) 2007 - 2010 Douglas F Shearer.
|
3
|
+
# http://douglasfshearer.com
|
4
|
+
# Distributed under the MIT license as included with this plugin.
|
5
|
+
|
6
|
+
module ActsAsIndexed
  # Used to set up and modify settings for acts_as_indexed.
  class Configuration

    # Sets the location for the index. Specify as an array. Heroku, for
    # example would use RAILS_ROOT/tmp/index, which would be set as
    # [Rails.root,'tmp','index']
    attr_accessor :index_file

    # Tuning value for the index partitioning. Larger values result in quicker
    # searches, but slower indexing. Default is 3.
    attr_reader :index_file_depth

    # Sets the minimum length for a word in a query. Words shorter than this
    # value are ignored in searches unless preceded by the '+' operator.
    # Default is 3.
    attr_reader :min_word_size

    # Builds a configuration holding the defaults: the index lives in
    # Rails.root/index, partition depth is 3 and minimum word size is 3.
    def initialize
      @index_file = [Rails.root, 'index']
      @index_file_depth = 3
      @min_word_size = 3
    end

    # Sets the index partitioning depth.
    #
    # Raises ArgumentError if +val+ is not a number or is less than one.
    def index_file_depth=(val)
      ensure_at_least_one!('index_file_depth', val)
      @index_file_depth = val
    end

    # Sets the minimum query word length.
    #
    # Raises ArgumentError if +val+ is not a number or is less than one.
    def min_word_size=(val)
      ensure_at_least_one!('min_word_size', val)
      @min_word_size = val
    end

    private

    # Shared validation for the numeric settings. Non-numeric values (nil in
    # particular) are rejected with ArgumentError rather than being allowed to
    # blow up with NoMethodError inside the comparison.
    def ensure_at_least_one!(setting, val)
      unless val.is_a?(Numeric)
        raise(ArgumentError, "#{setting} must be a number, got #{val.inspect}")
      end
      raise(ArgumentError, "#{setting} cannot be less than one (1)") if val < 1
    end

  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# ActsAsIndexed
|
2
|
+
# Copyright (c) 2007 - 2010 Douglas F Shearer.
|
3
|
+
# http://douglasfshearer.com
|
4
|
+
# Distributed under the MIT license as included with this plugin.
|
5
|
+
|
6
|
+
module ActsAsIndexed #:nodoc:
  # Holds, for a single word (atom), the records that contain it and the
  # positions at which it occurs in each of them.
  class SearchAtom

    # Contains a hash of records.
    # { 'record_id' => [pos1, pos2, pos] }
    #--
    # Weighting:
    # http://www.perlmonks.com/index.pl?node_id=27509
    # W(T, D) = tf(T, D) * log ( DN / df(T))
    # weighting = frequency_in_this_record * log (total_number_of_records / number_of_matching_records)

    def initialize
      @records = {}
    end

    # Returns true if the given record is present.
    def include_record?(record_id)
      @records.include?(record_id)
    end

    # Adds +record_id+ to the stored records. Does nothing if it is already
    # present, so existing positions are never discarded.
    def add_record(record_id)
      @records[record_id] = [] unless include_record?(record_id)
    end

    # Adds +pos+ to the array of positions for +record_id+, registering the
    # record first if needed.
    def add_position(record_id, pos)
      add_record(record_id)
      @records[record_id] << pos
    end

    # Returns all record IDs stored in this Atom.
    def record_ids
      @records.keys
    end

    # Returns an array of positions for +record_id+ stored in this Atom, or
    # nil if the record is not present.
    def positions(record_id)
      @records[record_id]
    end

    # Removes +record_id+ from this Atom.
    def remove_record(record_id)
      @records.delete(record_id)
    end

    # Returns an atom containing the records and positions of +self+ preceded by +former+
    # "former latter" or "big dog" where "big" is the former and "dog" is the latter.
    def preceded_by(former)
      matches = SearchAtom.new
      former.record_ids.each do |record_id|
        latter_positions = @records[record_id]
        # Only records present in both atoms can contain the phrase.
        next unless latter_positions

        latter_positions.each do |pos|
          # Keep this position if +former+ occurs immediately before it.
          matches.add_position(record_id, pos) if former.include_position?(record_id, pos - 1)
        end
      end
      matches
    end

    # Returns a hash of record_ids and weightings for each record in the
    # atom.
    #
    # records_size:: Total number of records in the index.
    def weightings(records_size)
      out = {}

      # Fixes a bug when the records_size is zero. i.e. The only record
      # containing the word has been deleted.
      if records_size < 1
        @records.each_key { |r_id| out[r_id] = 0.0 }
        return out
      end

      # weighting = frequency * log (records.size / records_with_atom)
      ## parndt 2010/05/03 changed to records_size.to_f to avoid -Infinity Errno::ERANGE exceptions
      ## which would happen for example Math.log(1 / 20) == -Infinity but Math.log(1.0 / 20) == -2.99573227355399
      # The log term is the same for every record, so compute it once.
      inverse_frequency = Math.log(records_size.to_f / @records.size) unless @records.empty?
      @records.each do |r_id, pos|
        out[r_id] = pos.size * inverse_frequency
      end
      out
    end

    protected

    # Returns true if +record_id+ is present and contains this atom at
    # position +pos+. Unknown records yield false rather than raising.
    def include_position?(record_id, pos)
      stored = @records[record_id]
      !stored.nil? && stored.include?(pos)
    end

  end
end
|
@@ -0,0 +1,325 @@
|
|
1
|
+
# ActsAsIndexed
|
2
|
+
# Copyright (c) 2007 - 2010 Douglas F Shearer.
|
3
|
+
# http://douglasfshearer.com
|
4
|
+
# Distributed under the MIT license as included with this plugin.
|
5
|
+
|
6
|
+
module ActsAsIndexed #:nodoc:
|
7
|
+
class SearchIndex
|
8
|
+
|
9
|
+
# root:: Location of index on filesystem.
|
10
|
+
# index_depth:: Degree of index partitioning.
|
11
|
+
# fields:: Fields or instance methods of ActiveRecord model to be indexed.
|
12
|
+
# min_word_size:: Smallest query term that will be run through search.
|
13
|
+
def initialize(root, index_depth, fields, min_word_size)
|
14
|
+
@root = root
|
15
|
+
@fields = fields
|
16
|
+
@index_depth = index_depth
|
17
|
+
@atoms = {}
|
18
|
+
@min_word_size = min_word_size
|
19
|
+
@records_size = exists? ? load_record_size : 0
|
20
|
+
end
|
21
|
+
|
22
|
+
# Adds +record+ to the index.
|
23
|
+
def add_record(record)
|
24
|
+
condensed_record = condense_record(record)
|
25
|
+
load_atoms(condensed_record)
|
26
|
+
add_occurences(condensed_record,record.id)
|
27
|
+
@records_size += 1
|
28
|
+
end
|
29
|
+
|
30
|
+
# Adds multiple records to the index. Accepts an array of +records+.
|
31
|
+
def add_records(records)
|
32
|
+
records.each do |record|
|
33
|
+
add_record(record)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# Removes +record+ from the index.
|
38
|
+
def remove_record(record)
|
39
|
+
atoms = condense_record(record)
|
40
|
+
load_atoms(atoms)
|
41
|
+
atoms.each do |a|
|
42
|
+
@atoms[a].remove_record(record.id) if @atoms.has_key?(a)
|
43
|
+
@records_size -= 1
|
44
|
+
#p "removing #{record.id} from #{a}"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def update_record(record_new, record_old)
|
49
|
+
# Work out which atoms have modifications.
|
50
|
+
# Minimises loading and saving of partitions.
|
51
|
+
old_atoms = condense_record(record_old)
|
52
|
+
new_atoms = condense_record(record_new)
|
53
|
+
|
54
|
+
# Remove the old version from the appropriate atoms.
|
55
|
+
load_atoms(old_atoms)
|
56
|
+
old_atoms.each do |a|
|
57
|
+
@atoms[a].remove_record(record_new.id) if @atoms.has_key?(a)
|
58
|
+
end
|
59
|
+
|
60
|
+
# Add the new version to the appropriate atoms.
|
61
|
+
load_atoms(new_atoms)
|
62
|
+
# TODO: Make a version of this method that takes the
|
63
|
+
# atomised version of the record.
|
64
|
+
add_occurences(new_atoms, record_new.id)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Saves the current index partitions to the filesystem.
|
68
|
+
def save
|
69
|
+
prepare
|
70
|
+
atoms_sorted = {}
|
71
|
+
@atoms.each do |atom_name, records|
|
72
|
+
e_p = encoded_prefix(atom_name)
|
73
|
+
atoms_sorted[e_p] = {} if !atoms_sorted.has_key?(e_p)
|
74
|
+
atoms_sorted[e_p][atom_name] = records
|
75
|
+
end
|
76
|
+
atoms_sorted.each do |e_p, atoms|
|
77
|
+
#p "Saving #{e_p}."
|
78
|
+
File.open(File.join(@root + [e_p.to_s]),'w+') do |f|
|
79
|
+
Marshal.dump(atoms,f)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
save_record_size
|
83
|
+
end
|
84
|
+
|
85
|
+
# Deletes the current model's index from the filesystem.
|
86
|
+
#--
|
87
|
+
# TODO: Write a public method that will delete all indexes.
|
88
|
+
def destroy
|
89
|
+
FileUtils.rm_rf(@root)
|
90
|
+
true
|
91
|
+
end
|
92
|
+
|
93
|
+
# Returns an array of IDs for records matching +query+.
|
94
|
+
def search(query)
|
95
|
+
return [] if query.nil?
|
96
|
+
load_atoms(cleanup_atoms(query))
|
97
|
+
queries = parse_query(query.dup)
|
98
|
+
positive = run_queries(queries[:positive])
|
99
|
+
positive_quoted = run_quoted_queries(queries[:positive_quoted])
|
100
|
+
negative = run_queries(queries[:negative])
|
101
|
+
negative_quoted = run_quoted_queries(queries[:negative_quoted])
|
102
|
+
|
103
|
+
if !queries[:positive].empty? && !queries[:positive_quoted].empty?
|
104
|
+
p = positive.delete_if{ |r_id,w| !positive_quoted.include?(r_id) }
|
105
|
+
pq = positive_quoted.delete_if{ |r_id,w| !positive.include?(r_id) }
|
106
|
+
results = p.merge(pq) { |r_id,old_val,new_val| old_val + new_val}
|
107
|
+
elsif !queries[:positive].empty?
|
108
|
+
results = positive
|
109
|
+
else
|
110
|
+
results = positive_quoted
|
111
|
+
end
|
112
|
+
|
113
|
+
negative_results = (negative.keys + negative_quoted.keys)
|
114
|
+
results.delete_if { |r_id, w| negative_results.include?(r_id) }
|
115
|
+
#p results
|
116
|
+
results
|
117
|
+
end
|
118
|
+
|
119
|
+
# Returns true if the index root exists on the FS.
|
120
|
+
#--
|
121
|
+
# TODO: Make a private method called 'root_exists?' which checks for the root directory.
|
122
|
+
def exists?
|
123
|
+
File.exists?(File.join(@root + ['size']))
|
124
|
+
end
|
125
|
+
|
126
|
+
private
|
127
|
+
|
128
|
+
# Gets the size file from the index.
|
129
|
+
def load_record_size
|
130
|
+
File.open(File.join(@root + ['size'])) do |f|
|
131
|
+
(Marshal.load(f))
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
# Saves the size to the size file.
|
136
|
+
def save_record_size
|
137
|
+
File.open(File.join(@root + ['size']),'w+') do |f|
|
138
|
+
Marshal.dump(@records_size,f)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
# Returns true if the given atom is present.
|
143
|
+
def include_atom?(atom)
|
144
|
+
@atoms.has_key?(atom)
|
145
|
+
end
|
146
|
+
|
147
|
+
# Returns true if all the given atoms are present.
|
148
|
+
def include_atoms?(atoms_arr)
|
149
|
+
atoms_arr.each do |a|
|
150
|
+
return false if !include_atom?(a)
|
151
|
+
end
|
152
|
+
true
|
153
|
+
end
|
154
|
+
|
155
|
+
# Returns true if the given record is present.
|
156
|
+
def include_record?(record_id)
|
157
|
+
@atoms.each do |atomname, atom|
|
158
|
+
return true if atom.include_record?(record_id)
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
def add_atom(atom)
|
163
|
+
@atoms[atom] = SearchAtom.new if !include_atom?(atom)
|
164
|
+
end
|
165
|
+
|
166
|
+
def add_occurences(condensed_record,record_id)
|
167
|
+
condensed_record.each_with_index do |atom, i|
|
168
|
+
add_atom(atom)
|
169
|
+
@atoms[atom].add_position(record_id, i)
|
170
|
+
#p "adding #{record.id} to #{atom}"
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
def encoded_prefix(atom)
|
175
|
+
prefix = atom[0,@index_depth]
|
176
|
+
if !@prefix_cache || !@prefix_cache.has_key?(prefix)
|
177
|
+
@prefix_cache = {} if !@prefix_cache
|
178
|
+
len = atom.length
|
179
|
+
if len > 1
|
180
|
+
@prefix_cache[prefix] = prefix.split(//).map{|c| encode_character(c)}.join('_')
|
181
|
+
else
|
182
|
+
@prefix_cache[prefix] = encode_character(atom)
|
183
|
+
end
|
184
|
+
end
|
185
|
+
@prefix_cache[prefix]
|
186
|
+
end
|
187
|
+
|
188
|
+
# Allows compatibility with 1.8.6 which has no ord method.
|
189
|
+
def encode_character(char)
|
190
|
+
if @@has_ord ||= char.respond_to?(:ord)
|
191
|
+
char.ord.to_s
|
192
|
+
else
|
193
|
+
char[0]
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
def parse_query(s)
|
198
|
+
|
199
|
+
# Find -"foo bar".
|
200
|
+
negative_quoted = []
|
201
|
+
while neg_quoted = s.slice!(/-\"[^\"]*\"/)
|
202
|
+
negative_quoted << cleanup_atoms(neg_quoted)
|
203
|
+
end
|
204
|
+
|
205
|
+
# Find "foo bar".
|
206
|
+
positive_quoted = []
|
207
|
+
while pos_quoted = s.slice!(/\"[^\"]*\"/)
|
208
|
+
positive_quoted << cleanup_atoms(pos_quoted)
|
209
|
+
end
|
210
|
+
|
211
|
+
# Find -foo.
|
212
|
+
negative = []
|
213
|
+
while neg = s.slice!(/-[\S]*/)
|
214
|
+
negative << cleanup_atoms(neg).first
|
215
|
+
end
|
216
|
+
|
217
|
+
# Find +foo
|
218
|
+
positive = []
|
219
|
+
while pos = s.slice!(/\+[\S]*/)
|
220
|
+
positive << cleanup_atoms(pos).first
|
221
|
+
end
|
222
|
+
|
223
|
+
# Find all other terms.
|
224
|
+
positive += cleanup_atoms(s,true)
|
225
|
+
|
226
|
+
{:negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :negative => negative, :positive => positive}
|
227
|
+
end
|
228
|
+
|
229
|
+
def run_queries(atoms)
|
230
|
+
results = {}
|
231
|
+
atoms.uniq.each do |atom|
|
232
|
+
interim_results = {}
|
233
|
+
if include_atom?(atom)
|
234
|
+
# Collect all the weightings for the current atom.
|
235
|
+
interim_results = @atoms[atom].weightings(@records_size)
|
236
|
+
end
|
237
|
+
if results.empty?
|
238
|
+
# If first time round, set results with initial weightings.
|
239
|
+
results = interim_results
|
240
|
+
else
|
241
|
+
# If second time round, add weightings together for records
|
242
|
+
# matching both atoms. Any matching only one are discarded.
|
243
|
+
rr = {}
|
244
|
+
interim_results.each do |r,w|
|
245
|
+
rr[r] = w + results[r] if results[r]
|
246
|
+
end
|
247
|
+
results = rr
|
248
|
+
end
|
249
|
+
end
|
250
|
+
#p results
|
251
|
+
results
|
252
|
+
end
|
253
|
+
|
254
|
+
def run_quoted_queries(quoted_atoms)
|
255
|
+
results = {}
|
256
|
+
quoted_atoms.each do |quoted_atom|
|
257
|
+
interim_results = {}
|
258
|
+
# Check the index contains all the required atoms.
|
259
|
+
# match_atom = first_word_atom
|
260
|
+
# for each of the others
|
261
|
+
# return atom containing records + positions where current atom is preceded by following atom.
|
262
|
+
# end
|
263
|
+
# return records from final atom.
|
264
|
+
next if !include_atoms?(quoted_atom)
|
265
|
+
matches = @atoms[quoted_atom.first]
|
266
|
+
quoted_atom[1..-1].each do |atom_name|
|
267
|
+
matches = @atoms[atom_name].preceded_by(matches)
|
268
|
+
end
|
269
|
+
#results += matches.record_ids
|
270
|
+
|
271
|
+
interim_results = matches.weightings(@records_size)
|
272
|
+
if results.empty?
|
273
|
+
results = interim_results
|
274
|
+
else
|
275
|
+
rr = {}
|
276
|
+
interim_results.each do |r,w|
|
277
|
+
rr[r] = w + results[r] if results[r]
|
278
|
+
end
|
279
|
+
#p results.class
|
280
|
+
results = rr
|
281
|
+
end
|
282
|
+
|
283
|
+
end
|
284
|
+
results
|
285
|
+
end
|
286
|
+
|
287
|
+
def load_atoms(atoms)
|
288
|
+
# Remove duplicate atoms.
|
289
|
+
# Remove atoms already in index.
|
290
|
+
# Calculate prefixes.
|
291
|
+
# Remove duplicate prefixes.
|
292
|
+
atoms.uniq.reject{|a| include_atom?(a)}.collect{|a| encoded_prefix(a)}.uniq.each do |name|
|
293
|
+
if File.exists?(File.join(@root + [name.to_s]))
|
294
|
+
File.open(File.join(@root + [name.to_s])) do |f|
|
295
|
+
@atoms.merge!(Marshal.load(f))
|
296
|
+
end
|
297
|
+
end
|
298
|
+
end
|
299
|
+
end
|
300
|
+
|
301
|
+
def prepare
|
302
|
+
# Makes the RAILS_ROOT/index directory
|
303
|
+
Dir.mkdir(File.join(@root[0,2])) if !File.exists?(File.join(@root[0,2]))
|
304
|
+
# Makes the RAILS_ROOT/index/ENVIRONMENT directory
|
305
|
+
Dir.mkdir(File.join(@root[0,3])) if !File.exists?(File.join(@root[0,3]))
|
306
|
+
# Makes the RAILS_ROOT/index/ENVIRONMENT/CLASS directory
|
307
|
+
Dir.mkdir(File.join(@root)) if !File.exists?(File.join(@root))
|
308
|
+
end
|
309
|
+
|
310
|
+
def cleanup_atoms(s, limit_size=false, min_size = @min_word_size || 3)
|
311
|
+
atoms = s.downcase.gsub(/\W/,' ').squeeze(' ').split
|
312
|
+
return atoms if !limit_size
|
313
|
+
atoms.reject{|w| w.size < min_size}
|
314
|
+
end
|
315
|
+
|
316
|
+
def condense_record(record)
|
317
|
+
record_condensed = ''
|
318
|
+
@fields.each do |f|
|
319
|
+
record_condensed += ' ' + record.send(f).to_s if record.send(f)
|
320
|
+
end
|
321
|
+
cleanup_atoms(record_condensed)
|
322
|
+
end
|
323
|
+
|
324
|
+
end
|
325
|
+
end
|