acts_as_indexed 0.6.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,41 @@
1
+ # ActsAsIndexed
2
+ # Copyright (c) 2007 - 2010 Douglas F Shearer.
3
+ # http://douglasfshearer.com
4
+ # Distributed under the MIT license as included with this plugin.
5
+
6
module ActsAsIndexed
  # Used to set up and modify settings for acts_as_indexed.
  class Configuration

    # Sets the location for the index. Specify as an array. Heroku, for
    # example would use RAILS_ROOT/tmp/index, which would be set as
    # [Rails.root,'tmp','index']
    attr_accessor :index_file

    # Tuning value for the index partitioning. Larger values result in quicker
    # searches, but slower indexing. Default is 3.
    attr_reader :index_file_depth

    # Sets the minimum length for a word in a query. Words shorter than this
    # value are ignored in searches unless preceded by the '+' operator.
    # Default is 3.
    attr_reader :min_word_size

    # Builds a configuration holding the defaults: an index located at
    # [Rails.root, 'index'], a partitioning depth of 3 and a minimum word
    # size of 3.
    def initialize
      @index_file = [Rails.root, 'index']
      @index_file_depth = 3
      @min_word_size = 3
    end

    # Sets the index partitioning depth.
    # Raises ArgumentError if +val+ is not a number of at least one.
    def index_file_depth=(val)
      @index_file_depth = checked_minimum(val, 'index_file_depth')
    end

    # Sets the minimum query word size.
    # Raises ArgumentError if +val+ is not a number of at least one.
    def min_word_size=(val)
      @min_word_size = checked_minimum(val, 'min_word_size')
    end

    private

    # Returns +val+ unchanged when it is a number of at least one, otherwise
    # raises ArgumentError naming +setting+. Non-numeric input (nil, strings)
    # is rejected here so that callers always see ArgumentError rather than a
    # NoMethodError leaking out of the comparison.
    def checked_minimum(val, setting)
      unless val.is_a?(Numeric) && val >= 1
        raise ArgumentError, "#{setting} cannot be less than one (1)"
      end
      val
    end

  end
end
@@ -0,0 +1,104 @@
1
+ # ActsAsIndexed
2
+ # Copyright (c) 2007 - 2010 Douglas F Shearer.
3
+ # http://douglasfshearer.com
4
+ # Distributed under the MIT license as included with this plugin.
5
+
6
module ActsAsIndexed #:nodoc:
  # A single indexed term. Tracks which records contain the term and at
  # which word positions.
  class SearchAtom

    # Contains a hash of records.
    # { 'record_id' => [pos1, pos2, pos] }
    #--
    # Weighting:
    # http://www.perlmonks.com/index.pl?node_id=27509
    # W(T, D) = tf(T, D) * log ( DN / df(T))
    # weighting = frequency_in_this_record * log (total_number_of_records / number_of_matching_records)

    def initialize
      @records = {}
    end

    # Returns true if the given record is present.
    def include_record?(record_id)
      @records.include?(record_id)
    end

    # Adds +record_id+ to the stored records. A record that is already
    # present keeps its existing positions.
    def add_record(record_id)
      @records[record_id] = [] unless include_record?(record_id)
    end

    # Adds +pos+ to the array of positions for +record_id+, registering the
    # record first if necessary.
    def add_position(record_id, pos)
      add_record(record_id)
      @records[record_id] << pos
    end

    # Returns all record IDs stored in this Atom.
    def record_ids
      @records.keys
    end

    # Returns an array of positions for +record_id+ stored in this Atom, or
    # nil when the record is not present.
    def positions(record_id)
      @records[record_id]
    end

    # Removes +record_id+ from this Atom.
    def remove_record(record_id)
      @records.delete(record_id)
    end

    # Returns an atom containing the records and positions of +self+ preceded by +former+
    # "former latter" or "big dog" where "big" is the former and "dog" is the latter.
    def preceded_by(former)
      matches = SearchAtom.new
      former.record_ids.each do |record_id|
        # Only records present in both atoms can contain the phrase.
        latter_positions = @records[record_id]
        next unless latter_positions

        latter_positions.each do |pos|
          # Keep the position when the previous word is the former atom.
          # add_position registers the record on first use.
          matches.add_position(record_id, pos) if former.include_position?(record_id, pos - 1)
        end
      end
      matches
    end

    # Returns a hash of record_ids and weightings for each record in the
    # atom. +records_size+ is the total number of indexed records.
    def weightings(records_size)
      out = {}
      @records.each do |r_id, pos|
        out[r_id] =
          if records_size < 1
            # Guards against a zero records_size, i.e. the only record
            # containing the word has been deleted.
            0.0
          else
            # weighting = frequency * log (records.size / records_with_atom)
            # to_f keeps the division from truncating to zero, which would
            # make Math.log return -Infinity.
            pos.size * Math.log(records_size.to_f / @records.size)
          end
      end
      out
    end

    protected

    # Returns true if +record_id+ contains this atom at position +pos+.
    # Unknown records yield false rather than raising.
    def include_position?(record_id, pos)
      record_positions = @records[record_id]
      !record_positions.nil? && record_positions.include?(pos)
    end

  end
end
@@ -0,0 +1,325 @@
1
+ # ActsAsIndexed
2
+ # Copyright (c) 2007 - 2010 Douglas F Shearer.
3
+ # http://douglasfshearer.com
4
+ # Distributed under the MIT license as included with this plugin.
5
+
6
+ module ActsAsIndexed #:nodoc:
7
+ class SearchIndex
8
+
9
+ # root:: Location of index on filesystem.
10
+ # index_depth:: Degree of index partitioning.
11
+ # fields:: Fields or instance methods of ActiveRecord model to be indexed.
12
+ # min_word_size:: Smallest query term that will be run through search.
13
+ def initialize(root, index_depth, fields, min_word_size)
14
+ @root = root
15
+ @fields = fields
16
+ @index_depth = index_depth
17
+ @atoms = {}
18
+ @min_word_size = min_word_size
19
+ @records_size = exists? ? load_record_size : 0
20
+ end
21
+
22
+ # Adds +record+ to the index.
23
+ def add_record(record)
24
+ condensed_record = condense_record(record)
25
+ load_atoms(condensed_record)
26
+ add_occurences(condensed_record,record.id)
27
+ @records_size += 1
28
+ end
29
+
30
+ # Adds multiple records to the index. Accepts an array of +records+.
31
+ def add_records(records)
32
+ records.each do |record|
33
+ add_record(record)
34
+ end
35
+ end
36
+
37
+ # Removes +record+ from the index.
38
+ def remove_record(record)
39
+ atoms = condense_record(record)
40
+ load_atoms(atoms)
41
+ atoms.each do |a|
42
+ @atoms[a].remove_record(record.id) if @atoms.has_key?(a)
43
+ @records_size -= 1
44
+ #p "removing #{record.id} from #{a}"
45
+ end
46
+ end
47
+
48
+ def update_record(record_new, record_old)
49
+ # Work out which atoms have modifications.
50
+ # Minimises loading and saving of partitions.
51
+ old_atoms = condense_record(record_old)
52
+ new_atoms = condense_record(record_new)
53
+
54
+ # Remove the old version from the appropriate atoms.
55
+ load_atoms(old_atoms)
56
+ old_atoms.each do |a|
57
+ @atoms[a].remove_record(record_new.id) if @atoms.has_key?(a)
58
+ end
59
+
60
+ # Add the new version to the appropriate atoms.
61
+ load_atoms(new_atoms)
62
+ # TODO: Make a version of this method that takes the
63
+ # atomised version of the record.
64
+ add_occurences(new_atoms, record_new.id)
65
+ end
66
+
67
+ # Saves the current index partitions to the filesystem.
68
+ def save
69
+ prepare
70
+ atoms_sorted = {}
71
+ @atoms.each do |atom_name, records|
72
+ e_p = encoded_prefix(atom_name)
73
+ atoms_sorted[e_p] = {} if !atoms_sorted.has_key?(e_p)
74
+ atoms_sorted[e_p][atom_name] = records
75
+ end
76
+ atoms_sorted.each do |e_p, atoms|
77
+ #p "Saving #{e_p}."
78
+ File.open(File.join(@root + [e_p.to_s]),'w+') do |f|
79
+ Marshal.dump(atoms,f)
80
+ end
81
+ end
82
+ save_record_size
83
+ end
84
+
85
+ # Deletes the current model's index from the filesystem.
86
+ #--
87
+ # TODO: Write a public method that will delete all indexes.
88
+ def destroy
89
+ FileUtils.rm_rf(@root)
90
+ true
91
+ end
92
+
93
+ # Returns an array of IDs for records matching +query+.
94
+ def search(query)
95
+ return [] if query.nil?
96
+ load_atoms(cleanup_atoms(query))
97
+ queries = parse_query(query.dup)
98
+ positive = run_queries(queries[:positive])
99
+ positive_quoted = run_quoted_queries(queries[:positive_quoted])
100
+ negative = run_queries(queries[:negative])
101
+ negative_quoted = run_quoted_queries(queries[:negative_quoted])
102
+
103
+ if !queries[:positive].empty? && !queries[:positive_quoted].empty?
104
+ p = positive.delete_if{ |r_id,w| !positive_quoted.include?(r_id) }
105
+ pq = positive_quoted.delete_if{ |r_id,w| !positive.include?(r_id) }
106
+ results = p.merge(pq) { |r_id,old_val,new_val| old_val + new_val}
107
+ elsif !queries[:positive].empty?
108
+ results = positive
109
+ else
110
+ results = positive_quoted
111
+ end
112
+
113
+ negative_results = (negative.keys + negative_quoted.keys)
114
+ results.delete_if { |r_id, w| negative_results.include?(r_id) }
115
+ #p results
116
+ results
117
+ end
118
+
119
+ # Returns true if the index root exists on the FS.
120
+ #--
121
+ # TODO: Make a private method called 'root_exists?' which checks for the root directory.
122
+ def exists?
123
+ File.exists?(File.join(@root + ['size']))
124
+ end
125
+
126
+ private
127
+
128
+ # Gets the size file from the index.
129
+ def load_record_size
130
+ File.open(File.join(@root + ['size'])) do |f|
131
+ (Marshal.load(f))
132
+ end
133
+ end
134
+
135
+ # Saves the size to the size file.
136
+ def save_record_size
137
+ File.open(File.join(@root + ['size']),'w+') do |f|
138
+ Marshal.dump(@records_size,f)
139
+ end
140
+ end
141
+
142
+ # Returns true if the given atom is present.
143
+ def include_atom?(atom)
144
+ @atoms.has_key?(atom)
145
+ end
146
+
147
+ # Returns true if all the given atoms are present.
148
+ def include_atoms?(atoms_arr)
149
+ atoms_arr.each do |a|
150
+ return false if !include_atom?(a)
151
+ end
152
+ true
153
+ end
154
+
155
+ # Returns true if the given record is present.
156
+ def include_record?(record_id)
157
+ @atoms.each do |atomname, atom|
158
+ return true if atom.include_record?(record_id)
159
+ end
160
+ end
161
+
162
+ def add_atom(atom)
163
+ @atoms[atom] = SearchAtom.new if !include_atom?(atom)
164
+ end
165
+
166
+ def add_occurences(condensed_record,record_id)
167
+ condensed_record.each_with_index do |atom, i|
168
+ add_atom(atom)
169
+ @atoms[atom].add_position(record_id, i)
170
+ #p "adding #{record.id} to #{atom}"
171
+ end
172
+ end
173
+
174
+ def encoded_prefix(atom)
175
+ prefix = atom[0,@index_depth]
176
+ if !@prefix_cache || !@prefix_cache.has_key?(prefix)
177
+ @prefix_cache = {} if !@prefix_cache
178
+ len = atom.length
179
+ if len > 1
180
+ @prefix_cache[prefix] = prefix.split(//).map{|c| encode_character(c)}.join('_')
181
+ else
182
+ @prefix_cache[prefix] = encode_character(atom)
183
+ end
184
+ end
185
+ @prefix_cache[prefix]
186
+ end
187
+
188
+ # Allows compatibility with 1.8.6 which has no ord method.
189
+ def encode_character(char)
190
+ if @@has_ord ||= char.respond_to?(:ord)
191
+ char.ord.to_s
192
+ else
193
+ char[0]
194
+ end
195
+ end
196
+
197
+ def parse_query(s)
198
+
199
+ # Find -"foo bar".
200
+ negative_quoted = []
201
+ while neg_quoted = s.slice!(/-\"[^\"]*\"/)
202
+ negative_quoted << cleanup_atoms(neg_quoted)
203
+ end
204
+
205
+ # Find "foo bar".
206
+ positive_quoted = []
207
+ while pos_quoted = s.slice!(/\"[^\"]*\"/)
208
+ positive_quoted << cleanup_atoms(pos_quoted)
209
+ end
210
+
211
+ # Find -foo.
212
+ negative = []
213
+ while neg = s.slice!(/-[\S]*/)
214
+ negative << cleanup_atoms(neg).first
215
+ end
216
+
217
+ # Find +foo
218
+ positive = []
219
+ while pos = s.slice!(/\+[\S]*/)
220
+ positive << cleanup_atoms(pos).first
221
+ end
222
+
223
+ # Find all other terms.
224
+ positive += cleanup_atoms(s,true)
225
+
226
+ {:negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :negative => negative, :positive => positive}
227
+ end
228
+
229
+ def run_queries(atoms)
230
+ results = {}
231
+ atoms.uniq.each do |atom|
232
+ interim_results = {}
233
+ if include_atom?(atom)
234
+ # Collect all the weightings for the current atom.
235
+ interim_results = @atoms[atom].weightings(@records_size)
236
+ end
237
+ if results.empty?
238
+ # If first time round, set results with initial weightings.
239
+ results = interim_results
240
+ else
241
+ # If second time round, add weightings together for records
242
+ # matching both atoms. Any matching only one are discarded.
243
+ rr = {}
244
+ interim_results.each do |r,w|
245
+ rr[r] = w + results[r] if results[r]
246
+ end
247
+ results = rr
248
+ end
249
+ end
250
+ #p results
251
+ results
252
+ end
253
+
254
+ def run_quoted_queries(quoted_atoms)
255
+ results = {}
256
+ quoted_atoms.each do |quoted_atom|
257
+ interim_results = {}
258
+ # Check the index contains all the required atoms.
259
+ # match_atom = first_word_atom
260
+ # for each of the others
261
+ # return atom containing records + positions where current atom is preceded by following atom.
262
+ # end
263
+ # return records from final atom.
264
+ next if !include_atoms?(quoted_atom)
265
+ matches = @atoms[quoted_atom.first]
266
+ quoted_atom[1..-1].each do |atom_name|
267
+ matches = @atoms[atom_name].preceded_by(matches)
268
+ end
269
+ #results += matches.record_ids
270
+
271
+ interim_results = matches.weightings(@records_size)
272
+ if results.empty?
273
+ results = interim_results
274
+ else
275
+ rr = {}
276
+ interim_results.each do |r,w|
277
+ rr[r] = w + results[r] if results[r]
278
+ end
279
+ #p results.class
280
+ results = rr
281
+ end
282
+
283
+ end
284
+ results
285
+ end
286
+
287
+ def load_atoms(atoms)
288
+ # Remove duplicate atoms.
289
+ # Remove atoms already in index.
290
+ # Calculate prefixes.
291
+ # Remove duplicate prefixes.
292
+ atoms.uniq.reject{|a| include_atom?(a)}.collect{|a| encoded_prefix(a)}.uniq.each do |name|
293
+ if File.exists?(File.join(@root + [name.to_s]))
294
+ File.open(File.join(@root + [name.to_s])) do |f|
295
+ @atoms.merge!(Marshal.load(f))
296
+ end
297
+ end
298
+ end
299
+ end
300
+
301
+ def prepare
302
+ # Makes the RAILS_ROOT/index directory
303
+ Dir.mkdir(File.join(@root[0,2])) if !File.exists?(File.join(@root[0,2]))
304
+ # Makes the RAILS_ROOT/index/ENVIRONMENT directory
305
+ Dir.mkdir(File.join(@root[0,3])) if !File.exists?(File.join(@root[0,3]))
306
+ # Makes the RAILS_ROOT/index/ENVIRONMENT/CLASS directory
307
+ Dir.mkdir(File.join(@root)) if !File.exists?(File.join(@root))
308
+ end
309
+
310
+ def cleanup_atoms(s, limit_size=false, min_size = @min_word_size || 3)
311
+ atoms = s.downcase.gsub(/\W/,' ').squeeze(' ').split
312
+ return atoms if !limit_size
313
+ atoms.reject{|w| w.size < min_size}
314
+ end
315
+
316
+ def condense_record(record)
317
+ record_condensed = ''
318
+ @fields.each do |f|
319
+ record_condensed += ' ' + record.send(f).to_s if record.send(f)
320
+ end
321
+ cleanup_atoms(record_condensed)
322
+ end
323
+
324
+ end
325
+ end