acts_as_indexed 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ # ActsAsIndexed
2
+ # Copyright (c) 2007 - 2010 Douglas F Shearer.
3
+ # http://douglasfshearer.com
4
+ # Distributed under the MIT license as included with this plugin.
5
+
6
module ActsAsIndexed
  # Used to set up and modify settings for acts_as_indexed.
  class Configuration

    # Sets the location for the index. Specify as an array. Heroku, for
    # example would use RAILS_ROOT/tmp/index, which would be set as
    # [Rails.root, 'tmp', 'index']
    attr_accessor :index_file

    # Tuning value for the index partitioning. Larger values result in quicker
    # searches, but slower indexing. Default is 3.
    attr_reader :index_file_depth

    # Sets the minimum length for a word in a query. Words shorter than this
    # value are ignored in searches unless preceded by the '+' operator.
    # Default is 3.
    attr_reader :min_word_size

    # Builds a configuration holding the defaults: the index lives in
    # [Rails.root, 'index'], and both index_file_depth and min_word_size are 3.
    def initialize
      @index_file = [Rails.root, 'index']
      @index_file_depth = 3
      @min_word_size = 3
    end

    # Sets the index partitioning depth.
    # Raises ArgumentError if +val+ is not a number or is less than one; the
    # previously stored value is kept in that case.
    def index_file_depth=(val)
      @index_file_depth = checked_minimum('index_file_depth', val)
    end

    # Sets the minimum query word length.
    # Raises ArgumentError if +val+ is not a number or is less than one; the
    # previously stored value is kept in that case.
    def min_word_size=(val)
      @min_word_size = checked_minimum('min_word_size', val)
    end

    private

    # Returns +val+ unchanged when it is a number of at least one, otherwise
    # raises ArgumentError naming the offending +setting+.
    def checked_minimum(setting, val)
      # Reject nil and other non-numerics up front so callers always get an
      # ArgumentError rather than a NoMethodError from the comparison below.
      unless val.is_a?(Numeric)
        raise(ArgumentError, "#{setting} must be a number, got #{val.inspect}")
      end
      raise(ArgumentError, "#{setting} cannot be less than one (1)") if val < 1
      val
    end

  end
end
@@ -0,0 +1,104 @@
1
+ # ActsAsIndexed
2
+ # Copyright (c) 2007 - 2010 Douglas F Shearer.
3
+ # http://douglasfshearer.com
4
+ # Distributed under the MIT license as included with this plugin.
5
+
6
module ActsAsIndexed #:nodoc:
  # One indexed word: maps each record id containing the word to the list of
  # positions at which the word occurs in that record.
  class SearchAtom

    # Contains a hash of records.
    # { 'record_id' => [pos1, pos2, pos] }
    #--
    # Weighting:
    # http://www.perlmonks.com/index.pl?node_id=27509
    # W(T, D) = tf(T, D) * log ( DN / df(T))
    # weighting = frequency_in_this_record * log (total_number_of_records / number_of_matching_records)

    def initialize
      @records = {}
    end

    # Returns true if the given record is present.
    def include_record?(record_id)
      @records.include?(record_id)
    end

    # Adds +record_id+ to the stored records. Positions already stored for
    # the record are left untouched.
    def add_record(record_id)
      @records[record_id] = [] if !include_record?(record_id)
    end

    # Adds +pos+ to the array of positions for +record_id+, registering the
    # record first if it is not yet known.
    def add_position(record_id, pos)
      add_record(record_id)
      @records[record_id] << pos
    end

    # Returns all record IDs stored in this Atom.
    def record_ids
      @records.keys
    end

    # Returns an array of positions for +record_id+ stored in this Atom, or
    # nil when the record is not present.
    def positions(record_id)
      @records[record_id]
    end

    # Removes +record_id+ from this Atom.
    def remove_record(record_id)
      @records.delete(record_id)
    end

    # Returns an atom containing the records and positions of +self+ preceded by +former+
    # "former latter" or "big dog" where "big" is the former and "dog" is the latter.
    def preceded_by(former)
      matches = SearchAtom.new
      former.record_ids.each do |record_id|
        latter_positions = @records[record_id]
        # Only records containing both words can contain the phrase.
        next unless latter_positions

        latter_positions.each do |pos|
          # Keep this occurrence if the former word sits directly before it.
          # add_position registers the record on first use.
          matches.add_position(record_id, pos) if former.include_position?(record_id, pos - 1)
        end
      end
      matches
    end

    # Returns a hash of record_ids and weightings for each record in the
    # atom. +records_size+ is the total number of records in the index.
    def weightings(records_size)
      out = {}
      @records.each do |r_id, pos|

        # Fixes a bug when the records_size is zero. i.e. The only record
        # containing the word has been deleted.
        if records_size < 1
          out[r_id] = 0.0
          next
        end

        # weighting = frequency * log (records.size / records_with_atom)
        ## parndt 2010/05/03 changed to records_size.to_f to avoid -Infinity Errno::ERANGE exceptions
        ## which would happen for example Math.log(1 / 20) == -Infinity but Math.log(1.0 / 20) == -2.99573227355399
        out[r_id] = pos.size * Math.log(records_size.to_f / @records.size)
      end
      out
    end

    protected

    # Returns true if +record_id+ is stored and has an occurrence at +pos+.
    # Unknown records yield false instead of raising on a nil position list.
    def include_position?(record_id, pos)
      stored = @records[record_id]
      stored ? stored.include?(pos) : false
    end

  end
end
@@ -0,0 +1,325 @@
1
+ # ActsAsIndexed
2
+ # Copyright (c) 2007 - 2010 Douglas F Shearer.
3
+ # http://douglasfshearer.com
4
+ # Distributed under the MIT license as included with this plugin.
5
+
6
+ module ActsAsIndexed #:nodoc:
7
+ class SearchIndex
8
+
9
+ # root:: Location of index on filesystem.
10
+ # index_depth:: Degree of index partitioning.
11
+ # fields:: Fields or instance methods of ActiveRecord model to be indexed.
12
+ # min_word_size:: Smallest query term that will be run through search.
13
+ def initialize(root, index_depth, fields, min_word_size)
14
+ @root = root
15
+ @fields = fields
16
+ @index_depth = index_depth
17
+ @atoms = {}
18
+ @min_word_size = min_word_size
19
+ @records_size = exists? ? load_record_size : 0
20
+ end
21
+
22
+ # Adds +record+ to the index.
23
+ def add_record(record)
24
+ condensed_record = condense_record(record)
25
+ load_atoms(condensed_record)
26
+ add_occurences(condensed_record,record.id)
27
+ @records_size += 1
28
+ end
29
+
30
+ # Adds multiple records to the index. Accepts an array of +records+.
31
+ def add_records(records)
32
+ records.each do |record|
33
+ add_record(record)
34
+ end
35
+ end
36
+
37
+ # Removes +record+ from the index.
38
+ def remove_record(record)
39
+ atoms = condense_record(record)
40
+ load_atoms(atoms)
41
+ atoms.each do |a|
42
+ @atoms[a].remove_record(record.id) if @atoms.has_key?(a)
43
+ @records_size -= 1
44
+ #p "removing #{record.id} from #{a}"
45
+ end
46
+ end
47
+
48
+ def update_record(record_new, record_old)
49
+ # Work out which atoms have modifications.
50
+ # Minimises loading and saving of partitions.
51
+ old_atoms = condense_record(record_old)
52
+ new_atoms = condense_record(record_new)
53
+
54
+ # Remove the old version from the appropriate atoms.
55
+ load_atoms(old_atoms)
56
+ old_atoms.each do |a|
57
+ @atoms[a].remove_record(record_new.id) if @atoms.has_key?(a)
58
+ end
59
+
60
+ # Add the new version to the appropriate atoms.
61
+ load_atoms(new_atoms)
62
+ # TODO: Make a version of this method that takes the
63
+ # atomised version of the record.
64
+ add_occurences(new_atoms, record_new.id)
65
+ end
66
+
67
+ # Saves the current index partitions to the filesystem.
68
+ def save
69
+ prepare
70
+ atoms_sorted = {}
71
+ @atoms.each do |atom_name, records|
72
+ e_p = encoded_prefix(atom_name)
73
+ atoms_sorted[e_p] = {} if !atoms_sorted.has_key?(e_p)
74
+ atoms_sorted[e_p][atom_name] = records
75
+ end
76
+ atoms_sorted.each do |e_p, atoms|
77
+ #p "Saving #{e_p}."
78
+ File.open(File.join(@root + [e_p.to_s]),'w+') do |f|
79
+ Marshal.dump(atoms,f)
80
+ end
81
+ end
82
+ save_record_size
83
+ end
84
+
85
+ # Deletes the current model's index from the filesystem.
86
+ #--
87
+ # TODO: Write a public method that will delete all indexes.
88
+ def destroy
89
+ FileUtils.rm_rf(@root)
90
+ true
91
+ end
92
+
93
+ # Returns an array of IDs for records matching +query+.
94
+ def search(query)
95
+ return [] if query.nil?
96
+ load_atoms(cleanup_atoms(query))
97
+ queries = parse_query(query.dup)
98
+ positive = run_queries(queries[:positive])
99
+ positive_quoted = run_quoted_queries(queries[:positive_quoted])
100
+ negative = run_queries(queries[:negative])
101
+ negative_quoted = run_quoted_queries(queries[:negative_quoted])
102
+
103
+ if !queries[:positive].empty? && !queries[:positive_quoted].empty?
104
+ p = positive.delete_if{ |r_id,w| !positive_quoted.include?(r_id) }
105
+ pq = positive_quoted.delete_if{ |r_id,w| !positive.include?(r_id) }
106
+ results = p.merge(pq) { |r_id,old_val,new_val| old_val + new_val}
107
+ elsif !queries[:positive].empty?
108
+ results = positive
109
+ else
110
+ results = positive_quoted
111
+ end
112
+
113
+ negative_results = (negative.keys + negative_quoted.keys)
114
+ results.delete_if { |r_id, w| negative_results.include?(r_id) }
115
+ #p results
116
+ results
117
+ end
118
+
119
+ # Returns true if the index root exists on the FS.
120
+ #--
121
+ # TODO: Make a private method called 'root_exists?' which checks for the root directory.
122
+ def exists?
123
+ File.exists?(File.join(@root + ['size']))
124
+ end
125
+
126
+ private
127
+
128
+ # Gets the size file from the index.
129
+ def load_record_size
130
+ File.open(File.join(@root + ['size'])) do |f|
131
+ (Marshal.load(f))
132
+ end
133
+ end
134
+
135
+ # Saves the size to the size file.
136
+ def save_record_size
137
+ File.open(File.join(@root + ['size']),'w+') do |f|
138
+ Marshal.dump(@records_size,f)
139
+ end
140
+ end
141
+
142
+ # Returns true if the given atom is present.
143
+ def include_atom?(atom)
144
+ @atoms.has_key?(atom)
145
+ end
146
+
147
+ # Returns true if all the given atoms are present.
148
+ def include_atoms?(atoms_arr)
149
+ atoms_arr.each do |a|
150
+ return false if !include_atom?(a)
151
+ end
152
+ true
153
+ end
154
+
155
+ # Returns true if the given record is present.
156
+ def include_record?(record_id)
157
+ @atoms.each do |atomname, atom|
158
+ return true if atom.include_record?(record_id)
159
+ end
160
+ end
161
+
162
+ def add_atom(atom)
163
+ @atoms[atom] = SearchAtom.new if !include_atom?(atom)
164
+ end
165
+
166
+ def add_occurences(condensed_record,record_id)
167
+ condensed_record.each_with_index do |atom, i|
168
+ add_atom(atom)
169
+ @atoms[atom].add_position(record_id, i)
170
+ #p "adding #{record.id} to #{atom}"
171
+ end
172
+ end
173
+
174
+ def encoded_prefix(atom)
175
+ prefix = atom[0,@index_depth]
176
+ if !@prefix_cache || !@prefix_cache.has_key?(prefix)
177
+ @prefix_cache = {} if !@prefix_cache
178
+ len = atom.length
179
+ if len > 1
180
+ @prefix_cache[prefix] = prefix.split(//).map{|c| encode_character(c)}.join('_')
181
+ else
182
+ @prefix_cache[prefix] = encode_character(atom)
183
+ end
184
+ end
185
+ @prefix_cache[prefix]
186
+ end
187
+
188
+ # Allows compatibility with 1.8.6 which has no ord method.
189
+ def encode_character(char)
190
+ if @@has_ord ||= char.respond_to?(:ord)
191
+ char.ord.to_s
192
+ else
193
+ char[0]
194
+ end
195
+ end
196
+
197
+ def parse_query(s)
198
+
199
+ # Find -"foo bar".
200
+ negative_quoted = []
201
+ while neg_quoted = s.slice!(/-\"[^\"]*\"/)
202
+ negative_quoted << cleanup_atoms(neg_quoted)
203
+ end
204
+
205
+ # Find "foo bar".
206
+ positive_quoted = []
207
+ while pos_quoted = s.slice!(/\"[^\"]*\"/)
208
+ positive_quoted << cleanup_atoms(pos_quoted)
209
+ end
210
+
211
+ # Find -foo.
212
+ negative = []
213
+ while neg = s.slice!(/-[\S]*/)
214
+ negative << cleanup_atoms(neg).first
215
+ end
216
+
217
+ # Find +foo
218
+ positive = []
219
+ while pos = s.slice!(/\+[\S]*/)
220
+ positive << cleanup_atoms(pos).first
221
+ end
222
+
223
+ # Find all other terms.
224
+ positive += cleanup_atoms(s,true)
225
+
226
+ {:negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :negative => negative, :positive => positive}
227
+ end
228
+
229
+ def run_queries(atoms)
230
+ results = {}
231
+ atoms.uniq.each do |atom|
232
+ interim_results = {}
233
+ if include_atom?(atom)
234
+ # Collect all the weightings for the current atom.
235
+ interim_results = @atoms[atom].weightings(@records_size)
236
+ end
237
+ if results.empty?
238
+ # If first time round, set results with initial weightings.
239
+ results = interim_results
240
+ else
241
+ # If second time round, add weightings together for records
242
+ # matching both atoms. Any matching only one are discarded.
243
+ rr = {}
244
+ interim_results.each do |r,w|
245
+ rr[r] = w + results[r] if results[r]
246
+ end
247
+ results = rr
248
+ end
249
+ end
250
+ #p results
251
+ results
252
+ end
253
+
254
+ def run_quoted_queries(quoted_atoms)
255
+ results = {}
256
+ quoted_atoms.each do |quoted_atom|
257
+ interim_results = {}
258
+ # Check the index contains all the required atoms.
259
+ # match_atom = first_word_atom
260
+ # for each of the others
261
+ # return atom containing records + positions where current atom is preceded by following atom.
262
+ # end
263
+ # return records from final atom.
264
+ next if !include_atoms?(quoted_atom)
265
+ matches = @atoms[quoted_atom.first]
266
+ quoted_atom[1..-1].each do |atom_name|
267
+ matches = @atoms[atom_name].preceded_by(matches)
268
+ end
269
+ #results += matches.record_ids
270
+
271
+ interim_results = matches.weightings(@records_size)
272
+ if results.empty?
273
+ results = interim_results
274
+ else
275
+ rr = {}
276
+ interim_results.each do |r,w|
277
+ rr[r] = w + results[r] if results[r]
278
+ end
279
+ #p results.class
280
+ results = rr
281
+ end
282
+
283
+ end
284
+ results
285
+ end
286
+
287
+ def load_atoms(atoms)
288
+ # Remove duplicate atoms.
289
+ # Remove atoms already in index.
290
+ # Calculate prefixes.
291
+ # Remove duplicate prefixes.
292
+ atoms.uniq.reject{|a| include_atom?(a)}.collect{|a| encoded_prefix(a)}.uniq.each do |name|
293
+ if File.exists?(File.join(@root + [name.to_s]))
294
+ File.open(File.join(@root + [name.to_s])) do |f|
295
+ @atoms.merge!(Marshal.load(f))
296
+ end
297
+ end
298
+ end
299
+ end
300
+
301
+ def prepare
302
+ # Makes the RAILS_ROOT/index directory
303
+ Dir.mkdir(File.join(@root[0,2])) if !File.exists?(File.join(@root[0,2]))
304
+ # Makes the RAILS_ROOT/index/ENVIRONMENT directory
305
+ Dir.mkdir(File.join(@root[0,3])) if !File.exists?(File.join(@root[0,3]))
306
+ # Makes the RAILS_ROOT/index/ENVIRONMENT/CLASS directory
307
+ Dir.mkdir(File.join(@root)) if !File.exists?(File.join(@root))
308
+ end
309
+
310
+ def cleanup_atoms(s, limit_size=false, min_size = @min_word_size || 3)
311
+ atoms = s.downcase.gsub(/\W/,' ').squeeze(' ').split
312
+ return atoms if !limit_size
313
+ atoms.reject{|w| w.size < min_size}
314
+ end
315
+
316
+ def condense_record(record)
317
+ record_condensed = ''
318
+ @fields.each do |f|
319
+ record_condensed += ' ' + record.send(f).to_s if record.send(f)
320
+ end
321
+ cleanup_atoms(record_condensed)
322
+ end
323
+
324
+ end
325
+ end