sdsykes_acts_as_ferret 0.4.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,211 @@
1
+ module ActsAsFerret
2
+ class LocalIndex < AbstractIndex
3
+ include MoreLikeThis::IndexMethods
4
+
5
# Runs the superclass initializer with the given configuration and then
# makes sure an index exists on disk (see #ensure_index_exists).
def initialize(aaf_configuration)
  super(aaf_configuration)
  ensure_index_exists
end
9
+
10
# Closes the currently held Ferret index instance (if there is one) and
# hands back a freshly obtained one via #ferret_index.
def reopen!
  current = @ferret_index
  if current
    current.close
    @ferret_index = nil
  end
  index_path = aaf_configuration[:ferret][:path]
  logger.debug "reopening index at #{index_path}"
  ferret_index
end
18
+
19
+ # The 'real' Ferret Index instance
20
# The 'real' Ferret Index instance.
# Checks that the index exists, lazily creates the Ferret index object from
# the :ferret configuration, and (re)applies batch size and logger on every
# access before returning it.
def ferret_index
  ensure_index_exists
  @ferret_index ||= Ferret::Index::Index.new(aaf_configuration[:ferret])
  @ferret_index.batch_size = aaf_configuration[:reindex_batch_size]
  @ferret_index.logger = logger
  @ferret_index
end
27
+
28
+ # Checks for the presence of a segments file in the index directory
29
+ # Rebuilds the index if none exists.
30
# Looks for a 'segments' file inside the configured index directory.
# When it is missing, the directory is created, any open index is closed
# and the index is rebuilt. Returns nil when the file is already present.
def ensure_index_exists
  index_dir = aaf_configuration[:index_dir]
  logger.debug "LocalIndex: ensure_index_exists at #{index_dir}"
  return if File.file?("#{index_dir}/segments")

  ActsAsFerret::ensure_directory(index_dir)
  close
  rebuild_index
end
38
+
39
+ # Closes the underlying index instance
40
# Closes the underlying index instance, if any, and always forgets it
# afterwards. Errors raised while closing are deliberately ignored.
def close
  begin
    index = @ferret_index
    index.close if index
  rescue StandardError
    # raised when the index has already been closed - nothing left to do
  ensure
    @ferret_index = nil
  end
end
47
+
48
+ # rebuilds the index from all records of the model class this index belongs
49
+ # to. Arguments can be given in shared index scenarios to name multiple
50
+ # model classes to include in the index
51
# Rebuilds the index from the records of the given model classes.
# The model class configured under :class_name is always included; extra
# class names may be passed for shared index scenarios. A new Ferret index
# is created with :create => true and :auto_flush => false, configured with
# batch size and logger, and asked to index the resolved model classes.
def rebuild_index(*models)
  own_class_name = aaf_configuration[:class_name]
  models << own_class_name unless models.include?(own_class_name)
  models = models.flatten.uniq.map(&:constantize)
  logger.debug "rebuild index: #{models.inspect}"

  index_options = aaf_configuration[:ferret].dup.update(
    :auto_flush  => false,
    :field_infos => ActsAsFerret::field_infos(models),
    :create      => true
  )
  index = Ferret::Index::Index.new(index_options)
  index.batch_size = aaf_configuration[:reindex_batch_size]
  index.logger = logger
  index.index_models models
end
62
+
63
# Delegates bulk indexing of the given ids to the Ferret index, passing the
# model class resolved from the configured :class_name.
def bulk_index(ids, options)
  index = ferret_index
  model_class = aaf_configuration[:class_name].constantize
  index.bulk_index(model_class, ids, options)
end
66
+
67
+ # Parses the given query string into a Ferret Query object.
68
# Parses the given query string into a Ferret Query object and returns it.
#
# Works around a ferret bug in Index#process_query (it doesn't ensure the
# reader is open): the reader is opened explicitly first, inside the
# index's own lock. A single index handle is used for the whole operation.
def process_query(query)
  index = ferret_index
  index.synchronize do
    index.send(:ensure_reader_open)
    index.process_query(query)
  end
end
76
+
77
+ # Total number of hits for the given query.
78
+ # To count the results of a multi_search query, specify an array of
79
+ # class names with the :multi option.
80
# Total number of hits for the given query.
# To count the results of a multi_search query, specify an array of
# class names with the :multi option.
#
# The :multi entry is stripped from a private copy of the options before
# they are handed to the search, so the caller's hash is left untouched.
def total_hits(query, options = {})
  search_options = options.dup
  models = search_options.delete(:multi)
  index = models ? multi_index(models) : ferret_index
  index.search(query, search_options).total_hits
end
84
+
85
# Works out which stored fields to load from the index, based on
# options[:lazy]:
# * nil / false - returned unchanged
# * an Array    - returned unchanged
# * anything else truthy - the names of all configured ferret fields whose
#   :store setting is :yes
def determine_lazy_fields(options = {})
  lazy = options[:lazy]
  stored_fields =
    case lazy
    when nil, false, Array
      lazy
    else
      stored = aaf_configuration[:ferret_fields].select { |_name, config| config[:store] == :yes }
      stored.map { |pair| pair.first }
    end
  logger.debug "stored_fields: #{stored_fields}"
  stored_fields
end
93
+
94
+ # Queries the Ferret index to retrieve model class, id, score and the
95
+ # values of any fields stored in the index for each hit.
96
+ # If a block is given, these are yielded and the number of total hits is
97
+ # returned. Otherwise [total_hits, result_array] is returned.
98
# Queries the Ferret index to retrieve model class, id, score and the
# values of any fields stored in the index for each hit.
# If a block is given, (model, id, score, data) are yielded per hit and the
# number of total hits is returned. Otherwise [total_hits, result_array] is
# returned, where each result is a hash with :model, :id, :score and :data.
def find_id_by_contents(query, options = {})
  result = []
  index = ferret_index
  # Use our own #process_query for the debug output: calling the index's
  # process_query directly skips the "ensure reader is open" workaround.
  logger.debug "query: #{process_query query}" if logger.debug?
  lazy_fields = determine_lazy_fields options

  total_hits = index.search_each(query, options) do |hit, score|
    doc = index[hit]
    model = aaf_configuration[:store_class_name] ? doc[:class_name] : aaf_configuration[:class_name]
    # fetch stored fields if lazy loading
    data = {}
    lazy_fields.each { |field| data[field] = doc[field] } if lazy_fields
    if block_given?
      yield model, doc[:id], score, data
    else
      result << { :model => model, :id => doc[:id], :score => score, :data => data }
    end
  end
  block_given? ? total_hits : [total_hits, result]
end
119
+
120
+ # Queries multiple Ferret indexes to retrieve model class, id and score for
121
+ # each hit. Use the models parameter to give the list of models to search.
122
+ # If a block is given, model, id and score are yielded and the number of
123
+ # total hits is returned. Otherwise [total_hits, result_array] is returned.
124
# Searches across the indexes of several models (see #multi_index).
# Without a block, returns [total_hits, result_array] where each result is a
# hash with :model, :id, :score and :data. With a block, each hit is yielded
# and only the number of total hits is returned.
# Raises if a hit carries no class name in the index.
#
# NOTE(review): the block receives five values (class_name, id, score, doc,
# data), one more than #find_id_by_contents yields - confirm callers expect
# the raw doc in fourth position.
def id_multi_search(query, models, options = {})
  index = multi_index(models)
  lazy_fields = determine_lazy_fields options
  result = []

  total_hits = index.search_each(query, options) do |hit, score|
    doc = index[hit]
    # fetch stored fields if lazy loading
    data = {}
    if lazy_fields
      lazy_fields.each { |field| data[field] = doc[field] }
    end
    class_name = doc[:class_name]
    raise "':store_class_name => true' required for multi_search to work" if class_name.blank?

    if block_given?
      yield class_name, doc[:id], score, doc, data
    else
      result << { :model => class_name, :id => doc[:id], :score => score, :data => data }
    end
  end

  block_given? ? total_hits : [ total_hits, result ]
end
142
+
143
+ ######################################
144
+ # methods working on a single record
145
+ # called from instance_methods, here to simplify interfacing with the
146
+ # remote ferret server
147
+ # TODO having to pass id and class_name around like this isn't nice
148
+ ######################################
149
+
150
+ # add record to index
151
+ # record may be the full AR object, a Ferret document instance or a Hash
152
# Adds a record to the index.
# A Hash or Ferret::Document is passed through as is; anything else is
# converted with #to_doc first. Also available as #<<.
def add(record)
  already_document = Hash === record || Ferret::Document === record
  record = record.to_doc unless already_document
  ferret_index << record
end
alias << add
157
+
158
+ # delete record from index
159
# Deletes from the index everything matching the query built by
# #query_for_record for the given id / class name.
def remove(id, class_name)
  index = ferret_index
  index.query_delete(query_for_record(id, class_name))
end
162
+
163
+ # highlight search terms for the record with the given id.
164
# Highlights search terms for the record with the given id.
#
# options default to :num_excerpts => 2, :pre_tag => '<em>',
# :post_tag => '</em>'. When options[:field] is given only that field is
# highlighted; otherwise every configured ferret field is tried, except
# those with :store => :no or :highlight => :no.
#
# Returns at most options[:num_excerpts] excerpts (an empty array for 0).
# The caller's options hash is never modified, so it can safely be reused
# for several calls.
def highlight(id, class_name, query, options = {})
  # work on a copy: neither the defaults nor the per-field :field entry
  # below may leak back into the hash the caller passed in
  options = options.dup
  { :num_excerpts => 2, :pre_tag => '<em>', :post_tag => '</em>' }.each do |key, value|
    options[key] = value unless options.key?(key)
  end

  highlights = []
  ferret_index.synchronize do
    doc_num = document_number(id, class_name)
    if options[:field]
      highlights << ferret_index.highlight(query, doc_num, options)
    else
      query = process_query(query) # process only once
      aaf_configuration[:ferret_fields].each_pair do |field, config|
        next if config[:store] == :no || config[:highlight] == :no
        highlights << ferret_index.highlight(query, doc_num, options.merge(:field => field))
      end
    end
  end
  # first(n) instead of [0..n-1]: the range form returns *everything* for n == 0
  highlights.compact.flatten.first(options[:num_excerpts])
end
182
+
183
+ # retrieves the ferret document number of the record with the given id.
184
# Retrieves the ferret document number of the record with the given id.
# Raises unless the lookup query yields exactly one hit.
def document_number(id, class_name)
  hits = ferret_index.search(query_for_record(id, class_name))
  unless hits.total_hits == 1
    raise "cannot determine document number from primary key: #{id}"
  end
  hits.hits.first.doc
end
189
+
190
+ # build a ferret query matching only the record with the given id
191
+ # the class name only needs to be given in case of a shared index configuration
192
# Builds a ferret term query on the :id field for the given id (converted
# to a string). class_name is accepted but not used by this implementation.
def query_for_record(id, class_name = nil)
  id_term = id.to_s
  Ferret::Search::TermQuery.new(:id, id_term)
end
195
+
196
+
197
+ protected
198
+
199
+ # returns a MultiIndex instance operating on a MultiReader
200
# Returns a MultiIndex instance for the given models, creating and caching
# it in ActsAsFerret::multi_indexes on first use.
#
# model_classes may hold classes or class names (decided by looking at the
# first element). The cache key is the concatenation of the class names in
# alphabetical order. The array passed in is not modified.
def multi_index(model_classes)
  classes = model_classes
  classes = classes.map(&:constantize) if String === classes.first
  classes = classes.sort { |a, b| a.name <=> b.name }
  key = classes.map { |clazz| clazz.name }.join

  ActsAsFerret::multi_indexes[key] ||= begin
    multi_config = aaf_configuration[:ferret].dup
    multi_config.delete :default_field # we don't want the default field list of *this* class for multi_searching
    MultiIndex.new(classes, multi_config)
  end
end
208
+
209
+ end
210
+
211
+ end
@@ -0,0 +1,217 @@
1
+ module ActsAsFerret #:nodoc:
2
+
3
+ module MoreLikeThis
4
+
5
+ module InstanceMethods
6
+
7
+ # returns other instances of this class, which have similar contents
8
+ # like this one. Basically works like this: find out n most interesting
9
+ # (i.e. characteristic) terms from this document, and then build a
10
+ # query from those which is run against the whole index. Which terms
11
+ # are interesting is decided on various criteria which can be
12
+ # influenced by the given options.
13
+ #
14
+ # The algorithm used here is a quite straight port of the MoreLikeThis class
15
+ # from Apache Lucene.
16
+ #
17
+ # options are:
18
+ # :field_names : Array of field names to use for similarity search (mandatory)
19
+ # :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
20
+ # :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
21
+ # :min_word_length => 0, # Ignore words shorter than this length; 0 disables the check (longer words tend to
22
+ # be more characteristic for the document they occur in).
23
+ # :max_word_length => 0, # Ignore words if greater than this len; 0 disables the check.
24
+ # :max_query_terms => 25, # maximum number of terms in the query built
25
+ # :max_num_tokens => 5000, # maximum number of tokens to examine in a single field
26
+ # :boost => false, # when true, a boost according to the relative score of
27
+ # a term is applied to this Term's TermQuery.
28
+ # :similarity => 'DefaultAAFSimilarity' # the similarity implementation to use (the default
29
+ # equals Ferret's internal similarity implementation)
30
+ # :analyzer => 'Ferret::Analysis::StandardAnalyzer' # class name of the analyzer to use
31
+ # :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios
32
+ # ferret_options : Ferret options handed over to find_by_contents (i.e. for limits and sorting)
33
+ # ar_options : options handed over to find_by_contents for AR scoping
34
# Finds records similar to this one: merges the given options over the
# defaults below, asks the base class' index to build a "more like this"
# query for this record, optionally lets options[:append_to_query] tweak
# that query, and runs it through the base class' find_by_contents with the
# given ferret and ActiveRecord options.
def more_like_this(options = {}, ferret_options = {}, ar_options = {})
  defaults = {
    :field_names     => nil,   # Default field names
    :min_term_freq   => 2,     # Ignore terms with less than this frequency in the source doc.
    :min_doc_freq    => 5,     # Ignore words which do not occur in at least this many docs
    :min_word_length => 0,     # Ignore words if less than this len. Default is not to ignore any words.
    :max_word_length => 0,     # Ignore words if greater than this len. Default is not to ignore any words.
    :max_query_terms => 25,    # maximum number of terms in the query built
    :max_num_tokens  => 5000,  # maximum number of tokens to analyze when analyzing contents
    :boost           => false,
    :similarity      => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity', # class name of the similarity implementation to use
    :analyzer        => 'Ferret::Analysis::StandardAnalyzer',               # class name of the analyzer to use
    :append_to_query => nil,
    :base_class      => self.class # base class to use for querying, useful in STI scenarios
  }
  options = defaults.merge(options)

  # the index only gets the class *name*; keep the class itself for querying
  clazz = options[:base_class]
  options[:base_class] = clazz.name

  query = clazz.aaf_index.build_more_like_this_query(self.id, self.class.name, options)
  append = options[:append_to_query]
  append.call(query) if append
  clazz.find_by_contents(query, ferret_options, ar_options)
end
58
+
59
+ end
60
+
61
+ module IndexMethods
62
+
63
+ # TODO to allow morelikethis for unsaved records, we have to give the
64
+ # unsaved record's data to this method. check how this will work out
65
+ # via drb...
66
# Builds the "more like this" query for the record with the given id.
# options[:similarity] and options[:analyzer] arrive as class names and are
# replaced in place by fresh instances of those classes. The term
# extraction, scoring and query construction all run inside the index lock.
def build_more_like_this_query(id, class_name, options)
  options[:similarity] = options[:similarity].constantize.new
  options[:analyzer] = options[:analyzer].constantize.new

  ferret_index.synchronize do # avoid that concurrent writes close our reader
    ferret_index.send(:ensure_reader_open)
    reader = ferret_index.send(:reader)
    frequencies = retrieve_terms(id, class_name, reader, options)
    queue = create_queue(frequencies, reader, options)
    create_query(id, class_name, queue, options)
  end
end
76
+
77
+ protected
78
+
79
# Turns the scored term queue into a boolean query.
# Items are popped off the end of priority_queue and added as :should term
# queries until the queue is empty, the boolean query refuses more clauses,
# or options[:max_query_terms] (when > 0) is reached. With options[:boost]
# each term is boosted by its score relative to the first popped item.
# Finally the original record is excluded with a :must_not clause.
def create_query(id, class_name, priority_queue, options={})
  query = Ferret::Search::BooleanQuery.new
  added = 0
  top_score = nil

  while (item = priority_queue.pop)
    term_query = Ferret::Search::TermQuery.new(item.field, item.word)

    if options[:boost]
      # boost term according to relative score
      # TODO untested
      top_score ||= item.score
      term_query.boost = item.score / top_score
    end

    begin
      query.add_query(term_query, :should)
    rescue Ferret::Search::BooleanQuery::TooManyClauses
      break
    end

    added += 1
    term_limit = options[:max_query_terms]
    break if term_limit > 0 && added >= term_limit
  end

  # exclude the original record
  query.add_query(query_for_record(id, class_name), :must_not)
  query
end
104
+
105
+
106
+
107
+ # creates a term/term_frequency map for terms from the fields
108
+ # given in options[:field_names]
109
# Creates a term => frequency map for the terms of the fields given in
# options[:field_names].
#
# Per field, terms come from (in order of preference):
# 1. the term vector stored in the index for the record's document,
# 2. the field content stored in the index (run through options[:analyzer]),
# 3. the content of the record itself, loaded through
#    options[:base_class] (also run through the analyzer).
# At most options[:max_num_tokens] tokens are examined per analyzed field
# and terms rejected by #noise_word? are skipped.
#
# Raises if id is nil. A record that cannot be located in the index is not
# an error; sources 1 and 2 are simply skipped then.
def retrieve_terms(id, class_name, reader, options)
  raise "more_like_this atm only works on saved records" if id.nil?

  doc_num = begin
    document_number(id, class_name)
  rescue StandardError
    nil # not (uniquely) found in the index -> fall back to the record's content
  end

  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  record = nil

  field_names.each do |field|
    term_freq_vector = doc_num ? reader.term_vector(doc_num, field) : nil
    if term_freq_vector
      # use stored term vector
      term_freq_vector.terms.each do |term|
        term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
      end
    else
      # no term vector stored, but we may have stored the contents in the
      # index -> extract terms from there
      content = doc_num ? reader[doc_num][field] : nil
      unless content
        # no term vector, no stored content, so try content from this instance
        record ||= options[:base_class].constantize.find(id)
        content = record.content_for_field_name(field.to_s)
      end
      # (a leftover debug print used to live here; it wrote to stdout and
      # raised NoMethodError whenever the document was not in the index)
      token_count = 0

      ts = options[:analyzer].token_stream(field, content)
      while (token = ts.next)
        break if (token_count += 1) > max_num_tokens
        next if noise_word?(token.text, options)
        term_freq_map[token.text] += 1
      end
    end
  end
  term_freq_map
end
154
+
155
+ # create an ordered(by score) list of word,fieldname,score
156
+ # structures
157
# Creates a list of FrequencyQueueItem (word, field name, score) entries,
# ordered by ascending score.
# A word is dropped when its term frequency is below options[:min_term_freq]
# or when its largest document frequency over all options[:field_names] is
# zero or below options[:min_doc_freq]. The score is the term frequency
# times the idf reported by options[:similarity]; the field recorded is the
# one with the largest document frequency.
def create_queue(term_freq_map, reader, options)
  similarity = options[:similarity]
  num_docs = reader.num_docs
  items = []

  term_freq_map.each_pair do |word, tf|
    # filter out words that don't occur enough times in the source
    next if options[:min_term_freq] && tf < options[:min_term_freq]

    # go through all the fields and find the largest document frequency
    top_field = options[:field_names].first
    doc_freq = 0
    options[:field_names].each do |field_name|
      freq = reader.doc_freq(field_name, word)
      next unless freq > doc_freq
      top_field = field_name
      doc_freq = freq
    end

    # filter out words that don't occur in enough docs
    next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
    next if doc_freq == 0 # index update problem ?

    score = tf * similarity.idf(doc_freq, num_docs)
    items << FrequencyQueueItem.new(word, top_field, score)
  end

  items.sort { |a, b| a.score <=> b.score }
end
188
+
189
# Decides whether a term should be ignored.
# A term is noise when it is shorter than options[:min_word_length], longer
# than options[:max_word_length] (a limit of 0 or nil disables the
# respective check), or contained in options[:stop_words] (any collection
# responding to include?, optional).
def noise_word?(text, options)
  len = text.length
  min_len = options[:min_word_length].to_i # nil.to_i == 0 -> check disabled
  max_len = options[:max_word_length].to_i
  return true if min_len > 0 && len < min_len
  return true if max_len > 0 && len > max_len

  # look the term up in the stop word list itself, not in the options hash
  stop_words = options[:stop_words]
  stop_words ? stop_words.include?(text) : false
end
197
+
198
+ end
199
+
200
# Default similarity used for scoring terms.
class DefaultAAFSimilarity
  # Inverse document frequency: log(num_docs / (doc_freq + 1)) + 1.0,
  # or 0.0 when there are no documents at all.
  def idf(doc_freq, num_docs)
    if num_docs == 0
      0.0
    else
      Math.log(num_docs.to_f / (doc_freq + 1)) + 1.0
    end
  end
end
206
+
207
+
208
# Read-only value holder for one scored term: the word, the field it was
# attributed to and its score.
class FrequencyQueueItem
  attr_reader :word, :field, :score

  def initialize(word, field, score)
    @word, @field, @score = word, field, score
  end
end
214
+
215
+ end
216
+ end
217
+
@@ -0,0 +1,83 @@
1
+ module ActsAsFerret #:nodoc:
2
+
3
+ # this class is not threadsafe
4
+ class MultiIndex
5
+
6
# Remembers the model classes, makes sure each model's index exists and
# builds the options for this multi index: :default_field is the combined
# (flattened) default field list of all models unless the given options
# override it.
def initialize(model_classes, options = {})
  @model_classes = model_classes
  # ensure all models indexes exist
  @model_classes.each { |model| model.aaf_index.ensure_index_exists }

  default_fields = []
  @model_classes.each do |model|
    default_fields += [ model.aaf_configuration[:ferret][:default_field] ].flatten
  end

  @options = { :default_field => default_fields }.update(options)
end
17
+
18
# Runs the query (parsed first via #process_query) against the combined
# searcher and returns the searcher's result.
def search(query, options={})
  parsed_query = process_query(query)
  searcher.search(parsed_query, options)
end
24
+
25
# Like #search, but hands the given block through to the searcher's
# search_each and returns that call's result.
def search_each(query, options = {}, &block)
  parsed_query = process_query(query)
  searcher.search_each(parsed_query, options, &block)
end
29
+
30
+ # checks if all our sub-searchers still are up to date
31
# Checks if all our sub-readers still are up to date.
# False when no combined reader has been opened yet.
def latest?
  return false unless @reader
  # asking the combined reader directly (@reader.latest?) segfaults with
  # 0.10.4 --> TODO report as bug; so every sub reader is asked instead
  @sub_readers.all? { |sub_reader| sub_reader.latest? }
end
39
+
40
# Returns the searcher over all sub indexes, (re)opening it first when
# #ensure_searcher finds that necessary.
def searcher
  ensure_searcher
  @searcher
end
44
+
45
# Fetches entry i from the combined searcher. Also available as #[].
def doc(i)
  searcher[i]
end
alias [] doc
49
+
50
# Lazily creates (and then reuses) a Ferret query parser configured with
# this multi index' options.
def query_parser
  return @query_parser if @query_parser
  @query_parser = Ferret::QueryParser.new(@options)
end
53
+
54
# Parses String queries with #query_parser; anything else is returned
# untouched.
def process_query(query)
  return query unless query.is_a?(String)
  query_parser.parse(query)
end
58
+
59
# Closes the searcher and the combined reader (whichever are open) and
# forgets both, so that #latest? reports false afterwards and the next
# #searcher call opens fresh ones instead of handing out closed handles.
# Calling close again is a no-op.
def close
  @searcher.close if @searcher
  @reader.close if @reader
ensure
  @searcher = nil
  @reader = nil
end
63
+
64
+ protected
65
+
66
# Makes sure @reader / @searcher exist and are current.
# Unless #latest? says everything is up to date, one index reader per model
# class is opened on that model's :index_dir, the previous searcher/reader
# are closed, and a new combined reader plus searcher are created.
#
# Raises a RuntimeError naming the index directory when a reader cannot be
# opened. Only StandardError is wrapped that way; Interrupt, SystemExit
# and the like propagate unchanged.
def ensure_searcher
  return if latest?

  @sub_readers = @model_classes.map do |clazz|
    index_dir = clazz.aaf_configuration[:index_dir]
    begin
      Ferret::Index::IndexReader.new(index_dir)
    rescue StandardError => e
      raise "error opening #{index_dir}: #{e}"
    end
  end
  close
  @reader = Ferret::Index::IndexReader.new(@sub_readers)
  @searcher = Ferret::Search::Searcher.new(@reader)
end
80
+
81
+ end # of class MultiIndex
82
+
83
+ end