pager-ultrasphinx 1.0.20080510

@@ -0,0 +1,385 @@
+
+ module Ultrasphinx
+   class Search
+     module Internals
+
+       INFINITY = 1/0.0
+
+       include Associations
+
+       # These methods are kept stateless to ease debugging
+
+       private
+
+       def build_request_with_options opts
+
+         request = Riddle::Client.new
+
+         # Basic options
+         request.instance_eval do
+           @server = Ultrasphinx::CLIENT_SETTINGS['server_host']
+           @port = Ultrasphinx::CLIENT_SETTINGS['server_port']
+           @match_mode = :extended # Force extended query mode
+           @offset = opts['per_page'] * (opts['page'] - 1)
+           @limit = opts['per_page']
+           @max_matches = [@offset + @limit + Ultrasphinx::Search.client_options['max_matches_offset'], MAX_MATCHES].min
+         end
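+
+         # Illustration (hypothetical values): with 'page' => 3 and 'per_page' => 20,
+         # @offset is 40 and @limit is 20, i.e. Sphinx is asked for results 41-60.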
+
+         # Geosearch location
+         loc = opts['location']
+         loc.stringify_keys!
+         lat, long = loc['lat'], loc['long']
+         if lat and long
+           # Convert degrees to radians, if requested
+           if loc['units'] == 'degrees'
+             lat = degrees_to_radians(lat)
+             long = degrees_to_radians(long)
+           end
+           # Set the location/anchor point
+           request.set_anchor(loc['lat_attribute_name'], lat, loc['long_attribute_name'], long)
+         end
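+
+         # Once an anchor is set, Sphinx exposes the computed distance to each match as
+         # the virtual '@geodist' attribute, which the sorting and filtering below use.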
+
+         # Sorting
+         sort_by = opts['sort_by']
+         if options['location']
+           case sort_by
+             when "distance asc", "distance" then sort_by = "@geodist asc"
+             when "distance desc" then sort_by = "@geodist desc"
+           end
+         end
+
+         # Use the additional sortable column if it is a text type
+         sort_by += "_sortable" if Fields.instance.types[sort_by] == "text"
+
+         unless sort_by.blank?
+           if opts['sort_mode'].to_s == 'relevance'
+             # If you're sorting by a field you don't want 'relevance' order
+             raise UsageError, "Sort mode 'relevance' is not valid with a sort_by field"
+           end
+           request.sort_by = sort_by.to_s
+         end
+
+         if sort_mode = SPHINX_CLIENT_PARAMS['sort_mode'][opts['sort_mode']]
+           request.sort_mode = sort_mode
+         else
+           raise UsageError, "Sort mode #{opts['sort_mode'].inspect} is invalid"
+         end
+
+         # Weighting
+         weights = opts['weights']
+         if weights.any?
+           # Order according to the field order for Sphinx, and set the missing fields to 1.0
+           ordered_weights = []
+           Fields.instance.types.map do |name, type|
+             name if type == 'text'
+           end.compact.sort.each do |name|
+             ordered_weights << (weights[name] || 1.0)
+           end
+           request.weights = ordered_weights
+         end
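+
+         # Illustration (hypothetical fields): with text fields ['body', 'title'] and
+         # 'weights' => {'title' => 2.0}, ordered_weights ends up as [1.0, 2.0],
+         # matching the alphabetical field order sent to Sphinx.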
+
+         # Class names
+         unless Array(opts['class_names']).empty?
+           request.filters << Riddle::Client::Filter.new(
+             'class_id',
+             (opts['class_names'].map do |model|
+               MODELS_TO_IDS[model.to_s] or
+                 MODELS_TO_IDS[model.to_s.constantize.base_class.to_s] or
+                 raise UsageError, "Invalid class name #{model.inspect}"
+             end),
+             false)
+         end
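+
+         # Every Sphinx document carries the stable integer from MODELS_TO_IDS as its
+         # 'class_id' attribute, so this filter restricts results to the given models.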
+
+         # Extract raw filters
+         # XXX This is poorly done. We should coerce based on the Field types, not the value class.
+         # That would also allow us to move numeric filters from the query string into the hash.
+         Array(opts['filters']).each do |field, value|
+
+           field = field.to_s
+           type = Fields.instance.types[field]
+
+           # Special derived attribute
+           if field == 'distance' and options['location']
+             field, type = '@geodist', 'float'
+           end
+
+           raise UsageError, "field #{field.inspect} is invalid" unless type
+
+           begin
+             case value
+               when Integer, Float, BigDecimal, NilClass, Array
+                 # XXX Hack to force floats to be floats
+                 value = value.to_f if type == 'float'
+                 # Just bomb the filter in there
+                 request.filters << Riddle::Client::Filter.new(field, Array(value), false)
+               when Range
+                 # Make sure ranges point in the right direction
+                 min, max = [value.begin, value.end].map {|x| x._to_numeric }
+                 raise NoMethodError unless min <=> max and max <=> min
+                 min, max = max, min if min > max
+                 # XXX Hack to force floats to be floats
+                 min, max = min.to_f, max.to_f if type == 'float'
+                 request.filters << Riddle::Client::Filter.new(field, min..max, false)
+               when String
+                 # XXX Hack to move text filters into the query
+                 opts['parsed_query'] << " @#{field} #{value}"
+               else
+                 raise NoMethodError
+             end
+           rescue NoMethodError => e
+             raise UsageError, "Filter value #{value.inspect} for field #{field.inspect} is invalid"
+           end
+         end
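+
+         # Illustration (hypothetical fields) of the coercion rules above:
+         #   'filters' => {'price' => 3..10.5}     # becomes a Sphinx range filter
+         #   'filters' => {'user_id' => [1, 2, 3]} # becomes a value-list filter
+         #   'filters' => {'title' => 'cats'}      # is appended to the query as '@title cats'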
+
+         request
+       end
+
+       def get_subtotals(original_request, query)
+         request = original_request._deep_dup
+         request.instance_eval { @filters.delete_if {|filter| filter.attribute == 'class_id'} }
+
+         facets = get_facets(request, query, 'class_id')
+
+         # Not using the standard facet caching here
+         Hash[*(MODELS_TO_IDS.map do |klass, id|
+           [klass, facets[id] || 0]
+         end.flatten)]
+       end
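+
+       # Illustration (hypothetical models): the subtotal hash maps every indexed class
+       # to its match count for the query, e.g. {'User' => 12, 'Post' => 0}.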
+
+       def get_facets(original_request, query, original_facet)
+         request, facet = original_request._deep_dup, original_facet
+         facet += "_facet" if Fields.instance.types[original_facet] == 'text'
+
+         unless Fields.instance.types[facet]
+           if facet == original_facet
+             raise UsageError, "Field #{original_facet} does not exist"
+           else
+             raise UsageError, "Field #{original_facet} is a text field, but was not configured for text faceting"
+           end
+         end
+
+         # Set the facet query parameter and modify per-page setting so we snag all the facets
+         request.instance_eval do
+           @group_by = facet
+           @group_function = :attr
+           @group_clauses = '@count desc'
+           @offset = 0
+           @limit = Ultrasphinx::Search.client_options['max_facets']
+           @max_matches = [@limit + Ultrasphinx::Search.client_options['max_matches_offset'], MAX_MATCHES].min
+         end
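+
+         # With :attr grouping, Sphinx returns one match per distinct attribute value,
+         # carrying the value in '@groupby' and the group size in '@count'.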
+
+         # Run the query
+         begin
+           matches = request.query(query, options['indexes'])[:matches]
+         rescue DaemonError
+           raise ConfigurationError, "Index seems out of date. Run 'rake ultrasphinx:index'"
+         end
+
+         # Map the facets back to something sane
+         facets = {}
+         matches.each do |match|
+           attributes = match[:attributes]
+           raise DaemonError if facets[attributes['@groupby']]
+           facets[attributes['@groupby']] = attributes['@count']
+         end
+
+         # Invert the hashes back into strings, if we have them
+         reverse_map_facets(facets, original_facet)
+       end
+
+       def reverse_map_facets(facets, facet)
+         facets = facets.dup
+
+         if Fields.instance.types[facet] == 'text'
+           # Apply the map, rebuilding if the cache is missing or out-of-date
+           facets = Hash[*(facets.map do |hash, value|
+             rebuild_facet_cache(facet) unless FACET_CACHE[facet] and FACET_CACHE[facet].has_key?(hash)
+             [FACET_CACHE[facet][hash], value]
+           end.flatten)]
+         end
+
+         facets
+       end
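+
+       # Illustration (hypothetical values): Sphinx stores text facets as integer hashes,
+       # so {2556460786 => 3} might come back as {'ruby' => 3} once mapped through FACET_CACHE.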
+
+       def rebuild_facet_cache(facet)
+         # Cache the reverse hash map for the textual facet if it hasn't been done yet
+         # XXX Not necessarily optimal since it requires a direct DB hit once per mongrel
+         Ultrasphinx.say "caching hash reverse map for text facet #{facet}"
+
+         configured_classes = Fields.instance.classes[facet].map do |klass|
+
+           # 'concatenate' fields might not work well here
+           type, configuration = nil, nil
+           MODEL_CONFIGURATION[klass.name].except('conditions', 'delta').each do |_type, values|
+             type = _type
+             configuration = values.detect { |this_field| this_field['as'] == facet }
+             break if configuration
+           end
+
+           unless configuration and configuration['facet']
+             Ultrasphinx.say "model #{klass.name} has the requested '#{facet}' field, but it was not configured for faceting, and will be skipped"
+             next
+           end
+
+           FACET_CACHE[facet] ||= {}
+
+           # XXX This is a duplication of stuff already known in configure.rb, and ought to be cleaned up,
+           # but that would mean we have to either parse the .conf or configure every time at boot
+
+           field_string, join_string = case type
+             when 'fields'
+               [configuration['field'], ""]
+             when 'include'
+               # XXX Only handles the basic case. No test coverage.
+
+               table_alias = configuration['table_alias']
+               association_model = if configuration['class_name']
+                 configuration['class_name'].constantize
+               else
+                 get_association_model(klass, configuration)
+               end
+
+               ["#{table_alias}.#{configuration['field']}",
+                 (configuration['association_sql'] or "LEFT OUTER JOIN #{association_model.table_name} AS #{table_alias} ON #{table_alias}.#{klass.to_s.downcase}_id = #{klass.table_name}.#{association_model.primary_key}")
+               ]
+             when 'concatenate'
+               # Wait for someone to complain before worrying about this
+               raise "Concatenation text facets have not been implemented"
+           end
+
+           klass.connection.execute("SELECT #{field_string} AS value, #{SQL_FUNCTIONS[ADAPTER]['hash']._interpolate(field_string)} AS hash FROM #{klass.table_name} #{join_string} GROUP BY value").each do |value, hash|
+             FACET_CACHE[facet][hash.to_i] = value
+           end
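+
+           # Illustration (hypothetical MySQL schema): the statement above might expand to
+           #   SELECT name AS value, CRC32(name) AS hash FROM tags GROUP BY value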
+           klass
+         end
+
+         configured_classes.compact!
+         raise ConfigurationError, "no classes were correctly configured for text faceting on '#{facet}'" if configured_classes.empty?
+
+         FACET_CACHE[facet]
+       end
+
+       # Inverse-modulus map the Sphinx ids to the table-specific ids
+       def convert_sphinx_ids(sphinx_ids)
+
+         number_of_models = IDS_TO_MODELS.size
+         raise ConfigurationError, "No model mappings were found. Your #{RAILS_ENV}.conf file is corrupted, or your application container needs to be restarted." if number_of_models == 0
+
+         sphinx_ids.sort_by do |item|
+           item[:index]
+         end.map do |item|
+           class_name = IDS_TO_MODELS[item[:doc] % number_of_models]
+           raise DaemonError, "Impossible Sphinx document id #{item[:doc]} in query result" unless class_name
+           [class_name, item[:doc] / number_of_models]
+         end
+       end
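+
+       # Illustration: with two models {0 => 'User', 1 => 'Post'}, Sphinx document id 7
+       # decodes as 7 % 2 == 1 => 'Post', and 7 / 2 == 3, i.e. the Post with id 3.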
+
+       # Fetch them for real
+       def reify_results(ids)
+         results = []
+
+         ids_hash = {}
+         ids.each do |class_name, id|
+           (ids_hash[class_name] ||= []) << id
+         end
+
+         ids.map {|ary| ary.first}.uniq.each do |class_name|
+           klass = class_name.constantize
+
+           finder = (
+             Ultrasphinx::Search.client_options['finder_methods'].detect do |method_name|
+               klass.respond_to? method_name
+             end or
+             # XXX This default is kind of buried, but I'm not sure why you would need it to be
+             # configurable, since you can use ['finder_methods'].
+             "find_all_by_#{klass.primary_key}"
+           )
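+
+           # For a typical ActiveRecord model this falls back to 'find_all_by_id',
+           # fetching all of the class's matches in a single query.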
+
+           records = klass.send(finder, ids_hash[class_name])
+
+           unless Ultrasphinx::Search.client_options['ignore_missing_records']
+             if records.size != ids_hash[class_name].size
+               missed_ids = ids_hash[class_name] - records.map(&:id)
+               msg = if missed_ids.size == 1
+                 "Couldn't find #{class_name} with ID=#{missed_ids.first}"
+               else
+                 "Couldn't find #{class_name.pluralize} with IDs: #{missed_ids.join(',')} (found #{records.size} results, but was looking for #{ids_hash[class_name].size})"
+               end
+               raise ActiveRecord::RecordNotFound, msg
+             end
+           end
+
+           records.each do |record|
+             results[ids.index([class_name, record.id])] = record
+           end
+         end
+
+         # Add an accessor for global search rank for each record, if requested
+         if self.class.client_options['with_global_rank']
+           # XXX Nobody uses this
+           results.each_with_index do |result, index|
+             if result
+               global_index = per_page * (current_page - 1) + index
+               result.instance_variable_get('@attributes')['result_index'] = global_index
+             end
+           end
+         end
+
+         # Add an accessor for distance, if requested
+         if self.options['location']['lat'] and self.options['location']['long']
+           results.each_with_index do |result, index|
+             if result
+               distance = (response[:matches][index][:attributes]['@geodist'] or INFINITY)
+               result.instance_variable_get('@attributes')['distance'] = distance
+             end
+           end
+         end
+
+         results.compact!
+
+         if ids.size - results.size > Ultrasphinx::Search.client_options['max_missing_records']
+           # Only reachable when 'ignore_missing_records' is true; otherwise the RecordNotFound above fires first
+           raise ConfigurationError, "Too many results for this query returned ActiveRecord::RecordNotFound. The index is probably out of date"
+         end
+
+         results
+       end
+
+       def perform_action_with_retries
+         tries = 0
+         exceptions = [NoMethodError, Riddle::VersionError, Riddle::ResponseError, Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::EPIPE]
+         begin
+           yield
+         rescue *exceptions => e
+           tries += 1
+           if tries <= Ultrasphinx::Search.client_options['max_retries']
+             say "restarting query (#{tries} attempts already) (#{e})"
+             sleep(Ultrasphinx::Search.client_options['retry_sleep_time'])
+             retry
+           else
+             say "query failed"
+             # Clear the rescue list, retry one last time, and let the error fail up the stack
+             exceptions = []
+             retry
+           end
+         end
+       end
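+
+       # Illustration (hypothetical call site): wrap the Sphinx round-trip, e.g.
+       #   perform_action_with_retries { request.query(query, options['indexes']) }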
+
+       def strip_bogus_characters(s)
+         # Used to remove some garbage before highlighting
+         s.gsub(/<.*?>|\.\.\.|\342\200\246|\n|\r/, " ").gsub(/http.*?( |$)/, ' ') if s
+       end
+
+       def strip_query_commands(s)
+         # XXX Hack for query commands, since Sphinx doesn't intelligently parse the query in excerpt mode
+         # Also removes apostrophes in the middle of words so that they don't get split in two.
+         s.gsub(/(^|\s)(AND|OR|NOT|\@\w+)(\s|$)/i, "").gsub(/(\w)\'(\w)/, '\1\2')
+       end
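+
+       # Illustration: strip_query_commands("NOT isn't") # => "isnt"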
+
+       def degrees_to_radians(value)
+         Math::PI * value / 180.0
+       end
+
+     end
+   end
+ end
@@ -0,0 +1,139 @@
+
+ module Ultrasphinx
+   class Search
+     module Parser
+       # We could rewrite this in Treetop, but for now it works well.
+
+       class Error < RuntimeError
+       end
+
+       OPERATORS = {
+         'OR' => '|',
+         'AND' => '',
+         'NOT' => '-',
+         'or' => '|',
+         'and' => '',
+         'not' => '-'
+       }
+
+       private
+
+       def parse query
+         # Converts a Google-style query string into Sphinx 0.9.7 query syntax
+         return "" if query.blank?
+         # Parse
+         token_hash = token_stream_to_hash(query_to_token_stream(query))
+         # Join everything up and remove some spaces
+         token_hash_to_array(token_hash).join(" ").squeeze(" ").strip
+       end
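+
+       # Illustration (hypothetical query):
+       #   parse("cats OR dogs title:kittens") # => "cats | dogs @title kittens"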
+
+
+       def token_hash_to_array(token_hash)
+         query = []
+
+         token_hash.sort_by do |key, value|
+           key or ""
+         end.each do |field, contents|
+           # First operator always goes outside
+           query << contents.first.first
+
+           query << "@#{field}" if field
+           query << "(" if field and contents.size > 1
+
+           contents.each_with_index do |op_and_content, index|
+             op, content = op_and_content
+             query << op unless index == 0
+             query << content
+           end
+
+           query << ")" if field and contents.size > 1
+         end
+
+         # Collapse fieldsets early so that the swap doesn't split them
+         query.each_with_index do |token, index|
+           if token =~ /^@/
+             query[index] = "#{token} #{query[index + 1]}"
+             query[index + 1] = nil
+           end
+         end
+
+         # Swap the first pair if the order is reversed
+         if [OPERATORS['NOT'], OPERATORS['OR']].include? query.first.upcase
+           query[0], query[1] = query[1], query[0]
+         end
+
+         query
+       end
+
+
+       def query_to_token_stream(query)
+         # First, split query on spaces that are not inside sets of quotes or parens
+
+         query = query.scan(/[^"() ]*["(][^")]*[")]|[^"() ]+/)
+
+         token_stream = []
+         has_operator = false
+
+         query.each_with_index do |subtoken, index|
+
+           # Recurse for parens, if necessary
+           if subtoken =~ /^(.*?)\((.*)\)(.*?$)/
+             subtoken = query[index] = "#{$1}(#{parse $2})#{$3}"
+           end
+
+           # Reappend missing closing quotes
+           if subtoken =~ /(^|\:)\"/
+             subtoken = subtoken.chomp('"') + '"'
+           end
+
+           # Strip parentheses within quoted strings
+           if subtoken =~ /\"(.*)\"/
+             subtoken.sub!($1, $1.gsub(/[()]/, ''))
+           end
+
+           # Add to the stream, converting the operator
+           if !has_operator
+             if OPERATORS.to_a.flatten.include? subtoken and index != (query.size - 1)
+               # Note that operators at the end of the string are not parsed
+ token_stream << OPERATORS[subtoken] || subtoken
99
+ has_operator = true # flip
100
+ else
101
+ token_stream << ""
102
+ token_stream << subtoken
103
+ end
104
+ else
105
+ if OPERATORS.to_a.flatten.include? subtoken
106
+ # Drop extra operator
107
+ else
108
+ token_stream << subtoken
109
+ has_operator = false # flop
110
+ end
111
+ end
112
+ end
113
+
114
+ if token_stream.size.zero? or token_stream.size.odd?
115
+ raise Error, "#{token_stream.inspect} is not a valid token stream"
116
+ end
117
+ token_stream.in_groups_of(2)
118
+ end
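+
+       # Illustration: query_to_token_stream("cats OR dogs")
+       #   # => [["", "cats"], ["|", "dogs"]]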
+
+
+       def token_stream_to_hash(token_stream)
+         token_hash = Hash.new([])
+         token_stream.map do |operator, content|
+           # Remove some spaces
+           content.gsub!(/^"\s+|\s+"$/, '"')
+           # Convert fields into Sphinx style, and reformat the stream element
+           if content =~ /(.*?):(.*)/
+             token_hash[$1] += [[operator, $2]]
+           else
+             token_hash[nil] += [[operator, content]]
+           end
+         end
+         token_hash
+       end
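+
+       # Illustration: token_stream_to_hash([["", "cats"], ["|", "title:kittens"]])
+       #   # => {nil => [["", "cats"]], "title" => [["|", "kittens"]]}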
+
+
+     end
+   end
+ end