pager-ultrasphinx 1.0.20080510

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,385 @@
+
+module Ultrasphinx
+  class Search
+    module Internals
+
+      INFINITY = 1/0.0
+
+      include Associations
+
+      # These methods are kept stateless to ease debugging
+
+      private
+
+      def build_request_with_options opts
+
+        request = Riddle::Client.new
+
+        # Basic options
+        request.instance_eval do
+          @server = Ultrasphinx::CLIENT_SETTINGS['server_host']
+          @port = Ultrasphinx::CLIENT_SETTINGS['server_port']
+          @match_mode = :extended # Force extended query mode
+          @offset = opts['per_page'] * (opts['page'] - 1)
+          @limit = opts['per_page']
+          @max_matches = [@offset + @limit + Ultrasphinx::Search.client_options['max_matches_offset'], MAX_MATCHES].min
+        end
+
+        # Geosearch location
+        loc = opts['location']
+        loc.stringify_keys!
+        lat, long = loc['lat'], loc['long']
+        if lat and long
+          # Convert degrees to radians, if requested
+          if loc['units'] == 'degrees'
+            lat = degrees_to_radians(lat)
+            long = degrees_to_radians(long)
+          end
+          # Set the location/anchor point
+          request.set_anchor(loc['lat_attribute_name'], lat, loc['long_attribute_name'], long)
+        end
+
+        # Sorting
+        sort_by = opts['sort_by']
+        if options['location']
+          case sort_by
+            when "distance asc", "distance" then sort_by = "@geodist asc"
+            when "distance desc" then sort_by = "@geodist desc"
+          end
+        end
+
+        # Use the additional sortable column if it is a text type
+        sort_by += "_sortable" if Fields.instance.types[sort_by] == "text"
+
+        unless sort_by.blank?
+          if opts['sort_mode'].to_s == 'relevance'
+            # If you're sorting by a field you don't want 'relevance' order
+            raise UsageError, "Sort mode 'relevance' is not valid with a sort_by field"
+          end
+          request.sort_by = sort_by.to_s
+        end
+
+        if sort_mode = SPHINX_CLIENT_PARAMS['sort_mode'][opts['sort_mode']]
+          request.sort_mode = sort_mode
+        else
+          raise UsageError, "Sort mode #{opts['sort_mode'].inspect} is invalid"
+        end
+
+        # Weighting
+        weights = opts['weights']
+        if weights.any?
+          # Order according to the field order for Sphinx, and set the missing fields to 1.0
+          ordered_weights = []
+          Fields.instance.types.map do |name, type|
+            name if type == 'text'
+          end.compact.sort.each do |name|
+            ordered_weights << (weights[name] || 1.0)
+          end
+          request.weights = ordered_weights
+        end
+
+        # Class names
+        unless Array(opts['class_names']).empty?
+          request.filters << Riddle::Client::Filter.new(
+            'class_id',
+            (opts['class_names'].map do |model|
+              MODELS_TO_IDS[model.to_s] or
+                MODELS_TO_IDS[model.to_s.constantize.base_class.to_s] or
+                raise UsageError, "Invalid class name #{model.inspect}"
+            end),
+            false)
+        end
+
+        # Extract raw filters
+        # XXX This is poorly done. We should coerce based on the Field types, not the value class.
+        # That would also allow us to move numeric filters from the query string into the hash.
+        Array(opts['filters']).each do |field, value|
+
+          field = field.to_s
+          type = Fields.instance.types[field]
+
+          # Special derived attribute
+          if field == 'distance' and options['location']
+            field, type = '@geodist', 'float'
+          end
+
+          raise UsageError, "field #{field.inspect} is invalid" unless type
+
+          begin
+            case value
+              when Integer, Float, BigDecimal, NilClass, Array
+                # XXX Hack to force floats to be floats
+                value = value.to_f if type == 'float'
+                # Just bomb the filter in there
+                request.filters << Riddle::Client::Filter.new(field, Array(value), false)
+              when Range
+                # Make sure ranges point in the right direction
+                min, max = [value.begin, value.end].map {|x| x._to_numeric }
+                raise NoMethodError unless min <=> max and max <=> min
+                min, max = max, min if min > max
+                # XXX Hack to force floats to be floats
+                min, max = min.to_f, max.to_f if type == 'float'
+                request.filters << Riddle::Client::Filter.new(field, min..max, false)
+              when String
+                # XXX Hack to move text filters into the query
+                opts['parsed_query'] << " @#{field} #{value}"
+              else
+                raise NoMethodError
+            end
+          rescue NoMethodError => e
+            raise UsageError, "Filter value #{value.inspect} for field #{field.inspect} is invalid"
+          end
+        end
+
+        request
+      end
+
+      def get_subtotals(original_request, query)
+        request = original_request._deep_dup
+        request.instance_eval { @filters.delete_if {|filter| filter.attribute == 'class_id'} }
+
+        facets = get_facets(request, query, 'class_id')
+
+        # Not using the standard facet caching here
+        Hash[*(MODELS_TO_IDS.map do |klass, id|
+          [klass, facets[id] || 0]
+        end.flatten)]
+      end
+
+      def get_facets(original_request, query, original_facet)
+        request, facet = original_request._deep_dup, original_facet
+        facet += "_facet" if Fields.instance.types[original_facet] == 'text'
+
+        unless Fields.instance.types[facet]
+          if facet == original_facet
+            raise UsageError, "Field #{original_facet} does not exist"
+          else
+            raise UsageError, "Field #{original_facet} is a text field, but was not configured for text faceting"
+          end
+        end
+
+        # Set the facet query parameter and modify the per-page setting so we get all the facets
+        request.instance_eval do
+          @group_by = facet
+          @group_function = :attr
+          @group_clauses = '@count desc'
+          @offset = 0
+          @limit = Ultrasphinx::Search.client_options['max_facets']
+          @max_matches = [@limit + Ultrasphinx::Search.client_options['max_matches_offset'], MAX_MATCHES].min
+        end
+
+        # Run the query
+        begin
+          matches = request.query(query, options['indexes'])[:matches]
+        rescue DaemonError
+          raise ConfigurationError, "Index seems out of date. Run 'rake ultrasphinx:index'"
+        end
+
+        # Map the facets back to something sane
+        facets = {}
+        matches.each do |match|
+          attributes = match[:attributes]
+          raise DaemonError if facets[attributes['@groupby']]
+          facets[attributes['@groupby']] = attributes['@count']
+        end
+
+        # Invert the hashes back to their original text values, if we have them
+        reverse_map_facets(facets, original_facet)
+      end
+
+      def reverse_map_facets(facets, facet)
+        facets = facets.dup
+
+        if Fields.instance.types[facet] == 'text'
+          # Apply the map, rebuilding if the cache is missing or out-of-date
+          facets = Hash[*(facets.map do |hash, value|
+            rebuild_facet_cache(facet) unless FACET_CACHE[facet] and FACET_CACHE[facet].has_key?(hash)
+            [FACET_CACHE[facet][hash], value]
+          end.flatten)]
+        end
+
+        facets
+      end
+
+      def rebuild_facet_cache(facet)
+        # Cache the reverse hash map for the textual facet if it hasn't been done yet
+        # XXX Not necessarily optimal since it requires a direct DB hit once per mongrel
+        Ultrasphinx.say "caching hash reverse map for text facet #{facet}"
+
+        configured_classes = Fields.instance.classes[facet].map do |klass|
+
+          # Concatenates might not work well
+          type, configuration = nil, nil
+          MODEL_CONFIGURATION[klass.name].except('conditions', 'delta').each do |_type, values|
+            type = _type
+            configuration = values.detect { |this_field| this_field['as'] == facet }
+            break if configuration
+          end
+
+          unless configuration and configuration['facet']
+            Ultrasphinx.say "model #{klass.name} has the requested '#{facet}' field, but it was not configured for faceting, and will be skipped"
+            next
+          end
+
+          FACET_CACHE[facet] ||= {}
+
+          # XXX This is a duplication of stuff already known in configure.rb, and ought to be cleaned up,
+          # but that would mean we have to either parse the .conf or configure every time at boot
+
+          field_string, join_string = case type
+            when 'fields'
+              [configuration['field'], ""]
+            when 'include'
+              # XXX Only handles the basic case. No test coverage.
+
+              table_alias = configuration['table_alias']
+              association_model = if configuration['class_name']
+                configuration['class_name'].constantize
+              else
+                get_association_model(klass, configuration)
+              end
+
+              ["#{table_alias}.#{configuration['field']}",
+                (configuration['association_sql'] or "LEFT OUTER JOIN #{association_model.table_name} AS #{table_alias} ON #{table_alias}.#{klass.to_s.downcase}_id = #{klass.table_name}.#{association_model.primary_key}")
+              ]
+            when 'concatenate'
+              # Wait for someone to complain before worrying about this
+              raise "Concatenation text facets have not been implemented"
+          end
+
+          klass.connection.execute("SELECT #{field_string} AS value, #{SQL_FUNCTIONS[ADAPTER]['hash']._interpolate(field_string)} AS hash FROM #{klass.table_name} #{join_string} GROUP BY value").each do |value, hash|
+            FACET_CACHE[facet][hash.to_i] = value
+          end
+          klass
+        end
+
+        configured_classes.compact!
+        raise ConfigurationError, "no classes were correctly configured for text faceting on '#{facet}'" if configured_classes.empty?
+
+        FACET_CACHE[facet]
+      end
+
+      # Inverse-modulus map the Sphinx ids to the table-specific ids
+      def convert_sphinx_ids(sphinx_ids)
+
+        number_of_models = IDS_TO_MODELS.size
+        raise ConfigurationError, "No model mappings were found. Your #{RAILS_ENV}.conf file is corrupted, or your application container needs to be restarted." if number_of_models == 0
+
+        sphinx_ids.sort_by do |item|
+          item[:index]
+        end.map do |item|
+          class_name = IDS_TO_MODELS[item[:doc] % number_of_models]
+          raise DaemonError, "Impossible Sphinx document id #{item[:doc]} in query result" unless class_name
+          [class_name, item[:doc] / number_of_models]
+        end
+      end
+
+      # Fetch them for real
+      def reify_results(ids)
+        results = []
+
+        ids_hash = {}
+        ids.each do |class_name, id|
+          (ids_hash[class_name] ||= []) << id
+        end
+
+        ids.map {|ary| ary.first}.uniq.each do |class_name|
+          klass = class_name.constantize
+
+          finder = (
+            Ultrasphinx::Search.client_options['finder_methods'].detect do |method_name|
+              klass.respond_to? method_name
+            end or
+            # XXX This default is kind of buried, but I'm not sure why you would need it to be
+            # configurable, since you can use ['finder_methods'].
+            "find_all_by_#{klass.primary_key}"
+          )
+
+          records = klass.send(finder, ids_hash[class_name])
+
+          unless Ultrasphinx::Search.client_options['ignore_missing_records']
+            if records.size != ids_hash[class_name].size
+              missed_ids = ids_hash[class_name] - records.map(&:id)
+              msg = if missed_ids.size == 1
+                "Couldn't find #{class_name} with ID=#{missed_ids.first}"
+              else
+                "Couldn't find #{class_name.pluralize} with IDs: #{missed_ids.join(',')} (found #{records.size} results, but was looking for #{ids_hash[class_name].size})"
+              end
+              raise ActiveRecord::RecordNotFound, msg
+            end
+          end
+
+          records.each do |record|
+            results[ids.index([class_name, record.id])] = record
+          end
+        end
+
+        # Add an accessor for global search rank for each record, if requested
+        if self.class.client_options['with_global_rank']
+          # XXX Nobody uses this
+          results.each_with_index do |result, index|
+            if result
+              global_index = per_page * (current_page - 1) + index
+              result.instance_variable_get('@attributes')['result_index'] = global_index
+            end
+          end
+        end
+
+        # Add an accessor for distance, if requested
+        if self.options['location']['lat'] and self.options['location']['long']
+          results.each_with_index do |result, index|
+            if result
+              distance = (response[:matches][index][:attributes]['@geodist'] or INFINITY)
+              result.instance_variable_get('@attributes')['distance'] = distance
+            end
+          end
+        end
+
+        results.compact!
+
+        if ids.size - results.size > Ultrasphinx::Search.client_options['max_missing_records']
+          # Never reached if Ultrasphinx::Search.client_options['ignore_missing_records'] is false, because the check above raises first
+          raise ConfigurationError, "Too many results for this query returned ActiveRecord::RecordNotFound. The index is probably out of date"
+        end
+
+        results
+      end
+
+      def perform_action_with_retries
+        tries = 0
+        exceptions = [NoMethodError, Riddle::VersionError, Riddle::ResponseError, Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::EPIPE]
+        begin
+          yield
+        rescue *exceptions => e
+          tries += 1
+          if tries <= Ultrasphinx::Search.client_options['max_retries']
+            say "restarting query (#{tries} attempts already) (#{e})"
+            sleep(Ultrasphinx::Search.client_options['retry_sleep_time'])
+            retry
+          else
+            say "query failed"
+            # Clear the rescue list, retry one last time, and let the error fail up the stack
+            exceptions = []
+            retry
+          end
+        end
+      end
+
+      def strip_bogus_characters(s)
+        # Used to remove some garbage before highlighting
+        s.gsub(/<.*?>|\.\.\.|\342\200\246|\n|\r/, " ").gsub(/http.*?( |$)/, ' ') if s
+      end
+
+      def strip_query_commands(s)
+        # XXX Hack for query commands, since Sphinx doesn't intelligently parse the query in excerpt mode
+        # Also removes apostrophes in the middle of words so that they don't get split in two.
+        s.gsub(/(^|\s)(AND|OR|NOT|\@\w+)(\s|$)/i, "").gsub(/(\w)\'(\w)/, '\1\2')
+      end
+
+      def degrees_to_radians(value)
+        Math::PI * value / 180.0
+      end
+
+    end
+  end
+end
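
The convert_sphinx_ids helper above undoes the modulus scheme Ultrasphinx uses to pack records from several models into a single Sphinx index: each document id encodes both a class id and a record id. A minimal standalone sketch of that round trip, using a hypothetical two-model mapping (the encode step is inferred from the decode logic above, not taken from the gem):

    # Hypothetical mapping; the real IDS_TO_MODELS is built from the generated .conf file.
    IDS_TO_MODELS = { 0 => 'User', 1 => 'Post' }
    number_of_models = IDS_TO_MODELS.size

    # Presumed encoding on the indexer side: record id interleaved with a class id.
    doc_id = 42 * number_of_models + 1        # => 85

    # The decode performed by convert_sphinx_ids:
    IDS_TO_MODELS[doc_id % number_of_models]  # => "Post"
    doc_id / number_of_models                 # => 42
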
@@ -0,0 +1,139 @@
+
+module Ultrasphinx
+  class Search
+    module Parser
+      # We could rewrite this in Treetop, but for now it works well.
+
+      class Error < RuntimeError
+      end
+
+      OPERATORS = {
+        'OR' => '|',
+        'AND' => '',
+        'NOT' => '-',
+        'or' => '|',
+        'and' => '',
+        'not' => '-'
+      }
+
+      private
+
+      def parse query
+        # Alters a Google query string into Sphinx 0.97 style
+        return "" if query.blank?
+        # Parse
+        token_hash = token_stream_to_hash(query_to_token_stream(query))
+        # Join everything up and remove some spaces
+        token_hash_to_array(token_hash).join(" ").squeeze(" ").strip
+      end
+
+
+      def token_hash_to_array(token_hash)
+        query = []
+
+        token_hash.sort_by do |key, value|
+          key or ""
+        end.each do |field, contents|
+          # First operator always goes outside
+          query << contents.first.first
+
+          query << "@#{field}" if field
+          query << "(" if field and contents.size > 1
+
+          contents.each_with_index do |op_and_content, index|
+            op, content = op_and_content
+            query << op unless index == 0
+            query << content
+          end
+
+          query << ")" if field and contents.size > 1
+        end
+
+        # Collapse fieldsets early so that the swap doesn't split them
+        query.each_with_index do |token, index|
+          if token =~ /^@/
+            query[index] = "#{token} #{query[index + 1]}"
+            query[index + 1] = nil
+          end
+        end
+
+        # Swap the first pair if the order is reversed
+        if [OPERATORS['NOT'], OPERATORS['OR']].include? query.first.upcase
+          query[0], query[1] = query[1], query[0]
+        end
+
+        query
+      end
+
+
+      def query_to_token_stream(query)
+        # First, split the query on spaces that are not inside sets of quotes or parens
+
+        query = query.scan(/[^"() ]*["(][^")]*[")]|[^"() ]+/)
+
+        token_stream = []
+        has_operator = false
+
+        query.each_with_index do |subtoken, index|
+
+          # Recurse for parens, if necessary
+          if subtoken =~ /^(.*?)\((.*)\)(.*?$)/
+            subtoken = query[index] = "#{$1}(#{parse $2})#{$3}"
+          end
+
+          # Reappend missing closing quotes
+          if subtoken =~ /(^|\:)\"/
+            subtoken = subtoken.chomp('"') + '"'
+          end
+
+          # Strip parentheses within quoted strings
+          if subtoken =~ /\"(.*)\"/
+            subtoken.sub!($1, $1.gsub(/[()]/, ''))
+          end
+
+          # Add to the stream, converting the operator
+          if !has_operator
+            if OPERATORS.to_a.flatten.include? subtoken and index != (query.size - 1)
+              # Note that operators at the end of the string are not parsed
+              token_stream << (OPERATORS[subtoken] || subtoken)
+              has_operator = true # flip
+            else
+              token_stream << ""
+              token_stream << subtoken
+            end
+          else
+            if OPERATORS.to_a.flatten.include? subtoken
+              # Drop the extra operator
+            else
+              token_stream << subtoken
+              has_operator = false # flop
+            end
+          end
+        end
+
+        if token_stream.size.zero? or token_stream.size.odd?
+          raise Error, "#{token_stream.inspect} is not a valid token stream"
+        end
+        token_stream.in_groups_of(2)
+      end
+
+
+      def token_stream_to_hash(token_stream)
+        token_hash = Hash.new([])
+        token_stream.map do |operator, content|
+          # Remove some spaces
+          content.gsub!(/^"\s+|\s+"$/, '"')
+          # Convert fields into sphinx style, reformat the stream object
+          if content =~ /(.*?):(.*)/
+            token_hash[$1] += [[operator, $2]]
+          else
+            token_hash[nil] += [[operator, content]]
+          end
+        end
+        token_hash
+      end
+
+
+    end
+  end
+end
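
Taken together, the private helpers above rewrite a Google-style query into Sphinx extended syntax. A rough usage sketch (in a Rails console, where ActiveSupport's blank? and in_groups_of are available; the expected outputs are hand-traced from the code above, not captured from the gem):

    include Ultrasphinx::Search::Parser  # parse is private, so mix the module in to experiment

    parse("cat OR dog")   # => "cat | dog"   ('OR' becomes '|', 'AND' becomes an implicit space)
    parse("title:cats")   # => "@title cats" (field:value becomes Sphinx @field syntax)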