mm_es_search 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. data/.gitignore +4 -0
  2. data/.project +18 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +1 -0
  5. data/lib/mm_es_search/api/facet/abstract_facet.rb +28 -0
  6. data/lib/mm_es_search/api/facet/date_histogram_facet.rb +11 -0
  7. data/lib/mm_es_search/api/facet/filter_facet.rb +9 -0
  8. data/lib/mm_es_search/api/facet/geo_distance_facet.rb +9 -0
  9. data/lib/mm_es_search/api/facet/histogram_facet.rb +9 -0
  10. data/lib/mm_es_search/api/facet/query_facet.rb +9 -0
  11. data/lib/mm_es_search/api/facet/range_facet.rb +36 -0
  12. data/lib/mm_es_search/api/facet/range_facet_row.rb +97 -0
  13. data/lib/mm_es_search/api/facet/range_item.rb +17 -0
  14. data/lib/mm_es_search/api/facet/statistical_facet.rb +33 -0
  15. data/lib/mm_es_search/api/facet/statistical_facet_result.rb +36 -0
  16. data/lib/mm_es_search/api/facet/terms_facet.rb +62 -0
  17. data/lib/mm_es_search/api/facet/terms_facet_row.rb +35 -0
  18. data/lib/mm_es_search/api/facet/terms_stats_facet.rb +9 -0
  19. data/lib/mm_es_search/api/highlight/result_highlight.rb +40 -0
  20. data/lib/mm_es_search/api/query/abstract_filter.rb +15 -0
  21. data/lib/mm_es_search/api/query/abstract_query.rb +48 -0
  22. data/lib/mm_es_search/api/query/and_filter.rb +9 -0
  23. data/lib/mm_es_search/api/query/bool_filter.rb +11 -0
  24. data/lib/mm_es_search/api/query/bool_query.rb +67 -0
  25. data/lib/mm_es_search/api/query/constant_score_query.rb +31 -0
  26. data/lib/mm_es_search/api/query/custom_filters_score_query.rb +52 -0
  27. data/lib/mm_es_search/api/query/custom_score_query.rb +31 -0
  28. data/lib/mm_es_search/api/query/dismax_query.rb +29 -0
  29. data/lib/mm_es_search/api/query/filtered_query.rb +30 -0
  30. data/lib/mm_es_search/api/query/has_child_filter.rb +11 -0
  31. data/lib/mm_es_search/api/query/has_child_query.rb +25 -0
  32. data/lib/mm_es_search/api/query/has_parent_filter.rb +11 -0
  33. data/lib/mm_es_search/api/query/has_parent_query.rb +25 -0
  34. data/lib/mm_es_search/api/query/match_all_filter.rb +11 -0
  35. data/lib/mm_es_search/api/query/match_all_query.rb +19 -0
  36. data/lib/mm_es_search/api/query/nested_filter.rb +22 -0
  37. data/lib/mm_es_search/api/query/nested_query.rb +62 -0
  38. data/lib/mm_es_search/api/query/not_filter.rb +9 -0
  39. data/lib/mm_es_search/api/query/or_filter.rb +9 -0
  40. data/lib/mm_es_search/api/query/prefix_filter.rb +11 -0
  41. data/lib/mm_es_search/api/query/prefix_query.rb +34 -0
  42. data/lib/mm_es_search/api/query/query_filter.rb +28 -0
  43. data/lib/mm_es_search/api/query/query_string_query.rb +37 -0
  44. data/lib/mm_es_search/api/query/range_filter.rb +11 -0
  45. data/lib/mm_es_search/api/query/range_query.rb +57 -0
  46. data/lib/mm_es_search/api/query/scored_filter.rb +29 -0
  47. data/lib/mm_es_search/api/query/single_bool_filter.rb +66 -0
  48. data/lib/mm_es_search/api/query/term_filter.rb +11 -0
  49. data/lib/mm_es_search/api/query/term_query.rb +34 -0
  50. data/lib/mm_es_search/api/query/terms_filter.rb +11 -0
  51. data/lib/mm_es_search/api/query/terms_query.rb +58 -0
  52. data/lib/mm_es_search/api/query/text_query.rb +42 -0
  53. data/lib/mm_es_search/api/query/top_children_query.rb +28 -0
  54. data/lib/mm_es_search/api/sort/root_sort.rb +36 -0
  55. data/lib/mm_es_search/models/abstract_facet_model.rb +23 -0
  56. data/lib/mm_es_search/models/abstract_query_model.rb +21 -0
  57. data/lib/mm_es_search/models/abstract_range_facet_model.rb +365 -0
  58. data/lib/mm_es_search/models/abstract_search_model.OLD +538 -0
  59. data/lib/mm_es_search/models/abstract_search_model.rb +521 -0
  60. data/lib/mm_es_search/models/abstract_sort_model.rb +13 -0
  61. data/lib/mm_es_search/models/abstract_terms_facet_model.rb +87 -0
  62. data/lib/mm_es_search/models/root_sort_model.rb +20 -0
  63. data/lib/mm_es_search/models/virtual_field_sort.rb +52 -0
  64. data/lib/mm_es_search/utils/facet_row_utils.rb +86 -0
  65. data/lib/mm_es_search/utils/search_logger.rb +10 -0
  66. data/lib/mm_es_search/version.rb +3 -0
  67. data/lib/mm_es_search.rb +124 -0
  68. data/mm_es_search.gemspec +24 -0
  69. metadata +132 -0
@@ -0,0 +1,538 @@
1
+ module MmEsSearch
2
+ module Models
3
+
4
+ module AbstractSearchModel
5
+
6
+ extend ActiveSupport::Concern
7
+ include MmEsSearch::Api::Query
8
+ include MmEsSearch::Api::Sort
9
+ include MmEsSearch::Api::Facet
10
+ include MmEsSearch::Api::Highlight
11
+ include MmEsSearch::Models
12
+ include MmEsSearch::Utils
13
+
14
# Schema declared on every class that mixes in this concern: the search
# definition (query string/object, sort, highlight, facets) followed by the
# keys that #run fills in (result ids, total and highlights).
included do
  plugin MmUsesUuid

  key  :query_string, String
  one  :query_object,     class_name: 'MmEsSearch::Models::AbstractQueryModel'
  one  :sort_object,      class_name: 'MmEsSearch::Models::AbstractSortModel'
  one  :highlight_object, class_name: 'MmEsSearch::Api::Highlight::ResultHighlight'
  many :facets,           class_name: 'MmEsSearch::Models::AbstractFacetModel'
  key  :result_ids,   Array
  key  :result_total, Integer
  key  :highlights,   Array
end

# Class-level extensions for including classes; intentionally empty for now.
module ClassMethods
end
32
+
33
# Runs the search against ElasticSearch (:es) or MongoDB (:mongo).
#
# target  - :es or :mongo. Any other value is ignored and nil is returned.
# options - Hash with String or Symbol keys:
#           :page            - page to fetch (default: 1)
#           :per_page        - page size (default: 10)
#           :fields          - fields handed to the record lookup (default: [])
#           :raw_es_response - return the ES response untouched (default: false)
#           :sorted          - apply the sort object (default: true)
#           :highlight       - request highlighting (default: true)
#           :facet_query     - false, an AbstractFacet, a Hash, :auto,
#                              :force_auto or :manual (default: false)
#
# Unless the raw ES response is requested, @result_ids and @result_total
# (and, for :es with a highlight object, @highlights) are updated.
#
# Returns the paginated records, or the raw ES response when
# :raw_es_response is set.
# Raises RuntimeError when the facets are still not all :ready_for_display
# after more than five follow-up facet queries.
def run(target, options = {})
  # symbolize_keys returns a copy, so the defaults must be merged into a value
  # we keep; a bang-merge on that temporary copy would simply be discarded.
  options = options.symbolize_keys.reverse_merge(
    :page => 1,
    :per_page => 10,
    :fields => [],
    :raw_es_response => false,
    :sorted => true,
    :highlight => true,
    :facet_query => false
  )

  page            = options[:page]
  per_page        = options[:per_page]
  fields          = options[:fields]
  raw_es_response = options[:raw_es_response]
  sorted          = options[:sorted]
  highlight       = options[:highlight]

  case target
  when :es

    if options[:facet_query] && !raw_es_response
      facets_in_display_state = facets.select { |facet| facet.current_state == :ready_for_display }
      facets_in_display_state.each(&:prepare_for_new_data)
    end

    facet_es_query = case options[:facet_query]
    when AbstractFacet
      options[:facet_query].to_es_query
    when Hash
      options[:facet_query]
    when :auto
      if type_facet_positively_set?
        build_next_facet_es_query(:explore_manual_and_auto_facets)
      else
        # No type chosen yet: fall back to manual exploration and make sure
        # a facet on the type field is on offer.
        options[:facet_query] = :manual
        facets.delete_if(&:unused?)
        unless type_facet_initialized?
          facets << build_facet_model(
            :virtual_field => type_field,
            :data_type => "string",
            :exclude => type_field_excludes
          )
        end
        build_next_facet_es_query(:explore_manual_facets)
      end
    when :force_auto
      build_next_facet_es_query(:explore_manual_and_auto_facets)
    when :manual
      facets_without_data_type = facets.select { |facet| facet.current_state == :need_data_type }
      add_known_data_types(facets_without_data_type)
      build_next_facet_es_query
    end

    request = es_request(sorted, facet_es_query, highlight)
    @search_log.info(request.to_json) if debug_on?
    response = target_collection.search_hits(
      request,
      :page => page,
      :per_page => per_page,
      :ids_only => true
    )

    return response if raw_es_response

    @result_ids = response.hits
    @result_total = response.total_entries
    @highlights = response.response['hits']['hits'].map { |hit| hit['highlight'] } if highlight_object?
    out = find_hits_in_mongo(@result_ids, fields, page, per_page)

    if options[:facet_query]
      write_facet_results_to_models(response.facets)
      update_used_facet_missing_counts_to_zero
      update_show_missing_facet_missing_counts_to_total
      prune_facets
      facets_without_data_type = facets.select { |facet| facet.current_state == :need_data_type }
      add_known_data_types(facets_without_data_type)

      # Keep issuing facet-only queries until every facet can be displayed;
      # the counter guards against a facet that never settles.
      sanity_count = 0
      until facets.all? { |facet| facet.current_state == :ready_for_display }
        facet_query = build_next_facet_es_query
        facet_results = run_for_facets_only(facet_query)
        write_facet_results_to_models(facet_results)
        prune_facets

        sanity_count += 1
        raise 'until loop has looped too many times!' if sanity_count > 5
      end
    end

    build_sort_options if respond_to? :build_sort_options
    out

  when :mongo

    request = mongo_request
    @search_log.info(request.to_json) if debug_on?
    query = target_collection.where(request)
    query = query.sort(sort_object.to_mongo_query) if sort_object.is_a?(RootSortModel)
    query = query.fields(*fields) unless fields.empty?

    response = query.paginate(:page => page, :per_page => per_page)
    @result_ids = response.map(&:_id)
    @result_total = response.total_entries

    response

  end
end
148
+
149
# Collects each facet's next query, optionally adds the coverage and/or
# exploratory queries selected by +mode+, and converts the lot into a single
# ES facet hash (nil when there is nothing to ask for).
#
# mode - nil, :explore_manual_facets, :explore_auto_facets or
#        :explore_manual_and_auto_facets.
def build_next_facet_es_query(mode = nil)
  pending = facets.map(&:next_facet_query)

  extras =
    case mode
    when :explore_manual_facets
      [manual_facet_coverage_query]
    when :explore_auto_facets
      [auto_facet_exploratory_query]
    when :explore_manual_and_auto_facets
      [manual_facet_coverage_query, auto_facet_exploratory_query]
    else
      []
    end

  facet_array_to_es_query((pending + extras).compact)
end
161
+
162
# Merges the ES representation of every facet query into one hash.
#
# query_array - Array of objects responding to #to_es_query (each returning
#               a Hash); nil is treated like an empty array.
#
# Later entries win when two queries produce the same key.
# Returns the merged Hash, or nil when there is nothing to send.
def facet_array_to_es_query(query_array)
  es_query = (query_array || []).each_with_object({}) do |facet_query, merged|
    merged.merge!(facet_query.to_es_query)
  end

  es_query.empty? ? nil : es_query
end
170
+
171
# Feeds an ES facet response back into the facet models.
#
# facet_results - Hash of facet label => result (nil/empty is a no-op).
#
# Labels are handled as follows:
#   'auto_facet_coverage'      - a new proto facet is appended per term
#   'manual_facet_coverage'    - the missing count of the matching facet is set
#   'data_type_counts_for_...' - the proto facet with that label is replaced
#                                by a typed facet
#   anything else              - rows or field stats are built on the facet
#                                carrying that label
def write_facet_results_to_models(facet_results)
  return if facet_results.nil? || facet_results.empty?

  facet_results.each do |label, result|
    case label
    when 'auto_facet_coverage'
      result['terms'].each do |term_info|
        facets << proto_facet.new(
          :virtual_field => term_info['term'],
          :missing => @result_total - term_info['count']
        )
      end

    when 'manual_facet_coverage'
      result['terms'].each do |term_info|
        matching = facets.detect { |facet| facet.virtual_field == term_info['term'] }
        next unless matching

        matching.missing = @result_total - term_info['count']
      end

    when /^data_type_counts_for_/
      # drop the 21-character "data_type_counts_for_" prefix
      field_label = label[21..-1]
      type_counts = result['terms']
      proto = facets.detect { |facet| facet.label == field_label }
      replace_proto_facet_with_typed_facet(proto.virtual_field, type_counts) if proto

    else
      owner = facets.detect { |facet| facet.label == label }
      next unless owner

      case result['_type']
      when "terms", "range"
        owner.build_facet_rows(result)
      when "statistical"
        owner.build_field_stats(result)
      end
    end
  end
end
217
+
218
# Swaps the proto facet for +virtual_field+ with a typed facet model built
# from the proto facet's attributes.
#
# virtual_field   - field identifying the facet to replace; nothing happens
#                   when no facet matches.
# data_type_param - String data type to assign, or an Array of data type
#                   counts handed to the proto facet.
#
# Raises RuntimeError when the proto facet is not :ready_for_initialization
# after the data type information has been applied.
def replace_proto_facet_with_typed_facet(virtual_field, data_type_param)
  position = facets.find_index { |facet| facet.virtual_field == virtual_field }
  return unless position

  proto = facets[position]

  if data_type_param.is_a?(String)
    proto.data_type = data_type_param
  elsif data_type_param.is_a?(Array)
    proto.build_data_type_counts(data_type_param)
  end

  unless proto.current_state == :ready_for_initialization
    raise 'proto_facet not ready for initialization'
  end

  # '_type' is dropped before the attributes are handed to the typed facet
  typed_params = proto.attributes.except('_type').symbolize_keys
  facets[position] = build_facet_model(typed_params)
end
238
+
239
# Builds the request hash sent to ElasticSearch.
#
# sorted      - use the sorted query (and the root sort, if any) when true.
# facet_query - facet hash to attach under :facets, or nil for none.
# highlight   - attach the highlight settings when a highlight object exists.
#
# Returns a Hash with :query and :query_dsl plus the optional :sort, :facets
# and :highlight entries.
def es_request(sorted = true, facet_query = nil, highlight = true)
  parse_query_string_if_needed
  base_query = sorted ? sorted_query : unsorted_query

  request = { :query => base_query.to_es_query, :query_dsl => false }
  request[:sort] = sort_object.to_es_query if sort_object.is_a?(RootSortModel) && sorted
  request[:facets] = facet_query if facet_query
  request[:highlight] = highlight_object.to_es_query if highlight_object? && highlight
  request
end
257
+
258
# Returns the sorted query in its MongoDB form, parsing the query string
# first if no query object has been built yet.
def mongo_request
  parse_query_string_if_needed
  query = sorted_query
  query.to_mongo_query
end
262
+
263
# Resets the missing count of every used facet to zero: by definition, if a
# facet has been applied, every result must have a value for it.
def update_used_facet_missing_counts_to_zero
  used_facets.each do |facet|
    facet.missing = 0
  end
end
267
+
268
# For used facets flagged with show_missing, sets the missing count to the
# total number of results of the last run.
def update_show_missing_facet_missing_counts_to_total
  used_facets.each do |facet|
    next unless facet.show_missing

    facet.missing = @result_total
  end
end
271
+
272
# True when one of the facets is defined on the type field.
def type_facet_initialized?
  facets.any? do |candidate|
    candidate.virtual_field == type_field
  end
end
275
+
276
# True when a used facet on the type field has at least one row whose
# +checked+ value is "and" or "or".
def type_facet_positively_set?
  used_facets.any? do |facet|
    facet.virtual_field == type_field &&
      facet.rows.any? { |row| ["and", "or"].include?(row.checked) }
  end
end
285
+
286
# Builds the query object from the query string, but only when a query
# string is present and no query object exists yet.
def parse_query_string_if_needed
  build_query_object if query_string? && query_object.nil?
end
291
+
292
# Fetches the records for +hits+ from the database in one call and reorders
# them to match the search result ordering.
#
# hits     - Array of ElasticSearch::Api::Hit objects or of plain ids.
# fields   - fields passed on to the record lookup.
# page     - page number recorded on the returned collection.
# per_page - page size recorded on the returned collection.
#
# Returns a paginated collection whose total is @result_total.
def find_hits_in_mongo(hits = @result_ids, fields = [], page = 1, per_page = @result_ids.length)
  return paginate_records([], page, per_page, @result_total) if hits.empty?

  # anything that is not an ES hit is presumed to be an id already
  ranked_ids = hits.first.is_a?(ElasticSearch::Api::Hit) ? hits.map(&:_id) : hits

  # NOTE: #find_with_fields is used to avoid redefining the standard MM #find
  # method. It can be trivially implemented with the plucky #where and
  # #fields methods but is directly implemented in MmUsesUuid.
  found = target_collection.find_with_fields ranked_ids, :fields => fields

  records =
    if found.is_a?(Array)
      found.reorder_by(ranked_ids.map(&:to_s)) { |record| record.id.to_s }
    elsif found.nil?
      []
    else
      [found]
    end

  paginate_records(records, page, per_page, @result_total)
end
320
+
321
# Wraps +records+ in a WillPaginate::Collection carrying the given page,
# page size and total entry count.
def paginate_records(records, page, per_page, total)
  WillPaginate::Collection.new(page, per_page, total).tap do |collection|
    collection.replace(records)
  end
end
326
+
327
# Counts the matches of the unsorted query without fetching any records.
#
# target  - :es or :mongo; any other value yields nil.
# options - accepted for interface compatibility; not read here.
def count(target, options = {})
  parse_query_string_if_needed

  if target == :es
    es_hits = target_collection.search_hits(unsorted_query.to_es_query, :per_page => 0)
    es_hits.total_entries
  elsif target == :mongo
    target_collection.where(unsorted_query.to_mongo_query).count
  end
end
336
+
337
# Combines scoring and non-scoring queries into a single query object.
#
# scored   - Array of queries whose score should count.
# unscored - Array of queries that only restrict the result set.
#
# Returns
#   * a MatchAllQuery when both arrays are empty,
#   * a ConstantScoreQuery (boost 1) around a BoolQuery of the unscored
#     queries when nothing is scored,
#   * the single scored query, or a BoolQuery of all scored queries, when
#     nothing is unscored,
#   * otherwise a BoolQuery of the scored queries plus copies of the unscored
#     ones with their boost set to 0 (the originals are left untouched).
def combine_queries(scored, unscored)
  return MatchAllQuery.new if scored.empty? && unscored.empty?

  if scored.empty?
    ConstantScoreQuery.new(
      :boost => 1,
      :query => BoolQuery.new(:musts => unscored)
    )
  elsif unscored.empty?
    scored.length > 1 ? BoolQuery.new(:musts => scored) : scored.first
  else
    # work on duplicates so the caller's query objects keep their boost
    zero_boosted = unscored.map { |query| query.dup.tap { |copy| copy.boost = 0 } }
    BoolQuery.new(:musts => scored + zero_boosted)
  end
end
363
+
364
# Builds the query used when ordering does not matter: the query object and
# facets act purely as restrictions.
# NOTE: non-RootSortModel sorts are put in as filters as these typically
# restrict results.
def unsorted_query
  parse_query_string_if_needed
  restricting_queries, restricting_filters = sort_query_and_facets_as_filters
  combined = combine_queries([], restricting_queries)
  build_filtered_query(combined, restricting_filters)
end
370
+
371
# Builds the query used when result ordering matters.
#
# Falls back to #unsorted_query when there is neither a sort nor a query
# object, or when the sort is a RootSortModel. Otherwise the score comes from
# the query object (no sort object) or from the sort object's query, and the
# facets are applied as filters.
def sorted_query
  parse_query_string_if_needed

  nothing_to_score = sort_object.nil? && query_object.nil?
  return unsorted_query if nothing_to_score || sort_object.is_a?(RootSortModel)

  if sort_object.nil?
    scoring_query = query_object.to_query
    filters = facets_as_filters
  else
    restricting_queries, filters = query_and_facets_as_filters
    scoring_query = combine_queries([sort_object.to_query], restricting_queries)
  end

  build_filtered_query(scoring_query, filters)
end
386
+
387
# Same as #query_and_facets_as_filters, with the sort object's filter added
# when a sort object exists and is not a RootSortModel.
#
# Returns [unscored_queries, filters].
# NOTE(review): the sort filter is appended as returned; if #to_filter can be
# nil here, a nil ends up in the filter list - confirm.
def sort_query_and_facets_as_filters
  unscored_queries, filters = query_and_facets_as_filters

  if !sort_object.nil? && !sort_object.is_a?(RootSortModel)
    filters << sort_object.to_filter
  end

  [unscored_queries, filters]
end
392
+
393
# Expresses the query object and the used facets as restrictions.
#
# The query object goes into the filter list when it can be turned into a
# filter; otherwise its query form is returned as an unscored query.
#
# Returns [unscored_queries, filters].
def query_and_facets_as_filters
  filters = facets_as_filters
  unscored_queries = []

  if query_object?
    filter_form = query_object.to_filter
    if filter_form
      filters << filter_form
    else
      unscored_queries << query_object.to_query
    end
  end

  [unscored_queries, filters]
end
404
+
405
# Returns the filters of all used facets, skipping facets whose #to_filter
# is nil.
def facets_as_filters
  used_facets.map { |facet| facet.to_filter }.compact
end
408
+
409
# Wraps +query+ in a FilteredQuery that ANDs all +filters+ together.
# With no filters (nil or empty) the query is returned unchanged.
def build_filtered_query(query, filters)
  return query if filters.nil? || filters.empty?

  combined_filter = AndFilter.new(:filters => filters)
  FilteredQuery.new(:query => query, :filter => combined_filter)
end
421
+
422
# Builds the facet model matching params[:data_type].
#
# String-like ("string..."), 'boolean' and 'uri' types get a term facet;
# 'integer', 'float', 'time' and 'date' get a range facet.
#
# Raises RuntimeError for any other data type.
def build_facet_model(params)
  data_type = params[:data_type]

  case data_type
  when 'integer', 'float', 'time', 'date' then build_range_facet_model(params)
  when /^string/, 'boolean', 'uri'        then build_term_facet_model(params)
  else
    raise "unable to build a facet model for data_type = #{data_type}"
  end
end
432
+
433
# Runs an unsorted, unhighlighted ES request with zero hits per page just to
# obtain facet data for +facet_es_query+.
#
# Returns the facets part of the raw response, or {} when it is nil.
def run_for_facets_only(facet_es_query)
  raw_response = run(
    :es,
    :facet_query => facet_es_query,
    :raw_es_response => true,
    :sorted => false,
    :highlight => false,
    :per_page => 0
  )

  facet_payload = raw_response.facets
  if facet_payload.nil?
    {}
  else
    facet_payload
  end
end
442
+
443
# Facets that report themselves as used.
def used_facets
  facets.select { |facet| facet.used? }
end
446
+
447
# Facets that report themselves as unused, i.e. still only on offer.
def offered_facets
  facets.select { |facet| facet.unused? }
end
450
+
451
+
452
# Drops offered facets that are unlikely to be useful and records why.
#
# Facets on non_prunable_fields are never touched. For the rest:
#   * proto facets go when too few results carry the field, judged against
#     the including class's REQUIRED_COVERAGE_RATIO / REQUIRED_COVERAGE_COUNT;
#   * terms facets go when their first row (taken to be the largest term)
#     has a count of 1, or covers less than 5% or more than 75% of the
#     results. Terms facets without any rows are left alone, since there is
#     nothing to judge them by yet;
#   * range facets are never pruned.
#
# The actual deletion is delegated to #prune_and_record_reason.
def prune_facets
  prunable_facets = offered_facets.reject do |facet|
    non_prunable_fields.include?(facet.virtual_field)
  end
  fields_to_delete = {}

  prunable_facets.each do |facet|
    case facet
    when proto_facet

      total_present = @result_total - facet.missing
      coverage_ratio = total_present / @result_total.to_f

      if coverage_ratio < self.class::REQUIRED_COVERAGE_RATIO
        fields_to_delete[facet.virtual_field] = 'coverage_ratio_too_low'
      elsif total_present < self.class::REQUIRED_COVERAGE_COUNT
        fields_to_delete[facet.virtual_field] = 'coverage_count_too_low'
      end

    when AbstractTermsFacetModel

      top_row = facet.rows.first
      # a facet with no rows would otherwise blow up on nil.count
      next if top_row.nil?

      largest_term_count = top_row.count
      prop_of_total = largest_term_count / @result_total.to_f

      if largest_term_count == 1
        fields_to_delete[facet.virtual_field] = 'top_count_is_unity'
      elsif prop_of_total < 0.05
        fields_to_delete[facet.virtual_field] = 'top_count_too_small'
      elsif prop_of_total > 0.75
        fields_to_delete[facet.virtual_field] = 'top_count_too_big'
      end

    when AbstractRangeFacetModel

      # anything we can catch here?

    end
  end

  prune_and_record_reason(fields_to_delete)
end
496
+
497
# Removes every facet on the given fields and records the reason for each.
#
# fields_to_delete - Hash of virtual field => reason string.
def prune_and_record_reason(fields_to_delete)
  fields_to_delete.each_pair do |doomed_field, why|
    facets.delete_if { |facet| facet[:virtual_field] == doomed_field }
    record_prune_reason(doomed_field, why)
  end
end
503
+
504
+
505
# Reports whether request logging is enabled. The first time it is asked
# without the flag having been set, logging is explicitly switched off.
def debug_on?
  return @debug_on if defined?(@debug_on)

  debug_off
  false
end
513
+
514
# Switches request logging on, appending to log/search.log under Rails.root.
#
# A logger left over from an earlier call is closed first (when it responds
# to #close) so that enabling logging repeatedly does not leak file handles.
#
# Returns self so the call can be chained.
def debug_on
  @search_log.close if @search_log.respond_to?(:close)

  @debug_on = true
  logfile = File.open(Rails.root.to_s + '/log/search.log', 'a')
  logfile.sync = true # write entries through immediately
  @search_log = SearchLogger.new(logfile)
  @search_log.info "#{self.class.name} now logging\n"
  self
end
522
+
523
# Switches request logging off.
#
# The current logger, if any, is closed before it is dropped (when it
# responds to #close, as ::Logger does) so the log file handle opened by
# #debug_on is released rather than left to the garbage collector.
#
# Returns self so the call can be chained.
def debug_off
  @debug_on = false
  @search_log.close if @search_log.respond_to?(:close)
  @search_log = nil
  self
end
528
+
529
# Resolves the model class this search runs against.
#
# The including class is assumed to be named klass.name + "Search"; the part
# in front of "Search" is constantized (e.g. FooSearch -> Foo).
#
# Raises RuntimeError when the class name does not have that form, including
# when nothing precedes "Search" or the class has no name at all.
def target_collection
  class_name = self.class.name.to_s
  klass_match = class_name.match(/(?<klass>\w*)(?=Search)/)

  # match returns nil when "Search" is absent, so check before indexing
  if klass_match.nil? || klass_match[:klass].empty?
    raise "expected the class name '#{self.class.name}' to be of form 'SomethingSearch' so that we can extract 'Something' as the target collection"
  end

  klass_match[:klass].constantize
end
535
+
536
+ end
537
+ end
538
+ end