mm_es_search 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. data/.gitignore +4 -0
  2. data/.project +18 -0
  3. data/Gemfile +4 -0
  4. data/Rakefile +1 -0
  5. data/lib/mm_es_search/api/facet/abstract_facet.rb +28 -0
  6. data/lib/mm_es_search/api/facet/date_histogram_facet.rb +11 -0
  7. data/lib/mm_es_search/api/facet/filter_facet.rb +9 -0
  8. data/lib/mm_es_search/api/facet/geo_distance_facet.rb +9 -0
  9. data/lib/mm_es_search/api/facet/histogram_facet.rb +9 -0
  10. data/lib/mm_es_search/api/facet/query_facet.rb +9 -0
  11. data/lib/mm_es_search/api/facet/range_facet.rb +36 -0
  12. data/lib/mm_es_search/api/facet/range_facet_row.rb +97 -0
  13. data/lib/mm_es_search/api/facet/range_item.rb +17 -0
  14. data/lib/mm_es_search/api/facet/statistical_facet.rb +33 -0
  15. data/lib/mm_es_search/api/facet/statistical_facet_result.rb +36 -0
  16. data/lib/mm_es_search/api/facet/terms_facet.rb +62 -0
  17. data/lib/mm_es_search/api/facet/terms_facet_row.rb +35 -0
  18. data/lib/mm_es_search/api/facet/terms_stats_facet.rb +9 -0
  19. data/lib/mm_es_search/api/highlight/result_highlight.rb +40 -0
  20. data/lib/mm_es_search/api/query/abstract_filter.rb +15 -0
  21. data/lib/mm_es_search/api/query/abstract_query.rb +48 -0
  22. data/lib/mm_es_search/api/query/and_filter.rb +9 -0
  23. data/lib/mm_es_search/api/query/bool_filter.rb +11 -0
  24. data/lib/mm_es_search/api/query/bool_query.rb +67 -0
  25. data/lib/mm_es_search/api/query/constant_score_query.rb +31 -0
  26. data/lib/mm_es_search/api/query/custom_filters_score_query.rb +52 -0
  27. data/lib/mm_es_search/api/query/custom_score_query.rb +31 -0
  28. data/lib/mm_es_search/api/query/dismax_query.rb +29 -0
  29. data/lib/mm_es_search/api/query/filtered_query.rb +30 -0
  30. data/lib/mm_es_search/api/query/has_child_filter.rb +11 -0
  31. data/lib/mm_es_search/api/query/has_child_query.rb +25 -0
  32. data/lib/mm_es_search/api/query/has_parent_filter.rb +11 -0
  33. data/lib/mm_es_search/api/query/has_parent_query.rb +25 -0
  34. data/lib/mm_es_search/api/query/match_all_filter.rb +11 -0
  35. data/lib/mm_es_search/api/query/match_all_query.rb +19 -0
  36. data/lib/mm_es_search/api/query/nested_filter.rb +22 -0
  37. data/lib/mm_es_search/api/query/nested_query.rb +62 -0
  38. data/lib/mm_es_search/api/query/not_filter.rb +9 -0
  39. data/lib/mm_es_search/api/query/or_filter.rb +9 -0
  40. data/lib/mm_es_search/api/query/prefix_filter.rb +11 -0
  41. data/lib/mm_es_search/api/query/prefix_query.rb +34 -0
  42. data/lib/mm_es_search/api/query/query_filter.rb +28 -0
  43. data/lib/mm_es_search/api/query/query_string_query.rb +37 -0
  44. data/lib/mm_es_search/api/query/range_filter.rb +11 -0
  45. data/lib/mm_es_search/api/query/range_query.rb +57 -0
  46. data/lib/mm_es_search/api/query/scored_filter.rb +29 -0
  47. data/lib/mm_es_search/api/query/single_bool_filter.rb +66 -0
  48. data/lib/mm_es_search/api/query/term_filter.rb +11 -0
  49. data/lib/mm_es_search/api/query/term_query.rb +34 -0
  50. data/lib/mm_es_search/api/query/terms_filter.rb +11 -0
  51. data/lib/mm_es_search/api/query/terms_query.rb +58 -0
  52. data/lib/mm_es_search/api/query/text_query.rb +42 -0
  53. data/lib/mm_es_search/api/query/top_children_query.rb +28 -0
  54. data/lib/mm_es_search/api/sort/root_sort.rb +36 -0
  55. data/lib/mm_es_search/models/abstract_facet_model.rb +23 -0
  56. data/lib/mm_es_search/models/abstract_query_model.rb +21 -0
  57. data/lib/mm_es_search/models/abstract_range_facet_model.rb +365 -0
  58. data/lib/mm_es_search/models/abstract_search_model.OLD +538 -0
  59. data/lib/mm_es_search/models/abstract_search_model.rb +521 -0
  60. data/lib/mm_es_search/models/abstract_sort_model.rb +13 -0
  61. data/lib/mm_es_search/models/abstract_terms_facet_model.rb +87 -0
  62. data/lib/mm_es_search/models/root_sort_model.rb +20 -0
  63. data/lib/mm_es_search/models/virtual_field_sort.rb +52 -0
  64. data/lib/mm_es_search/utils/facet_row_utils.rb +86 -0
  65. data/lib/mm_es_search/utils/search_logger.rb +10 -0
  66. data/lib/mm_es_search/version.rb +3 -0
  67. data/lib/mm_es_search.rb +124 -0
  68. data/mm_es_search.gemspec +24 -0
  69. metadata +132 -0
@@ -0,0 +1,538 @@
1
+ module MmEsSearch
2
+ module Models
3
+
4
+ module AbstractSearchModel
5
+
6
+ extend ActiveSupport::Concern
7
+ include MmEsSearch::Api::Query
8
+ include MmEsSearch::Api::Sort
9
+ include MmEsSearch::Api::Facet
10
+ include MmEsSearch::Api::Highlight
11
+ include MmEsSearch::Models
12
+ include MmEsSearch::Utils
13
+
14
# Hook evaluated in the context of each class that includes this concern
# (the module extends ActiveSupport::Concern). It declares the state a search
# carries; `plugin`, `key`, `one` and `many` are presumably the MongoMapper
# document DSL -- TODO confirm against the including classes.
+ included do
15
+
16
+ plugin MmUsesUuid
17
+
18
# Raw user query text; turned into query_object on demand by
# parse_query_string_if_needed.
+ key :query_string, String
19
# Single associated objects describing the query, the sort and the highlighting.
+ one :query_object, :class_name => 'MmEsSearch::Models::AbstractQueryModel'
20
+ one :sort_object, :class_name => 'MmEsSearch::Models::AbstractSortModel'
21
+ one :highlight_object, :class_name => 'MmEsSearch::Api::Highlight::ResultHighlight'
22
# Facet models attached to this search (both applied and merely offered ones).
+ many :facets, :class_name => 'MmEsSearch::Models::AbstractFacetModel'
23
# Results of the most recent #run: hit ids, total hit count, per-hit highlights.
+ key :result_ids, Array
24
+ key :result_total, Integer
25
+ key :highlights, Array
26
+
27
+ end
28
+
29
# Placeholder for class-level methods; nothing is defined here in this file.
# NOTE(review): ActiveSupport::Concern conventionally extends the including
# class with a nested module of this name -- confirm it is still wanted.
+ module ClassMethods
30
+
31
+ end
32
+
33
# Executes the search against ElasticSearch (:es) or MongoDB (:mongo) and
# records the hit ids and total on the receiver.
#
# target  - :es or :mongo. Any other value matches no branch and yields nil.
# options - Hash with String or Symbol keys, merged over these defaults:
#             :page            => 1
#             :per_page        => 10
#             :fields          => []     (fields to load from mongo)
#             :raw_es_response => false  (return the ES response untouched)
#             :sorted          => true
#             :highlight       => true
#             :facet_query     => false  (an AbstractFacet, a Hash, :auto,
#                                         :force_auto or :manual; anything
#                                         else sends no facet query)
#
# Returns the paginated records, or the raw ES response when
# :raw_es_response is set (only meaningful for :es).
def run(target, options = {})
  # symbolize_keys returns a copy, so the defaults must be merged into a new
  # hash and assigned back; merging in place on the copy would discard them.
  options = options.symbolize_keys.reverse_merge(
    :page            => 1,
    :per_page        => 10,
    :fields          => [],
    :raw_es_response => false,
    :sorted          => true,
    :highlight       => true,
    :facet_query     => false
  )

  page            = options[:page]
  per_page        = options[:per_page]
  fields          = options[:fields]
  raw_es_response = options[:raw_es_response]
  sorted          = options[:sorted]
  highlight       = options[:highlight]

  case target
  when :es

    # Facets that were already displayable are reset before new data arrives.
    if options[:facet_query] && !raw_es_response
      facets_in_display_state = facets.select { |facet| facet.current_state == :ready_for_display }
      facets_in_display_state.each(&:prepare_for_new_data)
    end

    facet_es_query = case options[:facet_query]
    when AbstractFacet
      options[:facet_query].to_es_query
    when Hash
      options[:facet_query]
    when :auto
      if type_facet_positively_set?
        build_next_facet_es_query(:explore_manual_and_auto_facets)
      else
        # No type selected yet: fall back to manual exploration and make sure
        # a facet on the type field exists.
        options[:facet_query] = :manual
        facets.delete_if(&:unused?)
        unless type_facet_initialized?
          facets << build_facet_model(
            :virtual_field => type_field,
            :data_type     => "string",
            :exclude       => type_field_excludes
          )
        end
        build_next_facet_es_query(:explore_manual_facets)
      end
    when :force_auto
      build_next_facet_es_query(:explore_manual_and_auto_facets)
    when :manual
      facets_without_data_type = facets.select { |facet| facet.current_state == :need_data_type }
      add_known_data_types(facets_without_data_type)
      build_next_facet_es_query
    else
      nil
    end

    request = es_request(sorted, facet_es_query, highlight)
    @search_log.info(request.to_json) if debug_on?
    response = target_collection.search_hits(
      request,
      :page     => page,
      :per_page => per_page,
      :ids_only => true
    )

    return response if raw_es_response

    @result_ids   = response.hits
    @result_total = response.total_entries
    @highlights   = response.response['hits']['hits'].map { |hit| hit['highlight'] } if highlight_object?
    out = find_hits_in_mongo(@result_ids, fields, page, per_page)

    if options[:facet_query]

      write_facet_results_to_models(response.facets)
      update_used_facet_missing_counts_to_zero
      update_show_missing_facet_missing_counts_to_total
      prune_facets
      facets_without_data_type = facets.select { |facet| facet.current_state == :need_data_type }
      add_known_data_types(facets_without_data_type)

      # Keep issuing facet-only queries until every facet is displayable;
      # the counter bounds the number of round trips.
      sanity_count = 0
      until facets.all? { |facet| facet.current_state == :ready_for_display }
        #puts cur_facets_states = self.facets.map {|f| "#{StringUtils.label_from_URI(f.virtual_field)} => #{f.current_state}"}
        facet_query   = build_next_facet_es_query
        facet_results = run_for_facets_only(facet_query)
        write_facet_results_to_models(facet_results)
        prune_facets

        sanity_count += 1
        raise 'until loop has looped too many times!' if sanity_count > 5
      end
    end

    build_sort_options if respond_to? :build_sort_options
    return out #output result set

  when :mongo

    request = mongo_request
    @search_log.info(request.to_json) if debug_on?
    query = target_collection.where(request)
    if sort_object.is_a?(RootSortModel)
      query = query.sort(sort_object.to_mongo_query)
    end
    unless fields.empty?
      query = query.fields(*fields)
    end

    response = query.paginate(:page => page, :per_page => per_page)
    @result_ids   = response.map(&:_id)
    @result_total = response.total_entries

    return response

  end
end
148
+
149
# Collects the next query each facet wants to run and, depending on +mode+,
# appends the coverage/exploratory queries before converting the lot into a
# single ES facet hash.
#
# mode - nil (facet queries only), :explore_manual_facets,
#        :explore_auto_facets or :explore_manual_and_auto_facets.
#
# Returns whatever facet_array_to_es_query returns for the non-nil queries.
def build_next_facet_es_query(mode = nil)
  pending_queries = facets.map(&:next_facet_query)

  wants_manual = [:explore_manual_facets, :explore_manual_and_auto_facets].include?(mode)
  wants_auto   = [:explore_auto_facets, :explore_manual_and_auto_facets].include?(mode)

  # Manual coverage always precedes the auto exploratory query.
  pending_queries << manual_facet_coverage_query if wants_manual
  pending_queries << auto_facet_exploratory_query if wants_auto

  facet_array_to_es_query(pending_queries.compact)
end
161
+
162
# Merges the ES representation of several facet queries into one hash.
#
# query_array - Array of objects responding to #to_es_query (each returning a
#               Hash). nil, and nil entries, are tolerated and ignored.
#
# Returns the merged Hash (later entries win on duplicate keys), or nil when
# there is nothing to send.
def facet_array_to_es_query(query_array)
  es_query = (query_array || []).compact.inject({}) do |merged, facet_query|
    merged.merge!(facet_query.to_es_query)
  end

  es_query.empty? ? nil : es_query
end
170
+
171
# Applies a facet section of an ES response to the facet models.
#
# facet_results - Hash of facet label => result Hash; nil or empty is a no-op.
#
# Labels are dispatched as follows:
#   'auto_facet_coverage'    - one new proto facet is appended per term
#   'manual_facet_coverage'  - updates #missing on the facet matching each term
#   'data_type_counts_for_*' - swaps the matching proto facet for a typed one
#   anything else            - treated as a facet label; rows or field stats
#                              are built according to the result's '_type'
def write_facet_results_to_models(facet_results)
  return if facet_results.nil? || facet_results.empty?

  facet_results.each do |label, result|
    case label
    when 'auto_facet_coverage'

      result['terms'].each do |term|
        facets << proto_facet.new(
          :virtual_field => term['term'],
          :missing       => @result_total - term['count']
        )
      end

    when 'manual_facet_coverage'

      result['terms'].each do |term|
        covered_facet = facets.detect { |f| f.virtual_field == term['term'] }
        covered_facet.missing = @result_total - term['count'] if covered_facet
      end

    when /^data_type_counts_for_/

      # 21 == 'data_type_counts_for_'.length, i.e. strip the prefix.
      facet_label = label[21..-1]
      type_counts = result['terms']
      untyped_facet = facets.detect { |f| f.label == facet_label }
      replace_proto_facet_with_typed_facet(untyped_facet.virtual_field, type_counts) if untyped_facet

    else

      labelled_facet = facets.detect { |f| f.label == label }
      next unless labelled_facet

      case result['_type']
      when "terms", "range"
        labelled_facet.build_facet_rows(result)
      when "statistical"
        labelled_facet.build_field_stats(result)
      end

    end
  end
end
217
+
218
# Replaces, in place, the facet on +virtual_field+ with a typed facet model
# built from its attributes.
#
# virtual_field   - field identifying the facet to replace.
# data_type_param - a String (used directly as the data type) or an Array
#                   (handed to build_data_type_counts); other values leave
#                   the facet untouched before the state check.
#
# Returns the new facet, or nil when no facet has that virtual_field.
# Raises RuntimeError when the facet is not :ready_for_initialization.
def replace_proto_facet_with_typed_facet(virtual_field, data_type_param)
  position = facets.find_index { |f| f.virtual_field == virtual_field }
  return unless position

  untyped = facets[position]

  if data_type_param.is_a?(String)
    untyped.data_type = data_type_param
  elsif data_type_param.is_a?(Array)
    untyped.build_data_type_counts(data_type_param)
  end

  unless untyped.current_state == :ready_for_initialization
    raise 'proto_facet not ready for initialization'
  end

  # '_type' is dropped so the new model is not forced to the old one's type.
  typed_params = untyped.attributes.except('_type').symbolize_keys
  facets[position] = build_facet_model(typed_params)
end
238
+
239
# Assembles the request hash sent to ElasticSearch.
#
# sorted      - use sorted_query (and attach :sort for a RootSortModel) when
#               truthy, unsorted_query otherwise.
# facet_query - facet hash to attach under :facets, or nil for none.
# highlight   - attach :highlight when truthy and a highlight object exists.
#
# Returns a Hash with :query and :query_dsl plus the optional sections.
def es_request(sorted = true, facet_query = nil, highlight = true)
  parse_query_string_if_needed

  root_query = sorted ? sorted_query : unsorted_query
  request = { :query => root_query.to_es_query, :query_dsl => false }

  request[:sort]      = sort_object.to_es_query if sort_object.is_a?(RootSortModel) && sorted
  request[:facets]    = facet_query if facet_query
  request[:highlight] = highlight_object.to_es_query if highlight_object? && highlight

  request
end
257
+
258
# Builds the criteria passed to the collection's #where for a mongo search:
# the sorted query converted with #to_mongo_query.
def mongo_request
  parse_query_string_if_needed
  query = sorted_query
  query.to_mongo_query
end
262
+
263
# Sets #missing to 0 on every used facet: by definition, if a facet has been
# applied, all results must have it.
def update_used_facet_missing_counts_to_zero
  used_facets.each do |applied_facet|
    applied_facet.missing = 0
  end
end
267
+
268
# For used facets flagged with #show_missing, sets #missing to the total
# number of results of the last search.
def update_show_missing_facet_missing_counts_to_total
  used_facets.each do |applied_facet|
    next unless applied_facet.show_missing
    applied_facet.missing = @result_total
  end
end
271
+
272
# True when some facet (used or not) is defined on the type field.
def type_facet_initialized?
  facets.any? do |candidate|
    candidate.virtual_field == type_field
  end
end
275
+
276
# True when a used facet on the type field has at least one row whose
# #checked value is "and" or "or".
def type_facet_positively_set?
  used_facets.any? do |applied_facet|
    applied_facet.virtual_field == type_field &&
      applied_facet.rows.any? { |row| ["and", "or"].include?(row.checked) }
  end
end
285
+
286
# Builds the query object from the query string, but only when a query string
# is present and no query object exists yet.
def parse_query_string_if_needed
  return unless query_string?
  return unless query_object.nil?

  build_query_object
end
291
+
292
# Loads the records for a list of search hits in one call and returns them in
# search-result order, wrapped as a paginated collection.
#
# hits     - Array of ElasticSearch::Api::Hit objects or of plain ids
#            (defaults to the ids stored by the last search).
# fields   - fields to load, forwarded as :fields to find_with_fields.
# page     - page number reported by the returned collection.
# per_page - page size reported by the returned collection; defaults to the
#            number of hits actually passed in, so calling with explicit hits
#            no longer depends on @result_ids being set.
#
# Returns the value of paginate_records for the ordered records.
def find_hits_in_mongo(hits = @result_ids, fields = [], page = 1, per_page = hits.length)
  #fetch records from db in one call and then reorder to match search result ordering
  return paginate_records([], page, per_page, @result_total) if hits.empty?

  # Anything that is not an ES hit is presumed to already be an id.
  ranked_ids = hits.first.is_a?(ElasticSearch::Api::Hit) ? hits.map(&:_id) : hits

  #NOTE: I use #find_with_fields to avoid redefining the standard MM #find method
  # this can be trivially implemented with the plucky #where and #fields methods
  # but is directly implemented in MmUsesUuid
  unordered_records = target_collection.find_with_fields ranked_ids, :fields => fields

  records = if unordered_records.is_a?(Array)
    unordered_records.reorder_by(ranked_ids.map(&:to_s)) { |record| record.id.to_s }
  elsif unordered_records.nil?
    []
  else
    # a single record came back un-wrapped
    [unordered_records]
  end

  paginate_records(records, page, per_page, @result_total)
end
320
+
321
# Wraps +records+ in a WillPaginate::Collection created for the given page,
# page size and total, and returns that collection.
def paginate_records(records, page, per_page, total)
  WillPaginate::Collection.new(page, per_page, total).tap do |collection|
    collection.replace(records)
  end
end
326
+
327
# Counts the matches of the unsorted query without loading any records.
#
# target  - :es (asks for zero hits per page and reads total_entries) or
#           :mongo (uses #count on the criteria); other values yield nil.
# options - accepted but not used by this method.
def count(target, options = {})
  parse_query_string_if_needed

  if target == :es
    target_collection.search_hits(unsorted_query.to_es_query, :per_page => 0).total_entries
  elsif target == :mongo
    target_collection.where(unsorted_query.to_mongo_query).count
  end
end
336
+
337
# Combines scoring and non-scoring queries into a single query object.
#
# scored   - Array of queries that should contribute to the score.
# unscored - Array of queries that should only restrict the results.
#
# Returns:
#   both empty     - a MatchAllQuery
#   only unscored  - a ConstantScoreQuery (boost 1) around a BoolQuery of musts
#   only scored    - the single query itself, or a BoolQuery of musts if many
#   both present   - a BoolQuery whose musts are the scored queries followed
#                    by copies of the unscored ones with boost set to 0
#                    (the originals are not modified)
def combine_queries(scored, unscored)
  return MatchAllQuery.new if scored.empty? && unscored.empty?

  if scored.empty?
    ConstantScoreQuery.new(
      :boost => 1,
      :query => BoolQuery.new(:musts => unscored)
    )
  elsif unscored.empty?
    scored.length > 1 ? BoolQuery.new(:musts => scored) : scored.first
  else
    # (an earlier idea, boosting the scored queries to 1e100, is not used)
    zero_boosted = unscored.map do |unscored_query|
      copy = unscored_query.dup
      copy.boost = 0
      copy
    end
    BoolQuery.new(:musts => scored + zero_boosted)
  end
end
363
+
364
# Builds the query used when ordering by score is not needed: every part of
# the search is applied as a non-scoring query or as a filter.
#
# NOTE: non-RootSortModel sorts are put in as filters, as these typically
# restrict results.
def unsorted_query
  parse_query_string_if_needed
  unscored, filters = sort_query_and_facets_as_filters
  build_filtered_query(combine_queries([], unscored), filters)
end
370
+
371
# Builds the query used when result order matters.
#
# Falls back to unsorted_query when there is neither a sort nor a query
# object, or when the sort is a RootSortModel. Otherwise the scoring part is
# the query object itself (no sort object) or the sort object's query combined
# with the non-scoring queries; facets are applied as filters.
def sorted_query
  parse_query_string_if_needed

  nothing_to_score = sort_object.nil? && query_object.nil?
  return unsorted_query if nothing_to_score || sort_object.is_a?(RootSortModel)

  if sort_object.nil?
    scoring_query = query_object.to_query
    filters = facets_as_filters
  else
    unscored, filters = query_and_facets_as_filters
    scoring_query = combine_queries([sort_object.to_query], unscored)
  end

  build_filtered_query(scoring_query, filters)
end
386
+
387
# Same as query_and_facets_as_filters, with the sort object's filter appended
# when a sort object exists and is not a RootSortModel.
#
# Returns a two-element Array: [unscored_queries, filters].
def sort_query_and_facets_as_filters
  unscored, filters = query_and_facets_as_filters

  sort_adds_filter = !(sort_object.nil? || sort_object.is_a?(RootSortModel))
  filters << sort_object.to_filter if sort_adds_filter

  [unscored, filters]
end
392
+
393
# Splits the search into non-scoring queries and filters.
#
# The facet filters always go into the filter list. The query object joins
# them when it can express itself as a filter (#to_filter is truthy);
# otherwise its #to_query goes into the unscored list.
#
# Returns a two-element Array: [unscored_queries, filters].
def query_and_facets_as_filters
  filters  = facets_as_filters
  unscored = []

  if query_object?
    filter_form = query_object.to_filter
    if filter_form
      filters << filter_form
    else
      unscored << query_object.to_query
    end
  end

  [unscored, filters]
end
404
+
405
# Filters of all used facets, skipping facets whose #to_filter is nil.
def facets_as_filters
  used_facets.map { |applied_facet| applied_facet.to_filter }.reject { |filter| filter.nil? }
end
408
+
409
# Wraps +query+ in a FilteredQuery whose filter ANDs all of +filters+.
# With no filters (nil or empty) the query is returned unchanged.
def build_filtered_query(query, filters)
  return query if filters.nil? || filters.empty?

  FilteredQuery.new(
    :query  => query,
    :filter => AndFilter.new(:filters => filters)
  )
end
421
+
422
# Builds the facet model matching params[:data_type].
#
# Data types starting with "string", plus 'boolean' and 'uri', get a terms
# facet model; 'integer', 'float', 'time' and 'date' get a range facet model.
#
# Raises RuntimeError for any other data type.
def build_facet_model(params)
  data_type   = params[:data_type]
  term_types  = [/^string/, 'boolean', 'uri']
  range_types = ['integer', 'float', 'time', 'date']

  if term_types.any? { |type| type === data_type }
    build_term_facet_model(params)
  elsif range_types.any? { |type| type === data_type }
    build_range_facet_model(params)
  else
    raise "unable to build a facet model for data_type = #{params[:data_type]}"
  end
end
432
+
433
# Runs an ES search purely for its facet section: raw response, unsorted, no
# highlighting and zero hits per page.
#
# Returns the response's facets, or an empty Hash when there are none.
def run_for_facets_only(facet_es_query)
  response = run(
    :es,
    :facet_query     => facet_es_query,
    :raw_es_response => true,
    :sorted          => false,
    :highlight       => false,
    :per_page        => 0
  )

  facet_result = response.facets
  return {} if facet_result.nil?

  facet_result
end
442
+
443
# Facets that report themselves as used (#used?).
def used_facets
  facets.select { |facet| facet.used? }
end
446
+
447
# Facets that report themselves as unused (#unused?).
def offered_facets
  facets.select { |facet| facet.unused? }
end
450
+
451
+
452
# Drops offered (unused) facets that are not worth showing and records why.
#
# Facets on non_prunable_fields are never considered. For the rest:
#   proto facets - pruned when too few results carry the field, measured
#                  against the including class's REQUIRED_COVERAGE_RATIO and
#                  REQUIRED_COVERAGE_COUNT constants
#   terms facets - pruned when the first row's count is 1, or is below 5% or
#                  above 75% of all results; facets that have no rows yet are
#                  skipped instead of raising on a nil row
#   range facets - never pruned here
#
# The collected field => reason pairs are handed to prune_and_record_reason.
def prune_facets
  prunable_facets = offered_facets.reject { |facet| non_prunable_fields.include?(facet[:virtual_field]) }
  fields_to_delete = {}

  prunable_facets.each do |facet|
    case facet
    when proto_facet

      total_present  = @result_total - facet.missing
      coverage_ratio = total_present / @result_total.to_f

      if coverage_ratio < self.class::REQUIRED_COVERAGE_RATIO
        fields_to_delete[facet[:virtual_field]] = 'coverage_ratio_too_low'
      elsif total_present < self.class::REQUIRED_COVERAGE_COUNT
        fields_to_delete[facet[:virtual_field]] = 'coverage_count_too_low'
      end

    when AbstractTermsFacetModel

      # Rows may not have been fetched yet; there is nothing to judge then.
      top_row = facet.rows.first
      next if top_row.nil?

      #compute some stats
      largest_term_count = top_row.count
      prop_of_total      = largest_term_count / @result_total.to_f

      if largest_term_count == 1
        fields_to_delete[facet.virtual_field] = 'top_count_is_unity'
      elsif prop_of_total < 0.05
        fields_to_delete[facet.virtual_field] = 'top_count_too_small'
      elsif prop_of_total > 0.75
        fields_to_delete[facet.virtual_field] = 'top_count_too_big'
      end

    when AbstractRangeFacetModel

      # anything we can catch here?

    end
  end

  prune_and_record_reason(fields_to_delete)
end
496
+
497
# Removes every facet on each listed field and records the reason.
#
# fields_to_delete - Hash of virtual_field => reason String.
def prune_and_record_reason(fields_to_delete)
  fields_to_delete.each do |doomed_field, reason|
    facets.delete_if do |facet|
      facet[:virtual_field] == doomed_field
    end
    record_prune_reason(doomed_field, reason)
  end
end
503
+
504
+
505
# Whether request logging is enabled. The first call on an object that never
# toggled debugging initialises the state through debug_off and reports false.
def debug_on?
  return @debug_on if defined?(@debug_on)

  debug_off
  false
end
513
+
514
# Turns request logging on: opens log/search.log under Rails.root in append
# mode with sync enabled, wraps it in a SearchLogger and writes a first line.
#
# Returns self so the call can be chained.
def debug_on
  @debug_on = true

  log_path = Rails.root.to_s + '/log/search.log'
  log_io   = File.open(log_path, 'a')
  log_io.sync = true

  @search_log = SearchLogger.new(log_io)
  @search_log.info "#{self.class.name} now logging\n"

  self
end
522
+
523
# Turns request logging off and releases the logger.
#
# The logger created by debug_on wraps an open file; it is closed here (when
# it responds to #close) rather than being left for the garbage collector.
# NOTE(review): assumes SearchLogger#close, if present, closes the underlying
# file the way Ruby's Logger#close does -- confirm.
#
# Returns self so the call can be chained.
def debug_off
  @debug_on = false
  @search_log.close if @search_log.respond_to?(:close)
  @search_log = nil
  self
end
528
+
529
# Resolves the class being searched from the receiver's own class name.
#
# We assume the name is of form klass.name + "Search": the word characters
# directly in front of "Search" are constantized.
#
# Returns the target class.
# Raises RuntimeError when the class name does not have that form (no match,
# or nothing in front of "Search").
def target_collection
  class_name  = self.class.name.to_s
  klass_match = class_name.match(/(?<klass>\w*)(?=Search)/)
  # match returns nil when "Search" is absent, so guard before indexing.
  klass_name  = klass_match && klass_match[:klass]

  if klass_name.nil? || klass_name.empty?
    raise "expected the class name '#{self.class.name}' to be of form 'SomethingSearch' so that we can extract 'Something' as the target collection"
  end

  klass_name.constantize
end
535
+
536
+ end
537
+ end
538
+ end