searchkick 2.0.2 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 01c193778cc86142dca494a2aa0b0ac8c6474d0b
4
- data.tar.gz: 7ff77b2f385a8004acaf1dabb4935f90d61c26f8
3
+ metadata.gz: 63b182684f14e8e5c0b30e0505a188d7bbc06f85
4
+ data.tar.gz: 9227d3877f3b6cf944a3413fe4478ec17393bc0c
5
5
  SHA512:
6
- metadata.gz: cbeefdbb0c40dfb0c32789e7468d3b570c7014fe9b27a202fca7b310820fd3c223264e360456ea68ca5bb13378e3770f448b3237abb9f61385f22d72942b09b0
7
- data.tar.gz: 199fb639bcc0d883e87848009e35d2ed0e62e491c5ada17d0eeab3844539513f27c09af46ebb17051f402973dabed86e46e8d6bd08e18b9b3dc43ce3ac2d74e0
6
+ metadata.gz: '08f71c79a49fa5c0c43d4d1d489b0040bfec789edd6c3e3737b48fd749a7dc1ee4d7b2337aa0e57db35a29a20c14ce13c5a0b653441151f64f2241b59f69a4f5'
7
+ data.tar.gz: 9f864b9b4d890a1a220364abd8e6d0376d06e90d853d34201630f3599bc580a5fee3cf41604b79cbed7bf25ee5524352056119cd9c4a8b0883017476639523da
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 2.0.3
2
+
3
+ - Added `async` option to `reindex` [experimental]
4
+ - Added `misspellings?` method to results
5
+
1
6
  ## 2.0.2
2
7
 
3
8
  - Added `retain` option to `reindex`
data/README.md CHANGED
@@ -253,12 +253,16 @@ Available options are:
253
253
 
254
254
  ### Exact Matches
255
255
 
256
+ To match a field exactly (case-insensitive), use:
257
+
256
258
  ```ruby
257
259
  User.search query, fields: [{email: :exact}, :name]
258
260
  ```
259
261
 
260
262
  ### Phrase Matches
261
263
 
264
+ To match only results that contain the words in the exact order given, use:
265
+
262
266
  ```ruby
263
267
  User.search "fresh honey", match: :phrase
264
268
  ```
@@ -1152,6 +1156,37 @@ Reindex and search with:
1152
1156
  Business.search "ice cream", routing: params[:city_id]
1153
1157
  ```
1154
1158
 
1159
+ ## Large Data Sets
1160
+
1161
+ ### Background Reindexing [experimental, ActiveRecord only]
1162
+
1163
+ For large data sets, you can use background jobs to parallelize reindexing.
1164
+
1165
+ ```ruby
1166
+ Product.reindex(async: true)
1167
+ # {index_name: "products_production_20170111210018065"}
1168
+ ```
1169
+
1170
+ Once the jobs complete, promote the new index with:
1171
+
1172
+ ```ruby
1173
+ Product.searchkick_index.promote(index_name)
1174
+ ```
1175
+
1176
+ You can optionally track the status with Redis:
1177
+
1178
+ ```ruby
1179
+ Searchkick.redis = Redis.new
1180
+ ```
1181
+
1182
+ Then check how many batches are left with:
1183
+
1184
+ ```ruby
1185
+ Searchkick.reindex_status(index_name)
1186
+ ```
1187
+
1188
+ For more tips, check out [Keeping Elasticsearch in Sync](https://www.elastic.co/blog/found-keeping-elasticsearch-in-sync).
1189
+
1155
1190
  ## Advanced
1156
1191
 
1157
1192
  Prefer to use the [Elasticsearch DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-queries.html) but still want awesome features like zero-downtime reindexing?
@@ -1453,10 +1488,6 @@ Product.search "api", misspellings: {prefix_length: 2} # api, apt, no ahi
1453
1488
  Product.search "ah", misspellings: {prefix_length: 2} # ah, no aha
1454
1489
  ```
1455
1490
 
1456
- ## Large Data Sets
1457
-
1458
- For large data sets, check out [Keeping Elasticsearch in Sync](https://www.elastic.co/blog/found-keeping-elasticsearch-in-sync). Searchkick will make this easy in the future.
1459
-
1460
1491
  ## Testing
1461
1492
 
1462
1493
  This section could use some love.
data/benchmark/Gemfile CHANGED
@@ -3,9 +3,13 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in searchkick.gemspec
4
4
  gemspec path: "../"
5
5
 
6
- gem "sqlite3"
6
+ # gem "sqlite3"
7
+ gem "pg"
7
8
  gem "activerecord", "~> 5.0.0"
8
9
  gem "activerecord-import"
10
+ gem "activejob"
11
+ gem "redis"
12
+ gem "sidekiq"
9
13
 
10
14
  # performance
11
15
  gem "typhoeus"
@@ -2,19 +2,27 @@ require "bundler/setup"
2
2
  Bundler.require(:default)
3
3
  require "active_record"
4
4
  require "benchmark"
5
+ require "active_support/notifications"
6
+
7
+ ActiveSupport::Notifications.subscribe "request.searchkick" do |*args|
8
+ event = ActiveSupport::Notifications::Event.new(*args)
9
+ p event.duration
10
+ end
11
+
12
+ ActiveJob::Base.queue_adapter = :sidekiq
13
+
14
+ Searchkick.redis = Redis.new
5
15
 
6
16
  ActiveRecord::Base.default_timezone = :utc
7
17
  ActiveRecord::Base.time_zone_aware_attributes = true
8
- ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:"
18
+ # ActiveRecord::Base.establish_connection adapter: "sqlite3", database: "/tmp/searchkick"
19
+ ActiveRecord::Base.establish_connection "postgresql://localhost/searchkick_demo_development"
20
+ # ActiveRecord::Base.logger = Logger.new(STDOUT)
9
21
 
10
- ActiveRecord::Migration.create_table :products do |t|
11
- t.string :name
12
- t.string :color
13
- t.integer :store_id
14
- end
22
+ ActiveJob::Base.logger = nil
15
23
 
16
24
  class Product < ActiveRecord::Base
17
- searchkick batch_size: 100
25
+ searchkick batch_size: 1000
18
26
 
19
27
  def search_data
20
28
  {
@@ -25,7 +33,15 @@ class Product < ActiveRecord::Base
25
33
  end
26
34
  end
27
35
 
28
- Product.import ["name", "color", "store_id"], 20000.times.map { |i| ["Product #{i}", ["red", "blue"].sample, rand(10)] }
36
+ total_docs = 100000
37
+
38
+ # ActiveRecord::Migration.create_table :products, force: :cascade do |t|
39
+ # t.string :name
40
+ # t.string :color
41
+ # t.integer :store_id
42
+ # end
43
+
44
+ # Product.import ["name", "color", "store_id"], total_docs.times.map { |i| ["Product #{i}", ["red", "blue"].sample, rand(10)] }
29
45
 
30
46
  puts "Imported"
31
47
 
@@ -35,19 +51,37 @@ stats = nil
35
51
 
36
52
  # p GetProcessMem.new.mb
37
53
 
54
+ Product.searchkick_index.delete rescue nil
55
+
38
56
  time =
39
57
  Benchmark.realtime do
40
58
  # result = RubyProf.profile do
41
59
  # report = MemoryProfiler.report do
42
60
  # stats = AllocationStats.trace do
43
- Product.reindex
61
+ reindex = Product.reindex(async: true)
62
+ p reindex
44
63
  # end
64
+
65
+ 60.times do |i|
66
+ if reindex.is_a?(Hash)
67
+ docs = Searchkick::Index.new(reindex[:index_name]).total_docs
68
+ else
69
+ docs = Product.searchkick_index.total_docs
70
+ end
71
+ puts "#{i}: #{docs}"
72
+ if docs == total_docs
73
+ break
74
+ end
75
+ p Searchkick.reindex_status(reindex[:index_name]) if reindex.is_a?(Hash)
76
+ sleep(1)
77
+ # Product.searchkick_index.refresh
78
+ end
45
79
  end
46
80
 
47
81
  # p GetProcessMem.new.mb
48
82
 
49
83
  puts time.round(1)
50
- puts Product.searchkick_index.total_docs
84
+
51
85
 
52
86
  if result
53
87
  printer = RubyProf::GraphPrinter.new(result)
@@ -0,0 +1,12 @@
1
+ module Searchkick
2
+ class BulkReindexJob < ActiveJob::Base
3
+ queue_as :searchkick
4
+
5
+ def perform(class_name:, record_ids: nil, index_name: nil, method_name: nil, batch_id: nil, min_id: nil, max_id: nil)
6
+ klass = class_name.constantize
7
+ index = index_name ? Searchkick::Index.new(index_name) : klass.searchkick_index
8
+ record_ids ||= min_id..max_id
9
+ index.import_scope(klass.where(klass.primary_key => record_ids), method_name: method_name, batch: true, batch_id: batch_id)
10
+ end
11
+ end
12
+ end
@@ -38,6 +38,10 @@ module Searchkick
38
38
  client.indices.get_settings index: name
39
39
  end
40
40
 
41
+ def update_settings(settings)
42
+ client.indices.put_settings index: name, body: settings
43
+ end
44
+
41
45
  def promote(new_name)
42
46
  old_indices =
43
47
  begin
@@ -187,7 +191,7 @@ module Searchkick
187
191
 
188
192
  # https://gist.github.com/jarosan/3124884
189
193
  # http://www.elasticsearch.org/blog/changing-mapping-with-zero-downtime/
190
- def reindex_scope(scope, import: true, resume: false, retain: false)
194
+ def reindex_scope(scope, import: true, resume: false, retain: false, async: false)
191
195
  if resume
192
196
  index_name = all_indices.sort.last
193
197
  raise Searchkick::Error, "No index to resume" unless index_name
@@ -201,30 +205,58 @@ module Searchkick
201
205
  # check if alias exists
202
206
  if alias_exists?
203
207
  # import before promotion
204
- index.import_scope(scope, resume: resume) if import
208
+ index.import_scope(scope, resume: resume, async: async, full: true) if import
205
209
 
206
210
  # get existing indices to remove
207
- promote(index.name)
208
- clean_indices unless retain
211
+ unless async
212
+ promote(index.name)
213
+ clean_indices unless retain
214
+ end
209
215
  else
210
216
  delete if exists?
211
217
  promote(index.name)
212
218
 
213
219
  # import after promotion
214
- index.import_scope(scope, resume: resume) if import
220
+ index.import_scope(scope, resume: resume, async: async, full: true) if import
215
221
  end
216
222
 
217
- index.refresh
218
-
219
- true
223
+ if async
224
+ {index_name: index.name}
225
+ else
226
+ index.refresh
227
+ true
228
+ end
220
229
  end
221
230
 
222
- def import_scope(scope, resume: false, method_name: nil)
231
+ def import_scope(scope, resume: false, method_name: nil, async: false, batch: false, batch_id: nil, full: false)
223
232
  batch_size = @options[:batch_size] || 1000
224
233
 
225
234
  # use scope for import
226
235
  scope = scope.search_import if scope.respond_to?(:search_import)
227
- if scope.respond_to?(:find_in_batches)
236
+
237
+ if batch
238
+ import_or_update scope.to_a, method_name, async
239
+ Searchkick.redis.srem(batches_key, batch_id) if batch_id && Searchkick.redis
240
+ elsif full && async
241
+ # TODO expire Redis key
242
+ primary_key = scope.primary_key
243
+ starting_id = scope.minimum(primary_key)
244
+ max_id = scope.maximum(primary_key)
245
+ batches_count = ((max_id - starting_id + 1) / batch_size.to_f).ceil
246
+
247
+ batches_count.times do |i|
248
+ batch_id = i + 1
249
+ min_id = starting_id + (i * batch_size)
250
+ Searchkick::BulkReindexJob.perform_later(
251
+ class_name: scope.model_name.name,
252
+ min_id: min_id,
253
+ max_id: min_id + batch_size - 1,
254
+ index_name: name,
255
+ batch_id: batch_id
256
+ )
257
+ Searchkick.redis.sadd(batches_key, batch_id) if Searchkick.redis
258
+ end
259
+ elsif scope.respond_to?(:find_in_batches)
228
260
  if resume
229
261
  # use total docs instead of max id since there's not a great way
230
262
  # to get the max _id without scripting since it's a string
@@ -233,8 +265,10 @@ module Searchkick
233
265
  scope = scope.where("id > ?", total_docs)
234
266
  end
235
267
 
268
+ scope = scope.select("id").except(:includes, :preload) if async
269
+
236
270
  scope.find_in_batches batch_size: batch_size do |batch|
237
- import_or_update batch.select(&:should_index?), method_name
271
+ import_or_update batch, method_name, async
238
272
  end
239
273
  else
240
274
  # https://github.com/karmi/tire/blob/master/lib/tire/model/import.rb
@@ -242,27 +276,18 @@ module Searchkick
242
276
  items = []
243
277
  # TODO add resume
244
278
  scope.all.each do |item|
245
- items << item if item.should_index?
279
+ items << item
246
280
  if items.length == batch_size
247
- import_or_update items, method_name
281
+ import_or_update items, method_name, async
248
282
  items = []
249
283
  end
250
284
  end
251
- import_or_update items, method_name
285
+ import_or_update items, method_name, async
252
286
  end
253
287
  end
254
288
 
255
- def import_or_update(records, method_name)
256
- retries = 0
257
- begin
258
- method_name ? bulk_update(records, method_name) : import(records)
259
- rescue Faraday::ClientError => e
260
- if retries < 1
261
- retries += 1
262
- retry
263
- end
264
- raise e
265
- end
289
+ def batches_left
290
+ Searchkick.redis.scard(batches_key) if Searchkick.redis
266
291
  end
267
292
 
268
293
  # other
@@ -373,5 +398,34 @@ module Searchkick
373
398
  obj
374
399
  end
375
400
  end
401
+
402
+ def import_or_update(records, method_name, async)
403
+ if records.any?
404
+ if async
405
+ Searchkick::BulkReindexJob.perform_later(
406
+ class_name: records.first.class.name,
407
+ record_ids: records.map(&:id),
408
+ index_name: name,
409
+ method_name: method_name ? method_name.to_s : nil
410
+ )
411
+ else
412
+ retries = 0
413
+ records = records.select(&:should_index?)
414
+ begin
415
+ method_name ? bulk_update(records, method_name) : import(records)
416
+ rescue Faraday::ClientError => e
417
+ if retries < 1
418
+ retries += 1
419
+ retry
420
+ end
421
+ raise e
422
+ end
423
+ end
424
+ end
425
+ end
426
+
427
+ def batches_key
428
+ "searchkick:reindex:#{name}:batches"
429
+ end
376
430
  end
377
431
  end
@@ -58,15 +58,16 @@ module Searchkick
58
58
  # update
59
59
  searchkick_index.import_scope(searchkick_klass, method_name: method_name)
60
60
  searchkick_index.refresh if refresh
61
+ true
61
62
  elsif scoped && !full
62
63
  # reindex association
63
64
  searchkick_index.import_scope(searchkick_klass)
64
65
  searchkick_index.refresh if refresh
66
+ true
65
67
  else
66
68
  # full reindex
67
69
  searchkick_index.reindex_scope(searchkick_klass, options)
68
70
  end
69
- true
70
71
  end
71
72
  alias_method :reindex, :searchkick_reindex unless method_defined?(:reindex)
72
73
 
@@ -83,17 +84,27 @@ module Searchkick
83
84
  after_destroy callback_name, if: proc { self.class.search_callbacks? }
84
85
  end
85
86
 
86
- def reindex(method_name = nil, refresh: false)
87
- if method_name
88
- self.class.searchkick_index.bulk_update([self], method_name)
87
+ def reindex(method_name = nil, refresh: false, async: false)
88
+ if async
89
+ if method_name
90
+ # TODO support Mongoid and NoBrainer and non-id primary keys
91
+ Searchkick::BulkReindexJob.perform_later(class_name: self.class.name, record_ids: [id.to_s], method_name: method_name ? method_name.to_s : nil)
92
+ else
93
+ self.class.searchkick_index.reindex_record_async(self)
94
+ end
89
95
  else
90
- self.class.searchkick_index.reindex_record(self)
96
+ if method_name
97
+ self.class.searchkick_index.bulk_update([self], method_name)
98
+ else
99
+ self.class.searchkick_index.reindex_record(self)
100
+ end
101
+ self.class.searchkick_index.refresh if refresh
91
102
  end
92
- self.class.searchkick_index.refresh if refresh
93
103
  end unless method_defined?(:reindex)
94
104
 
105
+ # TODO remove this method in next major version
95
106
  def reindex_async
96
- self.class.searchkick_index.reindex_record_async(self)
107
+ reindex(async: true)
97
108
  end unless method_defined?(:reindex_async)
98
109
 
99
110
  def similar(options = {})
@@ -34,6 +34,7 @@ module Searchkick
34
34
  # prevent Ruby warnings
35
35
  @type = nil
36
36
  @routing = nil
37
+ @misspellings = false
37
38
  @misspellings_below = nil
38
39
  @highlighted_fields = nil
39
40
 
@@ -107,7 +108,8 @@ module Searchkick
107
108
  includes: options[:includes],
108
109
  json: !@json.nil?,
109
110
  match_suffix: @match_suffix,
110
- highlighted_fields: @highlighted_fields || []
111
+ highlighted_fields: @highlighted_fields || [],
112
+ misspellings: @misspellings
111
113
  }
112
114
 
113
115
  if options[:debug]
@@ -256,6 +258,9 @@ module Searchkick
256
258
  prefix_length = (misspellings.is_a?(Hash) && misspellings[:prefix_length]) || 0
257
259
  default_max_expansions = @misspellings_below ? 20 : 3
258
260
  max_expansions = (misspellings.is_a?(Hash) && misspellings[:max_expansions]) || default_max_expansions
261
+ @misspellings = true
262
+ else
263
+ @misspellings = false
259
264
  end
260
265
 
261
266
  fields.each do |field|
@@ -184,6 +184,10 @@ module Searchkick
184
184
  @response["hits"]["hits"]
185
185
  end
186
186
 
187
+ def misspellings?
188
+ @options[:misspellings]
189
+ end
190
+
187
191
  private
188
192
 
189
193
  def results_query(records, hits)
@@ -1,3 +1,3 @@
1
1
  module Searchkick
2
- VERSION = "2.0.2"
2
+ VERSION = "2.0.3"
3
3
  end
data/lib/searchkick.rb CHANGED
@@ -19,7 +19,10 @@ begin
19
19
  rescue LoadError
20
20
  # do nothing
21
21
  end
22
- require "searchkick/reindex_v2_job" if defined?(ActiveJob)
22
+ if defined?(ActiveJob)
23
+ require "searchkick/bulk_reindex_job"
24
+ require "searchkick/reindex_v2_job"
25
+ end
23
26
 
24
27
  module Searchkick
25
28
  class Error < StandardError; end
@@ -30,7 +33,7 @@ module Searchkick
30
33
  class ImportError < Error; end
31
34
 
32
35
  class << self
33
- attr_accessor :search_method_name, :wordnet_path, :timeout, :models, :client_options
36
+ attr_accessor :search_method_name, :wordnet_path, :timeout, :models, :client_options, :redis
34
37
  attr_writer :client, :env, :search_timeout
35
38
  attr_reader :aws_credentials
36
39
  end
@@ -129,6 +132,16 @@ module Searchkick
129
132
  @client = nil # reset client
130
133
  end
131
134
 
135
+ def self.reindex_status(index_name)
136
+ if redis
137
+ batches_left = Searchkick::Index.new(index_name).batches_left
138
+ {
139
+ completed: batches_left == 0,
140
+ batches_left: batches_left
141
+ }
142
+ end
143
+ end
144
+
132
145
  # private
133
146
  def self.indexer
134
147
  Thread.current[:searchkick_indexer] ||= Searchkick::Indexer.new
data/test/match_test.rb CHANGED
@@ -203,6 +203,11 @@ class MatchTest < Minitest::Test
203
203
  assert_search "fresh honey", ["Fresh Honey"], match: :phrase
204
204
  end
205
205
 
206
+ def test_phrase_order
207
+ store_names ["Wheat Bread", "Whole Wheat Bread"]
208
+ assert_order "wheat bread", ["Wheat Bread", "Whole Wheat Bread"], match: :phrase
209
+ end
210
+
206
211
  def test_unsearchable
207
212
  store [
208
213
  {name: "Unsearchable", description: "Almond"}
@@ -39,8 +39,18 @@ class MisspellingsTest < Minitest::Test
39
39
  assert_search "abc", ["abc", "abd"], misspellings: {below: 2}
40
40
  end
41
41
 
42
+ def test_misspellings_below_unmet_result
43
+ store_names ["abc", "abd", "aee"]
44
+ assert Product.search("abc", misspellings: {below: 2}).misspellings?
45
+ end
46
+
42
47
  def test_misspellings_below_met
43
48
  store_names ["abc", "abd", "aee"]
44
49
  assert_search "abc", ["abc"], misspellings: {below: 1}
45
50
  end
51
+
52
+ def test_misspellings_below_met_result
53
+ store_names ["abc", "abd", "aee"]
54
+ assert !Product.search("abc", misspellings: {below: 1}).misspellings?
55
+ end
46
56
  end
data/test/reindex_test.rb CHANGED
@@ -22,4 +22,21 @@ class ReindexTest < Minitest::Test
22
22
  store.products.reindex(refresh: true)
23
23
  assert_search "product", ["Product A", "Product B"]
24
24
  end
25
+
26
+ def test_async
27
+ skip unless defined?(ActiveJob) && defined?(ActiveRecord)
28
+
29
+ Searchkick.callbacks(false) do
30
+ store_names ["Product A"]
31
+ end
32
+ reindex = Product.reindex(async: true)
33
+ assert_search "product", []
34
+
35
+ index = Searchkick::Index.new(reindex[:index_name])
36
+ index.refresh
37
+ assert_equal 1, index.total_docs
38
+
39
+ Product.searchkick_index.promote(reindex[:index_name])
40
+ assert_search "product", ["Product A"]
41
+ end
25
42
  end
data/test/test_helper.rb CHANGED
@@ -372,7 +372,8 @@ class Animal
372
372
  searchkick \
373
373
  text_start: [:name],
374
374
  suggest: [:name],
375
- index_name: -> { "#{name.tableize}-#{Date.today.year}" }
375
+ index_name: -> { "#{name.tableize}-#{Date.today.year}" },
376
+ callbacks: defined?(ActiveJob) ? :async : true
376
377
  # wordnet: true
377
378
  end
378
379
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: searchkick
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-09 00:00:00.000000000 Z
11
+ date: 2017-01-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -111,6 +111,7 @@ files:
111
111
  - benchmark/Gemfile
112
112
  - benchmark/benchmark.rb
113
113
  - lib/searchkick.rb
114
+ - lib/searchkick/bulk_reindex_job.rb
114
115
  - lib/searchkick/index.rb
115
116
  - lib/searchkick/index_options.rb
116
117
  - lib/searchkick/indexer.rb