searchkick 2.0.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 01c193778cc86142dca494a2aa0b0ac8c6474d0b
4
- data.tar.gz: 7ff77b2f385a8004acaf1dabb4935f90d61c26f8
3
+ metadata.gz: 63b182684f14e8e5c0b30e0505a188d7bbc06f85
4
+ data.tar.gz: 9227d3877f3b6cf944a3413fe4478ec17393bc0c
5
5
  SHA512:
6
- metadata.gz: cbeefdbb0c40dfb0c32789e7468d3b570c7014fe9b27a202fca7b310820fd3c223264e360456ea68ca5bb13378e3770f448b3237abb9f61385f22d72942b09b0
7
- data.tar.gz: 199fb639bcc0d883e87848009e35d2ed0e62e491c5ada17d0eeab3844539513f27c09af46ebb17051f402973dabed86e46e8d6bd08e18b9b3dc43ce3ac2d74e0
6
+ metadata.gz: '08f71c79a49fa5c0c43d4d1d489b0040bfec789edd6c3e3737b48fd749a7dc1ee4d7b2337aa0e57db35a29a20c14ce13c5a0b653441151f64f2241b59f69a4f5'
7
+ data.tar.gz: 9f864b9b4d890a1a220364abd8e6d0376d06e90d853d34201630f3599bc580a5fee3cf41604b79cbed7bf25ee5524352056119cd9c4a8b0883017476639523da
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 2.0.3
2
+
3
+ - Added `async` option to `reindex` [experimental]
4
+ - Added `misspellings?` method to results
5
+
1
6
  ## 2.0.2
2
7
 
3
8
  - Added `retain` option to `reindex`
data/README.md CHANGED
@@ -253,12 +253,16 @@ Available options are:
253
253
 
254
254
  ### Exact Matches
255
255
 
256
+ To match a field exactly (case-insensitive), use:
257
+
256
258
  ```ruby
257
259
  User.search query, fields: [{email: :exact}, :name]
258
260
  ```
259
261
 
260
262
  ### Phrase Matches
261
263
 
264
+ To match only results that contain the words in the exact order given, use:
265
+
262
266
  ```ruby
263
267
  User.search "fresh honey", match: :phrase
264
268
  ```
@@ -1152,6 +1156,37 @@ Reindex and search with:
1152
1156
  Business.search "ice cream", routing: params[:city_id]
1153
1157
  ```
1154
1158
 
1159
+ ## Large Data Sets
1160
+
1161
+ ### Background Reindexing [experimental, ActiveRecord only]
1162
+
1163
+ For large data sets, you can use background jobs to parallelize reindexing.
1164
+
1165
+ ```ruby
1166
+ Product.reindex(async: true)
1167
+ # {index_name: "products_production_20170111210018065"}
1168
+ ```
1169
+
1170
+ Once the jobs complete, promote the new index with:
1171
+
1172
+ ```ruby
1173
+ Product.searchkick_index.promote(index_name)
1174
+ ```
1175
+
1176
+ You can optionally track the status with Redis:
1177
+
1178
+ ```ruby
1179
+ Searchkick.redis = Redis.new
1180
+ ```
1181
+
1182
+ Then check the status of a reindex with:
1183
+
1184
+ ```ruby
1185
+ Searchkick.reindex_status(index_name)
1186
+ ```
1187
+
1188
+ For more tips, check out [Keeping Elasticsearch in Sync](https://www.elastic.co/blog/found-keeping-elasticsearch-in-sync).
1189
+
1155
1190
  ## Advanced
1156
1191
 
1157
1192
  Prefer to use the [Elasticsearch DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-queries.html) but still want awesome features like zero-downtime reindexing?
@@ -1453,10 +1488,6 @@ Product.search "api", misspellings: {prefix_length: 2} # api, apt, no ahi
1453
1488
  Product.search "ah", misspellings: {prefix_length: 2} # ah, no aha
1454
1489
  ```
1455
1490
 
1456
- ## Large Data Sets
1457
-
1458
- For large data sets, check out [Keeping Elasticsearch in Sync](https://www.elastic.co/blog/found-keeping-elasticsearch-in-sync). Searchkick will make this easy in the future.
1459
-
1460
1491
  ## Testing
1461
1492
 
1462
1493
  This section could use some love.
data/benchmark/Gemfile CHANGED
@@ -3,9 +3,13 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in searchkick.gemspec
4
4
  gemspec path: "../"
5
5
 
6
- gem "sqlite3"
6
+ # gem "sqlite3"
7
+ gem "pg"
7
8
  gem "activerecord", "~> 5.0.0"
8
9
  gem "activerecord-import"
10
+ gem "activejob"
11
+ gem "redis"
12
+ gem "sidekiq"
9
13
 
10
14
  # performance
11
15
  gem "typhoeus"
@@ -2,19 +2,27 @@ require "bundler/setup"
2
2
  Bundler.require(:default)
3
3
  require "active_record"
4
4
  require "benchmark"
5
+ require "active_support/notifications"
6
+
7
+ ActiveSupport::Notifications.subscribe "request.searchkick" do |*args|
8
+ event = ActiveSupport::Notifications::Event.new(*args)
9
+ p event.duration
10
+ end
11
+
12
+ ActiveJob::Base.queue_adapter = :sidekiq
13
+
14
+ Searchkick.redis = Redis.new
5
15
 
6
16
  ActiveRecord::Base.default_timezone = :utc
7
17
  ActiveRecord::Base.time_zone_aware_attributes = true
8
- ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:"
18
+ # ActiveRecord::Base.establish_connection adapter: "sqlite3", database: "/tmp/searchkick"
19
+ ActiveRecord::Base.establish_connection "postgresql://localhost/searchkick_demo_development"
20
+ # ActiveRecord::Base.logger = Logger.new(STDOUT)
9
21
 
10
- ActiveRecord::Migration.create_table :products do |t|
11
- t.string :name
12
- t.string :color
13
- t.integer :store_id
14
- end
22
+ ActiveJob::Base.logger = nil
15
23
 
16
24
  class Product < ActiveRecord::Base
17
- searchkick batch_size: 100
25
+ searchkick batch_size: 1000
18
26
 
19
27
  def search_data
20
28
  {
@@ -25,7 +33,15 @@ class Product < ActiveRecord::Base
25
33
  end
26
34
  end
27
35
 
28
- Product.import ["name", "color", "store_id"], 20000.times.map { |i| ["Product #{i}", ["red", "blue"].sample, rand(10)] }
36
+ total_docs = 100000
37
+
38
+ # ActiveRecord::Migration.create_table :products, force: :cascade do |t|
39
+ # t.string :name
40
+ # t.string :color
41
+ # t.integer :store_id
42
+ # end
43
+
44
+ # Product.import ["name", "color", "store_id"], total_docs.times.map { |i| ["Product #{i}", ["red", "blue"].sample, rand(10)] }
29
45
 
30
46
  puts "Imported"
31
47
 
@@ -35,19 +51,37 @@ stats = nil
35
51
 
36
52
  # p GetProcessMem.new.mb
37
53
 
54
+ Product.searchkick_index.delete rescue nil
55
+
38
56
  time =
39
57
  Benchmark.realtime do
40
58
  # result = RubyProf.profile do
41
59
  # report = MemoryProfiler.report do
42
60
  # stats = AllocationStats.trace do
43
- Product.reindex
61
+ reindex = Product.reindex(async: true)
62
+ p reindex
44
63
  # end
64
+
65
+ 60.times do |i|
66
+ if reindex.is_a?(Hash)
67
+ docs = Searchkick::Index.new(reindex[:index_name]).total_docs
68
+ else
69
+ docs = Product.searchkick_index.total_docs
70
+ end
71
+ puts "#{i}: #{docs}"
72
+ if docs == total_docs
73
+ break
74
+ end
75
+ p Searchkick.reindex_status(reindex[:index_name]) if reindex.is_a?(Hash)
76
+ sleep(1)
77
+ # Product.searchkick_index.refresh
78
+ end
45
79
  end
46
80
 
47
81
  # p GetProcessMem.new.mb
48
82
 
49
83
  puts time.round(1)
50
- puts Product.searchkick_index.total_docs
84
+
51
85
 
52
86
  if result
53
87
  printer = RubyProf::GraphPrinter.new(result)
@@ -0,0 +1,12 @@
1
+ module Searchkick
2
+ class BulkReindexJob < ActiveJob::Base
3
+ queue_as :searchkick
4
+
5
+ def perform(class_name:, record_ids: nil, index_name: nil, method_name: nil, batch_id: nil, min_id: nil, max_id: nil)
6
+ klass = class_name.constantize
7
+ index = index_name ? Searchkick::Index.new(index_name) : klass.searchkick_index
8
+ record_ids ||= min_id..max_id
9
+ index.import_scope(klass.where(klass.primary_key => record_ids), method_name: method_name, batch: true, batch_id: batch_id)
10
+ end
11
+ end
12
+ end
@@ -38,6 +38,10 @@ module Searchkick
38
38
  client.indices.get_settings index: name
39
39
  end
40
40
 
41
+ def update_settings(settings)
42
+ client.indices.put_settings index: name, body: settings
43
+ end
44
+
41
45
  def promote(new_name)
42
46
  old_indices =
43
47
  begin
@@ -187,7 +191,7 @@ module Searchkick
187
191
 
188
192
  # https://gist.github.com/jarosan/3124884
189
193
  # http://www.elasticsearch.org/blog/changing-mapping-with-zero-downtime/
190
- def reindex_scope(scope, import: true, resume: false, retain: false)
194
+ def reindex_scope(scope, import: true, resume: false, retain: false, async: false)
191
195
  if resume
192
196
  index_name = all_indices.sort.last
193
197
  raise Searchkick::Error, "No index to resume" unless index_name
@@ -201,30 +205,58 @@ module Searchkick
201
205
  # check if alias exists
202
206
  if alias_exists?
203
207
  # import before promotion
204
- index.import_scope(scope, resume: resume) if import
208
+ index.import_scope(scope, resume: resume, async: async, full: true) if import
205
209
 
206
210
  # get existing indices to remove
207
- promote(index.name)
208
- clean_indices unless retain
211
+ unless async
212
+ promote(index.name)
213
+ clean_indices unless retain
214
+ end
209
215
  else
210
216
  delete if exists?
211
217
  promote(index.name)
212
218
 
213
219
  # import after promotion
214
- index.import_scope(scope, resume: resume) if import
220
+ index.import_scope(scope, resume: resume, async: async, full: true) if import
215
221
  end
216
222
 
217
- index.refresh
218
-
219
- true
223
+ if async
224
+ {index_name: index.name}
225
+ else
226
+ index.refresh
227
+ true
228
+ end
220
229
  end
221
230
 
222
- def import_scope(scope, resume: false, method_name: nil)
231
+ def import_scope(scope, resume: false, method_name: nil, async: false, batch: false, batch_id: nil, full: false)
223
232
  batch_size = @options[:batch_size] || 1000
224
233
 
225
234
  # use scope for import
226
235
  scope = scope.search_import if scope.respond_to?(:search_import)
227
- if scope.respond_to?(:find_in_batches)
236
+
237
+ if batch
238
+ import_or_update scope.to_a, method_name, async
239
+ Searchkick.redis.srem(batches_key, batch_id) if batch_id && Searchkick.redis
240
+ elsif full && async
241
+ # TODO expire Redis key
242
+ primary_key = scope.primary_key
243
+ starting_id = scope.minimum(primary_key)
244
+ max_id = scope.maximum(primary_key)
245
+ batches_count = ((max_id - starting_id + 1) / batch_size.to_f).ceil
246
+
247
+ batches_count.times do |i|
248
+ batch_id = i + 1
249
+ min_id = starting_id + (i * batch_size)
250
+ Searchkick::BulkReindexJob.perform_later(
251
+ class_name: scope.model_name.name,
252
+ min_id: min_id,
253
+ max_id: min_id + batch_size - 1,
254
+ index_name: name,
255
+ batch_id: batch_id
256
+ )
257
+ Searchkick.redis.sadd(batches_key, batch_id) if Searchkick.redis
258
+ end
259
+ elsif scope.respond_to?(:find_in_batches)
228
260
  if resume
229
261
  # use total docs instead of max id since there's not a great way
230
262
  # to get the max _id without scripting since it's a string
@@ -233,8 +265,10 @@ module Searchkick
233
265
  scope = scope.where("id > ?", total_docs)
234
266
  end
235
267
 
268
+ scope = scope.select("id").except(:includes, :preload) if async
269
+
236
270
  scope.find_in_batches batch_size: batch_size do |batch|
237
- import_or_update batch.select(&:should_index?), method_name
271
+ import_or_update batch, method_name, async
238
272
  end
239
273
  else
240
274
  # https://github.com/karmi/tire/blob/master/lib/tire/model/import.rb
@@ -242,27 +276,18 @@ module Searchkick
242
276
  items = []
243
277
  # TODO add resume
244
278
  scope.all.each do |item|
245
- items << item if item.should_index?
279
+ items << item
246
280
  if items.length == batch_size
247
- import_or_update items, method_name
281
+ import_or_update items, method_name, async
248
282
  items = []
249
283
  end
250
284
  end
251
- import_or_update items, method_name
285
+ import_or_update items, method_name, async
252
286
  end
253
287
  end
254
288
 
255
- def import_or_update(records, method_name)
256
- retries = 0
257
- begin
258
- method_name ? bulk_update(records, method_name) : import(records)
259
- rescue Faraday::ClientError => e
260
- if retries < 1
261
- retries += 1
262
- retry
263
- end
264
- raise e
265
- end
289
+ def batches_left
290
+ Searchkick.redis.scard(batches_key) if Searchkick.redis
266
291
  end
267
292
 
268
293
  # other
@@ -373,5 +398,34 @@ module Searchkick
373
398
  obj
374
399
  end
375
400
  end
401
+
402
+ def import_or_update(records, method_name, async)
403
+ if records.any?
404
+ if async
405
+ Searchkick::BulkReindexJob.perform_later(
406
+ class_name: records.first.class.name,
407
+ record_ids: records.map(&:id),
408
+ index_name: name,
409
+ method_name: method_name ? method_name.to_s : nil
410
+ )
411
+ else
412
+ retries = 0
413
+ records = records.select(&:should_index?)
414
+ begin
415
+ method_name ? bulk_update(records, method_name) : import(records)
416
+ rescue Faraday::ClientError => e
417
+ if retries < 1
418
+ retries += 1
419
+ retry
420
+ end
421
+ raise e
422
+ end
423
+ end
424
+ end
425
+ end
426
+
427
+ def batches_key
428
+ "searchkick:reindex:#{name}:batches"
429
+ end
376
430
  end
377
431
  end
@@ -58,15 +58,16 @@ module Searchkick
58
58
  # update
59
59
  searchkick_index.import_scope(searchkick_klass, method_name: method_name)
60
60
  searchkick_index.refresh if refresh
61
+ true
61
62
  elsif scoped && !full
62
63
  # reindex association
63
64
  searchkick_index.import_scope(searchkick_klass)
64
65
  searchkick_index.refresh if refresh
66
+ true
65
67
  else
66
68
  # full reindex
67
69
  searchkick_index.reindex_scope(searchkick_klass, options)
68
70
  end
69
- true
70
71
  end
71
72
  alias_method :reindex, :searchkick_reindex unless method_defined?(:reindex)
72
73
 
@@ -83,17 +84,27 @@ module Searchkick
83
84
  after_destroy callback_name, if: proc { self.class.search_callbacks? }
84
85
  end
85
86
 
86
- def reindex(method_name = nil, refresh: false)
87
- if method_name
88
- self.class.searchkick_index.bulk_update([self], method_name)
87
+ def reindex(method_name = nil, refresh: false, async: false)
88
+ if async
89
+ if method_name
90
+ # TODO support Mongoid and NoBrainer and non-id primary keys
91
+ Searchkick::BulkReindexJob.perform_later(class_name: self.class.name, record_ids: [id.to_s], method_name: method_name ? method_name.to_s : nil)
92
+ else
93
+ self.class.searchkick_index.reindex_record_async(self)
94
+ end
89
95
  else
90
- self.class.searchkick_index.reindex_record(self)
96
+ if method_name
97
+ self.class.searchkick_index.bulk_update([self], method_name)
98
+ else
99
+ self.class.searchkick_index.reindex_record(self)
100
+ end
101
+ self.class.searchkick_index.refresh if refresh
91
102
  end
92
- self.class.searchkick_index.refresh if refresh
93
103
  end unless method_defined?(:reindex)
94
104
 
105
+ # TODO remove this method in next major version
95
106
  def reindex_async
96
- self.class.searchkick_index.reindex_record_async(self)
107
+ reindex(async: true)
97
108
  end unless method_defined?(:reindex_async)
98
109
 
99
110
  def similar(options = {})
@@ -34,6 +34,7 @@ module Searchkick
34
34
  # prevent Ruby warnings
35
35
  @type = nil
36
36
  @routing = nil
37
+ @misspellings = false
37
38
  @misspellings_below = nil
38
39
  @highlighted_fields = nil
39
40
 
@@ -107,7 +108,8 @@ module Searchkick
107
108
  includes: options[:includes],
108
109
  json: !@json.nil?,
109
110
  match_suffix: @match_suffix,
110
- highlighted_fields: @highlighted_fields || []
111
+ highlighted_fields: @highlighted_fields || [],
112
+ misspellings: @misspellings
111
113
  }
112
114
 
113
115
  if options[:debug]
@@ -256,6 +258,9 @@ module Searchkick
256
258
  prefix_length = (misspellings.is_a?(Hash) && misspellings[:prefix_length]) || 0
257
259
  default_max_expansions = @misspellings_below ? 20 : 3
258
260
  max_expansions = (misspellings.is_a?(Hash) && misspellings[:max_expansions]) || default_max_expansions
261
+ @misspellings = true
262
+ else
263
+ @misspellings = false
259
264
  end
260
265
 
261
266
  fields.each do |field|
@@ -184,6 +184,10 @@ module Searchkick
184
184
  @response["hits"]["hits"]
185
185
  end
186
186
 
187
+ def misspellings?
188
+ @options[:misspellings]
189
+ end
190
+
187
191
  private
188
192
 
189
193
  def results_query(records, hits)
@@ -1,3 +1,3 @@
1
1
  module Searchkick
2
- VERSION = "2.0.2"
2
+ VERSION = "2.0.3"
3
3
  end
data/lib/searchkick.rb CHANGED
@@ -19,7 +19,10 @@ begin
19
19
  rescue LoadError
20
20
  # do nothing
21
21
  end
22
- require "searchkick/reindex_v2_job" if defined?(ActiveJob)
22
+ if defined?(ActiveJob)
23
+ require "searchkick/bulk_reindex_job"
24
+ require "searchkick/reindex_v2_job"
25
+ end
23
26
 
24
27
  module Searchkick
25
28
  class Error < StandardError; end
@@ -30,7 +33,7 @@ module Searchkick
30
33
  class ImportError < Error; end
31
34
 
32
35
  class << self
33
- attr_accessor :search_method_name, :wordnet_path, :timeout, :models, :client_options
36
+ attr_accessor :search_method_name, :wordnet_path, :timeout, :models, :client_options, :redis
34
37
  attr_writer :client, :env, :search_timeout
35
38
  attr_reader :aws_credentials
36
39
  end
@@ -129,6 +132,16 @@ module Searchkick
129
132
  @client = nil # reset client
130
133
  end
131
134
 
135
+ def self.reindex_status(index_name)
136
+ if redis
137
+ batches_left = Searchkick::Index.new(index_name).batches_left
138
+ {
139
+ completed: batches_left == 0,
140
+ batches_left: batches_left
141
+ }
142
+ end
143
+ end
144
+
132
145
  # private
133
146
  def self.indexer
134
147
  Thread.current[:searchkick_indexer] ||= Searchkick::Indexer.new
data/test/match_test.rb CHANGED
@@ -203,6 +203,11 @@ class MatchTest < Minitest::Test
203
203
  assert_search "fresh honey", ["Fresh Honey"], match: :phrase
204
204
  end
205
205
 
206
+ def test_phrase_order
207
+ store_names ["Wheat Bread", "Whole Wheat Bread"]
208
+ assert_order "wheat bread", ["Wheat Bread", "Whole Wheat Bread"], match: :phrase
209
+ end
210
+
206
211
  def test_unsearchable
207
212
  store [
208
213
  {name: "Unsearchable", description: "Almond"}
@@ -39,8 +39,18 @@ class MisspellingsTest < Minitest::Test
39
39
  assert_search "abc", ["abc", "abd"], misspellings: {below: 2}
40
40
  end
41
41
 
42
+ def test_misspellings_below_unmet_result
43
+ store_names ["abc", "abd", "aee"]
44
+ assert Product.search("abc", misspellings: {below: 2}).misspellings?
45
+ end
46
+
42
47
  def test_misspellings_below_met
43
48
  store_names ["abc", "abd", "aee"]
44
49
  assert_search "abc", ["abc"], misspellings: {below: 1}
45
50
  end
51
+
52
+ def test_misspellings_below_met_result
53
+ store_names ["abc", "abd", "aee"]
54
+ assert !Product.search("abc", misspellings: {below: 1}).misspellings?
55
+ end
46
56
  end
data/test/reindex_test.rb CHANGED
@@ -22,4 +22,21 @@ class ReindexTest < Minitest::Test
22
22
  store.products.reindex(refresh: true)
23
23
  assert_search "product", ["Product A", "Product B"]
24
24
  end
25
+
26
+ def test_async
27
+ skip unless defined?(ActiveJob) && defined?(ActiveRecord)
28
+
29
+ Searchkick.callbacks(false) do
30
+ store_names ["Product A"]
31
+ end
32
+ reindex = Product.reindex(async: true)
33
+ assert_search "product", []
34
+
35
+ index = Searchkick::Index.new(reindex[:index_name])
36
+ index.refresh
37
+ assert_equal 1, index.total_docs
38
+
39
+ Product.searchkick_index.promote(reindex[:index_name])
40
+ assert_search "product", ["Product A"]
41
+ end
25
42
  end
data/test/test_helper.rb CHANGED
@@ -372,7 +372,8 @@ class Animal
372
372
  searchkick \
373
373
  text_start: [:name],
374
374
  suggest: [:name],
375
- index_name: -> { "#{name.tableize}-#{Date.today.year}" }
375
+ index_name: -> { "#{name.tableize}-#{Date.today.year}" },
376
+ callbacks: defined?(ActiveJob) ? :async : true
376
377
  # wordnet: true
377
378
  end
378
379
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: searchkick
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-09 00:00:00.000000000 Z
11
+ date: 2017-01-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -111,6 +111,7 @@ files:
111
111
  - benchmark/Gemfile
112
112
  - benchmark/benchmark.rb
113
113
  - lib/searchkick.rb
114
+ - lib/searchkick/bulk_reindex_job.rb
114
115
  - lib/searchkick/index.rb
115
116
  - lib/searchkick/index_options.rb
116
117
  - lib/searchkick/indexer.rb