hekenga 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 55037feb4349d723dd34938f3eb9aab8431bad7f287081b685705c6a6d40b046
4
- data.tar.gz: 896bdc203b3f30c543a4f7265bdea16e128d64ed59456e32cecb683a789f684c
3
+ metadata.gz: f21c3c1cb0e45c3b9eb2627fc960d032ff981e01b239dc69c02d2aebc1f7b539
4
+ data.tar.gz: 62fbbd8a65bae8bacc537bcc40cc4a3960b795ebb19a13c34b8ebd539eb102d0
5
5
  SHA512:
6
- metadata.gz: 69a634184500d3c5cca3298a3793c0e5f496d31f935af6265840e1ab7e576de9bc06623cff8c03738fed9f2028146d460fd7ed04cd71e3f684c890a792d6986e
7
- data.tar.gz: 2b916adfaa97a95fa645cd6442aefacd5125f78c2b62ff1b13bfa237e373998ecd44f0c88bdc0b7a239afa9e8f25e2bfb2cc05a4de11f142898c42ce963f246c
6
+ metadata.gz: 6317b298a05085564cfaaee1beef6b5749e83f8bc877538b448db66b50c3dde50f0380a5fa7f178dff7feca6840564c486f263891a838d4ed51617b7ff4f8698
7
+ data.tar.gz: 12ba1c564acca2f7d43c80a384a0767b00592abf5126d0016a0bf7cadf610e6d342890cd54597a602654e8449491135a3dda488bb7c1c8aaeb72d035bdc7c5d6
data/CHANGELOG.md CHANGED
@@ -1,5 +1,26 @@
1
1
  # Changelog
2
2
 
3
+ ## v2.0.0
4
+
5
+ - `Hekenga::Iterator` has been replaced by `Hekenga::IdIterator`. If any
6
+ selector or sort is set on a document task migration scope, it no longer forces an
7
+ ascending ID sort. This should help to prevent index misses, though there is a
8
+ tradeoff that documents being concurrently updated may be skipped or
9
+ processed multiple times. Hekenga tries to guard against processing multiple
10
+ times. Manually specifying an `asc(:_id)` on your scope will continue to
11
+ process documents in ID order.
12
+ - Document tasks now support a new option, `cursor_timeout`. This is the maximum
13
+ time allowed for iterating a document task's `scope` and queueing its jobs. The
14
+ default is one day.
15
+
16
+ ## v1.1.0
17
+
18
+ - `setup` is now passed the current batch of documents so it can be used to
19
+ preload effectively
20
+ - `after` has been added - this is a block of code that will be passed the
21
+ current batch of written documents after a write has been completed. If
22
+ no documents were written, this block will not be called
23
+
3
24
  ## v1.0.4
4
25
 
5
26
  - Log errors during document tasks to stdout
data/README.md CHANGED
@@ -3,9 +3,6 @@
3
3
  An attempt at a migration framework for MongoDB that supports parallel document
4
4
  processing via ActiveJob, chained jobs and error recovery.
5
5
 
6
- **Note that this gem is currently in pre-alpha - assume most things have a high
7
- chance of being broken.**
8
-
9
6
  ## Installation
10
7
 
11
8
  Add this line to your application's Gemfile:
data/docker-compose.yml CHANGED
@@ -8,7 +8,7 @@ networks:
8
8
 
9
9
  services:
10
10
  mongo:
11
- image: mongo:5
11
+ image: mongo:6
12
12
  command: ["--replSet", "rs0", "--bind_ip", "localhost,mongo"]
13
13
  volumes:
14
14
  - mongo:/data/db
@@ -18,7 +18,7 @@ services:
18
18
  - hekenga-net
19
19
 
20
20
  mongosetup:
21
- image: mongo:5
21
+ image: mongo:6
22
22
  depends_on:
23
23
  - mongo
24
24
  restart: "no"
@@ -0,0 +1,24 @@
1
module Hekenga
  # Shared behaviour for iterators that walk a document task's scope.
  #
  # Includes Enumerable but does not define #each itself; subclasses are
  # expected to supply it on top of #iteration_scope.
  class BaseIterator
    include Enumerable
    DEFAULT_TIMEOUT = 86_400 # 1 day in seconds

    attr_reader :cursor_timeout

    # @param scope [#selector, #options, #asc, #max_time_ms] criteria-like
    #   object that will be iterated
    # @param cursor_timeout [Numeric, nil] maximum iteration time in seconds;
    #   nil falls back to DEFAULT_TIMEOUT instead of failing later on `nil * 1000`
    def initialize(scope:, cursor_timeout: DEFAULT_TIMEOUT)
      @scope = scope
      @cursor_timeout = cursor_timeout || DEFAULT_TIMEOUT
    end

    private

    # Returns the scope to iterate, with the time limit applied.
    #
    # A scope without any selector or options gets an ascending _id sort;
    # any other scope is used exactly as given.
    def iteration_scope
      base =
        if @scope.selector.blank? && @scope.options.blank?
          # Apply a default _id sort, it works the best
          @scope.asc(:_id)
        else
          @scope
        end

      # Convert seconds to whole milliseconds (guards against Float timeouts)
      base.max_time_ms((cursor_timeout * 1000).to_i)
    end
  end
end
@@ -1,8 +1,9 @@
1
1
  require 'hekenga/irreversible'
2
+ require 'hekenga/base_iterator'
2
3
  module Hekenga
3
4
  class DocumentTask
4
- attr_reader :ups, :downs, :setups, :filters
5
- attr_accessor :parallel, :scope, :timeless, :batch_size
5
+ attr_reader :ups, :downs, :setups, :filters, :after_callbacks
6
+ attr_accessor :parallel, :scope, :timeless, :batch_size, :cursor_timeout
6
7
  attr_accessor :description, :invalid_strategy, :skip_prepare, :write_strategy
7
8
  attr_accessor :always_write, :use_transaction
8
9
 
@@ -11,12 +12,14 @@ module Hekenga
11
12
  @downs = []
12
13
  @setups = []
13
14
  @filters = []
15
+ @after_callbacks = []
14
16
  @invalid_strategy = :continue
15
17
  @write_strategy = :update
16
18
  @skip_prepare = false
17
19
  @batch_size = nil
18
20
  @always_write = false
19
21
  @use_transaction = false
22
+ @cursor_timeout = Hekenga::BaseIterator::DEFAULT_TIMEOUT
20
23
  end
21
24
 
22
25
  def validate!
@@ -28,6 +28,7 @@ module Hekenga
28
28
  # In test mode, the transaction will be aborted - so we need to write
29
29
  # the result outside of the run! block
30
30
  write_result if task_record.test_mode
31
+ after_callback
31
32
  end
32
33
  end
33
34
 
@@ -206,7 +207,7 @@ module Hekenga
206
207
  @context = Hekenga::Context.new(test_mode: task_record.test_mode)
207
208
  begin
208
209
  task.setups&.each do |setup|
209
- @context.instance_exec(&setup)
210
+ @context.instance_exec(records, &setup)
210
211
  end
211
212
  rescue => e
212
213
  fail_and_cancel!(e)
@@ -217,6 +218,18 @@ module Hekenga
217
218
  @context = nil
218
219
  end
219
220
 
221
# Runs each of the task's `after` callbacks against the current context,
# passing the records that were queued for writing. Does nothing when there
# are no records to write.
def after_callback
  unless records_to_write.empty?
    task.after_callbacks&.each { |hook| @context.instance_exec(records_to_write, &hook) }
  end
rescue => e
  # Error is just printed for now as we've already migrated, don't
  # want to overwrite the task record state
  print_error(e)
end
232
+
220
233
  def fail_and_cancel!(error)
221
234
  log = migration.log(task_idx)
222
235
  log.add_failure({
@@ -28,6 +28,10 @@ module Hekenga
28
28
  @object.write_strategy = strategy
29
29
  end
30
30
 
31
+ def cursor_timeout(timeout)
32
+ @object.cursor_timeout = timeout.to_i
33
+ end
34
+
31
35
  def scope(scope)
32
36
  @object.scope = scope
33
37
  end
@@ -67,6 +71,10 @@ module Hekenga
67
71
  def down(&block)
68
72
  @object.downs.push block
69
73
  end
74
+
75
+ def after(&block)
76
+ @object.after_callbacks.push block
77
+ end
70
78
  end
71
79
  end
72
80
  end
@@ -0,0 +1,34 @@
1
+ require "hekenga/base_iterator"
2
module Hekenga
  # Iterates a scope and yields only one property (by default the _id) of
  # each document, rather than the full document.
  class IdIterator < BaseIterator
    DEFAULT_ID = "_id".freeze

    attr_reader :id_property

    # @param id_property [String] name of the field yielded for each document
    # @param kwargs [Hash] forwarded unchanged to BaseIterator#initialize
    def initialize(id_property: DEFAULT_ID, **kwargs)
      super(**kwargs)
      @id_property = id_property
    end

    # Yields the `id_property` value of every document in the scope.
    #
    # @return [Enumerator] when called without a block, so that no view is
    #   opened only to fail on the first `yield`
    def each
      return enum_for(:each) unless block_given?

      with_view do |view|
        view.each { |doc| yield doc[id_property] }
      end
    end

    private

    # Yields the scope's view and closes its query afterwards, including
    # when the block raises.
    def with_view
      view = iteration_scope.view
      yield view
    ensure
      # `view` is still nil if building it raised; calling close_query on nil
      # would replace the original error with a NoMethodError.
      view&.close_query
    end

    # Restricts the parent's iteration scope to the id property only.
    def iteration_scope
      super.only(id_property)
    end
  end
end
@@ -2,6 +2,7 @@ require 'hekenga/invalid'
2
2
  require 'hekenga/context'
3
3
  require 'hekenga/parallel_job'
4
4
  require 'hekenga/parallel_task'
5
+ require 'hekenga/mongoid_iterator'
5
6
  require 'hekenga/master_process'
6
7
  require 'hekenga/document_task_record'
7
8
  require 'hekenga/document_task_executor'
@@ -132,18 +133,18 @@ module Hekenga
132
133
  records = []
133
134
  task_records(task_idx).delete_all unless recover
134
135
  executor_key = BSON::ObjectId.new
135
- task.scope.asc(:_id).no_timeout.each do |record|
136
+ Hekenga::MongoidIterator.new(scope: task.scope, cursor_timeout: task.cursor_timeout).each do |record|
136
137
  records.push(record)
137
138
  next unless records.length == (task.batch_size || batch_size)
138
139
 
139
- records = filter_out_processed(task, task_idx, records) if recover
140
+ records = filter_out_processed(task, task_idx, records)
140
141
  next unless records.length == (task.batch_size || batch_size)
141
142
 
142
143
  execute_document_task(task_idx, executor_key, records)
143
144
  records = []
144
145
  return if log.cancel
145
146
  end
146
- records = filter_out_processed(task, task_idx, records) if recover
147
+ records = filter_out_processed(task, task_idx, records)
147
148
  execute_document_task(task_idx, executor_key, records) if records.any?
148
149
  return if log.cancel
149
150
  log_done!
@@ -0,0 +1,8 @@
1
+ require "hekenga/base_iterator"
2
module Hekenga
  # Iterator that hands the caller's block straight to the iteration scope,
  # so each element is whatever the scope itself yields.
  class MongoidIterator < BaseIterator
    # Delegates iteration (and the given block, if any) to the scope.
    def each(&block)
      criteria = iteration_scope
      criteria.each(&block)
    end
  end
end
@@ -1,4 +1,4 @@
1
- require 'hekenga/iterator'
1
+ require 'hekenga/id_iterator'
2
2
  require 'hekenga/document_task_executor'
3
3
  require 'hekenga/task_splitter'
4
4
 
@@ -15,13 +15,13 @@ module Hekenga
15
15
 
16
16
  def start!
17
17
  clear_task_records!
18
- @executor_key = BSON::ObjectId.new
18
+ regenerate_executor_key
19
19
  generate_for_scope(task.scope)
20
20
  check_for_completion!
21
21
  end
22
22
 
23
23
  def resume!
24
- @executor_key = BSON::ObjectId.new
24
+ regenerate_executor_key
25
25
  task_records.set(executor_key: @executor_key)
26
26
  queue_jobs!(task_records.incomplete)
27
27
  generate_new_records!
@@ -41,16 +41,43 @@ module Hekenga
41
41
 
42
42
  private
43
43
 
44
+ def regenerate_executor_key
45
+ @executor_key = BSON::ObjectId.new
46
+ end
47
+
44
48
  def generate_for_scope(scope)
45
- Hekenga::Iterator.new(scope, size: 100_000).each do |id_block|
46
- task_records = id_block.each_slice(batch_size).map do |id_slice|
47
- generate_task_records!(id_slice)
48
- end
49
+ Hekenga::IdIterator.new(
50
+ scope: scope,
51
+ cursor_timeout: task.cursor_timeout
52
+ # Yields blocks of up to enqueue_size slices, each holding up to batch_size IDs
53
+ ).each_slice(batch_size).each_slice(enqueue_size) do |id_block|
54
+ sanitize_id_block!(id_block)
55
+ task_records = id_block.reject(&:empty?).map(&method(:generate_task_record!))
49
56
  write_task_records!(task_records)
50
57
  queue_jobs!(task_records)
51
58
  end
52
59
  end
53
60
 
61
+ def enqueue_size
62
+ 500 # task records written + enqueued at a time
63
+ end
64
+
65
# Removes, in place, IDs that must not be queued again from a block of
# ID slices.
#
# Custom ordering on the cursor combined with parallel updates may result in
# the same ID getting yielded into the migration multiple times. An ID is
# dropped when it already appears in an existing task record, or when it
# occurred earlier in this same block. The second check is needed because
# the task records for this block have not been written yet at this point,
# so the lookup alone cannot see them.
#
# Skipped entirely when the task scope has neither options nor a selector.
#
# @param id_block [Array<Array>] slices of IDs; the slices are mutated and
#   may end up empty
def sanitize_id_block!(id_block)
  scope = task.scope
  return if scope.options.blank? && scope.selector.blank?

  # Start from every ID already recorded for any of the candidate IDs
  seen = task_records.in(ids: id_block.flatten).pluck(:ids).flatten.to_set

  id_block.each do |id_slice|
    # Set#add? returns nil when the ID was already present
    id_slice.reject! { |id| !seen.add?(id) }
  end
end
80
+
54
81
  def generate_new_records!
55
82
  last_record = task_records.desc(:_id).first
56
83
  last_id = last_record&.ids&.last
@@ -83,7 +110,7 @@ module Hekenga
83
110
  migration.task_records(task_idx)
84
111
  end
85
112
 
86
- def generate_task_records!(id_slice)
113
+ def generate_task_record!(id_slice)
87
114
  Hekenga::DocumentTaskRecord.new(
88
115
  migration_key: migration.to_key,
89
116
  task_idx: task_idx,
@@ -48,10 +48,11 @@ module Hekenga
48
48
  # #skip_prepare!
49
49
  # #batch_size 25
50
50
  # #write_strategy :update # :delete_then_insert
51
+ # #cursor_timeout 86_400 # max allowed time for the cursor to survive, in seconds
51
52
  #
52
53
  # # Called once per batch, instance variables will be accessible
53
- # # in the filter & up blocks
54
- # #setup do
54
+ # # in the filter, up and after blocks
55
+ # #setup do |docs|
55
56
  # #end
56
57
  #
57
58
  # #filter do |doc|
@@ -59,6 +60,10 @@ module Hekenga
59
60
  #
60
61
  # up do |doc|
61
62
  # end
63
+ #
64
+ # # Called once per batch passing successfully written records
65
+ # #after do |docs|
66
+ # #end
62
67
  #end
63
68
  end
64
69
  EOF
@@ -1,3 +1,3 @@
1
1
  module Hekenga
2
- VERSION = "1.0.4"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hekenga
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tapio Saarinen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-11-06 00:00:00.000000000 Z
11
+ date: 2024-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -159,6 +159,7 @@ files:
159
159
  - hekenga.gemspec
160
160
  - lib/hekenga.rb
161
161
  - lib/hekenga/base_error.rb
162
+ - lib/hekenga/base_iterator.rb
162
163
  - lib/hekenga/config.rb
163
164
  - lib/hekenga/context.rb
164
165
  - lib/hekenga/document_task.rb
@@ -173,12 +174,13 @@ files:
173
174
  - lib/hekenga/failure/error.rb
174
175
  - lib/hekenga/failure/validation.rb
175
176
  - lib/hekenga/failure/write.rb
177
+ - lib/hekenga/id_iterator.rb
176
178
  - lib/hekenga/invalid.rb
177
179
  - lib/hekenga/irreversible.rb
178
- - lib/hekenga/iterator.rb
179
180
  - lib/hekenga/log.rb
180
181
  - lib/hekenga/master_process.rb
181
182
  - lib/hekenga/migration.rb
183
+ - lib/hekenga/mongoid_iterator.rb
182
184
  - lib/hekenga/parallel_job.rb
183
185
  - lib/hekenga/parallel_task.rb
184
186
  - lib/hekenga/scaffold.rb
@@ -1,26 +0,0 @@
1
- module Hekenga
2
- class Iterator
3
- include Enumerable
4
-
5
- SMALLEST_ID = BSON::ObjectId.from_string('0'*24)
6
-
7
- attr_reader :scope, :size
8
-
9
- def initialize(scope, size:)
10
- @scope = scope
11
- @size = size
12
- end
13
-
14
- def each(&block)
15
- current_id = SMALLEST_ID
16
- base_scope = scope.asc(:_id).limit(size)
17
-
18
- loop do
19
- ids = base_scope.and(_id: {'$gt': current_id}).pluck(:_id)
20
- break if ids.empty?
21
- yield ids
22
- current_id = ids.sort.last
23
- end
24
- end
25
- end
26
- end