hekenga 1.1.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 94824a7ff42ed31e87c54b5093ad9b8a03b27b08ec5cc7c4f45491b71b3dcc79
-   data.tar.gz: ca0ffaded81560273fa6421de3dfdb9a0efce0fd62f95c410bd31004cb17a816
+   metadata.gz: f21c3c1cb0e45c3b9eb2627fc960d032ff981e01b239dc69c02d2aebc1f7b539
+   data.tar.gz: 62fbbd8a65bae8bacc537bcc40cc4a3960b795ebb19a13c34b8ebd539eb102d0
  SHA512:
-   metadata.gz: 8677a4cd8023f511971b54ac4458e7b230ee693246473d77b9e3237895ef95670aa157d0117f13223a0df4dde71430de9c940bfc89315422203e23ec3434218e
-   data.tar.gz: 69b879e26c4d7771377881d9e83d411fa3b0a2674ce851b5b9b74f323c1c2dec749218257766b27e467eb409708f221d9125a8451bd9d394b103e654e86785da
+   metadata.gz: 6317b298a05085564cfaaee1beef6b5749e83f8bc877538b448db66b50c3dde50f0380a5fa7f178dff7feca6840564c486f263891a838d4ed51617b7ff4f8698
+   data.tar.gz: 12ba1c564acca2f7d43c80a384a0767b00592abf5126d0016a0bf7cadf610e6d342890cd54597a602654e8449491135a3dda488bb7c1c8aaeb72d035bdc7c5d6
data/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
  # Changelog
  
+ ## v2.0.0
+
+ - `Hekenga::Iterator` has been replaced by `Hekenga::IdIterator`. If any
+   selector or sort is set on a document task's migration scope, Hekenga no
+   longer forces an ascending ID sort. This should help prevent index misses,
+   with the tradeoff that documents being updated concurrently may be skipped
+   or processed more than once. Hekenga tries to guard against processing a
+   document more than once. Manually specifying `asc(:_id)` on your scope will
+   continue to process documents in ID order.
+ - Document tasks now support a new option, `cursor_timeout`: the maximum time
+   a document task's `scope` may be iterated over (and jobs queued from it).
+   The default is one day.
+
  ## v1.1.0
  
  - `setup` is now passed the current batch of documents so it can be used to
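To make the v2.0.0 notes concrete, here is a minimal document task sketch. The `per_document`/`up` block structure and the `Widget` model are assumptions for illustration only; the `scope`, `batch_size` and `cursor_timeout` setters are the ones added or documented in this release.

```ruby
# Sketch only: per_document/up and the Widget model are assumed for illustration.
Hekenga.migration do
  description "Backfill widget slugs"

  per_document "backfill slugs" do
    # A filtered scope no longer forces an ascending _id sort in 2.0.0;
    # keep asc(:_id) if you still want strict ID ordering.
    scope Widget.where(slug: nil).asc(:_id)
    batch_size 100
    cursor_timeout 2 * 86_400 # seconds; lets the scope's cursor live for two days (default: one day)

    up do |doc|
      doc.slug = doc.name.parameterize
    end
  end
end
```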
data/README.md CHANGED
@@ -3,9 +3,6 @@
  An attempt at a migration framework for MongoDB that supports parallel document
  processing via ActiveJob, chained jobs and error recovery.
  
- **Note that this gem is currently in pre-alpha - assume most things have a high
- chance of being broken.**
-
  ## Installation
  
  Add this line to your application's Gemfile:
data/docker-compose.yml CHANGED
@@ -8,7 +8,7 @@ networks:
  
  services:
    mongo:
-     image: mongo:5
+     image: mongo:6
      command: ["--replSet", "rs0", "--bind_ip", "localhost,mongo"]
      volumes:
        - mongo:/data/db
@@ -18,7 +18,7 @@ services:
        - hekenga-net
  
    mongosetup:
-     image: mongo:5
+     image: mongo:6
      depends_on:
        - mongo
      restart: "no"
data/lib/hekenga/base_iterator.rb ADDED
@@ -0,0 +1,24 @@
+ module Hekenga
+   class BaseIterator
+     include Enumerable
+     DEFAULT_TIMEOUT = 86_400 # 1 day in seconds
+
+     attr_reader :cursor_timeout
+
+     def initialize(scope:, cursor_timeout: DEFAULT_TIMEOUT)
+       @scope = scope
+       @cursor_timeout = cursor_timeout
+     end
+
+     private
+
+     def iteration_scope
+       if @scope.selector.blank? && @scope.options.blank?
+         # Apply a default _id sort, it works the best
+         @scope.asc(:_id)
+       else
+         @scope
+       end.max_time_ms(cursor_timeout * 1000) # convert to ms
+     end
+   end
+ end
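The branch in `iteration_scope` only applies the default `_id` sort when the scope carries no selector and no options, and `cursor_timeout` is given in seconds before being handed to the driver as `max_time_ms(cursor_timeout * 1000)`. A quick Mongoid-level illustration (`Widget` is a stand-in model):

```ruby
Widget.all.selector.blank?                 # => true  -> default asc(:_id) sort is applied
Widget.where(active: true).selector.blank? # => false -> the scope is iterated as given
```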
data/lib/hekenga/document_task.rb CHANGED
@@ -1,8 +1,9 @@
  require 'hekenga/irreversible'
+ require 'hekenga/base_iterator'
  module Hekenga
    class DocumentTask
      attr_reader :ups, :downs, :setups, :filters, :after_callbacks
-     attr_accessor :parallel, :scope, :timeless, :batch_size
+     attr_accessor :parallel, :scope, :timeless, :batch_size, :cursor_timeout
      attr_accessor :description, :invalid_strategy, :skip_prepare, :write_strategy
      attr_accessor :always_write, :use_transaction
  
@@ -18,6 +19,7 @@ module Hekenga
        @batch_size = nil
        @always_write = false
        @use_transaction = false
+       @cursor_timeout = Hekenga::BaseIterator::DEFAULT_TIMEOUT
      end
  
      def validate!
@@ -28,6 +28,10 @@ module Hekenga
        @object.write_strategy = strategy
      end
  
+     def cursor_timeout(timeout)
+       @object.cursor_timeout = timeout.to_i
+     end
+
      def scope(scope)
        @object.scope = scope
      end
data/lib/hekenga/id_iterator.rb ADDED
@@ -0,0 +1,34 @@
+ require "hekenga/base_iterator"
+ module Hekenga
+   class IdIterator < BaseIterator
+     DEFAULT_ID = "_id".freeze
+
+     attr_reader :id_property
+
+     def initialize(id_property: DEFAULT_ID, **kwargs)
+       super(**kwargs)
+       @id_property = id_property
+     end
+
+     def each
+       with_view do |view|
+         view.each do |doc|
+           yield doc[id_property]
+         end
+       end
+     end
+
+     private
+
+     def with_view
+       view = iteration_scope.view
+       yield view
+     ensure
+       view.close_query
+     end
+
+     def iteration_scope
+       super.only(id_property)
+     end
+   end
+ end
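A usage sketch for the new iterator (`Widget` is a stand-in model): `IdIterator` projects the scope down to the ID field and yields raw `_id` values, so batching is plain `Enumerable` slicing, which is how `ParallelTask` consumes it below.

```ruby
iterator = Hekenga::IdIterator.new(scope: Widget.where(active: true), cursor_timeout: 3_600)

# Each slice is an array of up to 100 raw _id values, not documents.
iterator.each_slice(100) do |ids|
  puts "batch of #{ids.length} ids, first: #{ids.first}"
end
```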
data/lib/hekenga/migration.rb CHANGED
@@ -2,6 +2,7 @@ require 'hekenga/invalid'
  require 'hekenga/context'
  require 'hekenga/parallel_job'
  require 'hekenga/parallel_task'
+ require 'hekenga/mongoid_iterator'
  require 'hekenga/master_process'
  require 'hekenga/document_task_record'
  require 'hekenga/document_task_executor'
@@ -132,18 +133,18 @@
      records = []
      task_records(task_idx).delete_all unless recover
      executor_key = BSON::ObjectId.new
-     task.scope.asc(:_id).no_timeout.each do |record|
+     Hekenga::MongoidIterator.new(scope: task.scope, cursor_timeout: task.cursor_timeout).each do |record|
        records.push(record)
        next unless records.length == (task.batch_size || batch_size)
  
-       records = filter_out_processed(task, task_idx, records) if recover
+       records = filter_out_processed(task, task_idx, records)
        next unless records.length == (task.batch_size || batch_size)
  
        execute_document_task(task_idx, executor_key, records)
        records = []
        return if log.cancel
      end
-     records = filter_out_processed(task, task_idx, records) if recover
+     records = filter_out_processed(task, task_idx, records)
      execute_document_task(task_idx, executor_key, records) if records.any?
      return if log.cancel
      log_done!
data/lib/hekenga/mongoid_iterator.rb ADDED
@@ -0,0 +1,8 @@
+ require "hekenga/base_iterator"
+ module Hekenga
+   class MongoidIterator < BaseIterator
+     def each(&block)
+       iteration_scope.each(&block)
+     end
+   end
+ end
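`MongoidIterator` is the document-yielding counterpart used by `Hekenga::Migration` above; it replaces the old `scope.asc(:_id).no_timeout.each` pattern with a cursor bounded by `max_time_ms`. A small sketch (`Widget` is a stand-in model):

```ruby
Hekenga::MongoidIterator.new(scope: Widget.all, cursor_timeout: 600).each do |widget|
  # Full documents are yielded; an unfiltered scope still gets the default asc(:_id) sort.
  puts widget.id
end
```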
data/lib/hekenga/parallel_task.rb CHANGED
@@ -1,4 +1,4 @@
- require 'hekenga/iterator'
+ require 'hekenga/id_iterator'
  require 'hekenga/document_task_executor'
  require 'hekenga/task_splitter'
  
@@ -15,13 +15,13 @@ module Hekenga
  
      def start!
        clear_task_records!
-       @executor_key = BSON::ObjectId.new
+       regenerate_executor_key
        generate_for_scope(task.scope)
        check_for_completion!
      end
  
      def resume!
-       @executor_key = BSON::ObjectId.new
+       regenerate_executor_key
        task_records.set(executor_key: @executor_key)
        queue_jobs!(task_records.incomplete)
        generate_new_records!
@@ -41,16 +41,43 @@
  
      private
  
+     def regenerate_executor_key
+       @executor_key = BSON::ObjectId.new
+     end
+
      def generate_for_scope(scope)
-       Hekenga::Iterator.new(scope, size: 100_000).each do |id_block|
-         task_records = id_block.each_slice(batch_size).map do |id_slice|
-           generate_task_records!(id_slice)
-         end
+       Hekenga::IdIterator.new(
+         scope: scope,
+         cursor_timeout: task.cursor_timeout
+         # Batch Batches of IDs
+       ).each_slice(batch_size).each_slice(enqueue_size) do |id_block|
+         sanitize_id_block!(id_block)
+         task_records = id_block.reject(&:empty?).map(&method(:generate_task_record!))
          write_task_records!(task_records)
          queue_jobs!(task_records)
        end
      end
  
+     def enqueue_size
+       500 # task records written + enqueued at a time
+     end
+
+     def sanitize_id_block!(id_block)
+       return if task.scope.options.blank? && task.scope.selector.blank?
+
+       # Custom ordering on cursor with parallel updates may result in the same
+       # ID getting yielded into the migration multiple times. Detect this +
+       # remove
+       doubleups = task_records.in(ids: id_block.flatten).pluck(:ids).flatten.to_set
+       return if doubleups.empty?
+
+       id_block.each do |id_slice|
+         id_slice.reject! do |id|
+           doubleups.include?(id)
+         end
+       end
+     end
+
      def generate_new_records!
        last_record = task_records.desc(:_id).first
        last_id = last_record&.ids&.last
@@ -83,7 +110,7 @@
        migration.task_records(task_idx)
      end
  
-     def generate_task_records!(id_slice)
+     def generate_task_record!(id_slice)
        Hekenga::DocumentTaskRecord.new(
          migration_key: migration.to_key,
          task_idx: task_idx,
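The nested `each_slice(batch_size).each_slice(enqueue_size)` call in `generate_for_scope` is what the "Batch Batches of IDs" comment refers to: IDs are grouped into batches, and batches are grouped into blocks of `enqueue_size`, so task records are written and enqueued one block at a time. In plain Ruby:

```ruby
ids          = (1..12).to_a
batch_size   = 3
enqueue_size = 2

# Each id_block is an array of up to enqueue_size batches,
# each batch holding up to batch_size IDs.
ids.each_slice(batch_size).each_slice(enqueue_size) do |id_block|
  p id_block
end
# => [[1, 2, 3], [4, 5, 6]]
# => [[7, 8, 9], [10, 11, 12]]
```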
data/lib/hekenga/scaffold.rb CHANGED
@@ -48,6 +48,7 @@ module Hekenga
    #   #skip_prepare!
    #   #batch_size 25
    #   #write_strategy :update # :delete_then_insert
+   #   #cursor_timeout 86_400 # max allowed time for the cursor to survive, in seconds
    #
    #   # Called once per batch, instance variables will be accessible
    #   # in the filter, up and after blocks
data/lib/hekenga/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Hekenga
-   VERSION = "1.1.0"
+   VERSION = "2.0.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: hekenga
  version: !ruby/object:Gem::Version
-   version: 1.1.0
+   version: 2.0.0
  platform: ruby
  authors:
  - Tapio Saarinen
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2024-02-20 00:00:00.000000000 Z
+ date: 2024-07-31 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -159,6 +159,7 @@ files:
  - hekenga.gemspec
  - lib/hekenga.rb
  - lib/hekenga/base_error.rb
+ - lib/hekenga/base_iterator.rb
  - lib/hekenga/config.rb
  - lib/hekenga/context.rb
  - lib/hekenga/document_task.rb
@@ -173,12 +174,13 @@ files:
  - lib/hekenga/failure/error.rb
  - lib/hekenga/failure/validation.rb
  - lib/hekenga/failure/write.rb
+ - lib/hekenga/id_iterator.rb
  - lib/hekenga/invalid.rb
  - lib/hekenga/irreversible.rb
- - lib/hekenga/iterator.rb
  - lib/hekenga/log.rb
  - lib/hekenga/master_process.rb
  - lib/hekenga/migration.rb
+ - lib/hekenga/mongoid_iterator.rb
  - lib/hekenga/parallel_job.rb
  - lib/hekenga/parallel_task.rb
  - lib/hekenga/scaffold.rb
@@ -1,26 +0,0 @@
1
- module Hekenga
2
- class Iterator
3
- include Enumerable
4
-
5
- SMALLEST_ID = BSON::ObjectId.from_string('0'*24)
6
-
7
- attr_reader :scope, :size
8
-
9
- def initialize(scope, size:)
10
- @scope = scope
11
- @size = size
12
- end
13
-
14
- def each(&block)
15
- current_id = SMALLEST_ID
16
- base_scope = scope.asc(:_id).limit(size)
17
-
18
- loop do
19
- ids = base_scope.and(_id: {'$gt': current_id}).pluck(:_id)
20
- break if ids.empty?
21
- yield ids
22
- current_id = ids.sort.last
23
- end
24
- end
25
- end
26
- end