hekenga 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +0 -3
- data/docker-compose.yml +2 -2
- data/lib/hekenga/base_iterator.rb +24 -0
- data/lib/hekenga/document_task.rb +3 -1
- data/lib/hekenga/dsl/document_task.rb +4 -0
- data/lib/hekenga/id_iterator.rb +34 -0
- data/lib/hekenga/migration.rb +4 -3
- data/lib/hekenga/mongoid_iterator.rb +8 -0
- data/lib/hekenga/parallel_task.rb +35 -8
- data/lib/hekenga/scaffold.rb +1 -0
- data/lib/hekenga/version.rb +1 -1
- metadata +5 -3
- data/lib/hekenga/iterator.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f21c3c1cb0e45c3b9eb2627fc960d032ff981e01b239dc69c02d2aebc1f7b539
|
4
|
+
data.tar.gz: 62fbbd8a65bae8bacc537bcc40cc4a3960b795ebb19a13c34b8ebd539eb102d0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6317b298a05085564cfaaee1beef6b5749e83f8bc877538b448db66b50c3dde50f0380a5fa7f178dff7feca6840564c486f263891a838d4ed51617b7ff4f8698
|
7
|
+
data.tar.gz: 12ba1c564acca2f7d43c80a384a0767b00592abf5126d0016a0bf7cadf610e6d342890cd54597a602654e8449491135a3dda488bb7c1c8aaeb72d035bdc7c5d6
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,18 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## v2.0.0
|
4
|
+
|
5
|
+
- `Hekenga::Iterator` has been replaced by `Hekenga::IdIterator`. If any
|
6
|
+
selector or sort is set on a document task migration scope, it no longer forces an
|
7
|
+
ascending ID sort. This should help to prevent index misses, though there is a
|
8
|
+
tradeoff that documents being concurrently updated may be skipped or
|
9
|
+
processed multiple times. Hekenga tries to guard against processing multiple
|
10
|
+
times. Manually specifying an `asc(:_id)` on your scope will continue to
|
11
|
+
process documents in ID order.
|
12
|
+
- Document tasks now support a new option, `cursor_timeout`. This is the maximum
|
13
|
+
time a document task's `scope` can be iterated and queue jobs within. The
|
14
|
+
default is one day.
|
15
|
+
|
3
16
|
## v1.1.0
|
4
17
|
|
5
18
|
- `setup` is now passed the current batch of documents so it can be used to
|
data/README.md
CHANGED
@@ -3,9 +3,6 @@
|
|
3
3
|
An attempt at a migration framework for MongoDB that supports parallel document
|
4
4
|
processing via ActiveJob, chained jobs and error recovery.
|
5
5
|
|
6
|
-
**Note that this gem is currently in pre-alpha - assume most things have a high
|
7
|
-
chance of being broken.**
|
8
|
-
|
9
6
|
## Installation
|
10
7
|
|
11
8
|
Add this line to your application's Gemfile:
|
data/docker-compose.yml
CHANGED
@@ -8,7 +8,7 @@ networks:
|
|
8
8
|
|
9
9
|
services:
|
10
10
|
mongo:
|
11
|
-
image: mongo:
|
11
|
+
image: mongo:6
|
12
12
|
command: ["--replSet", "rs0", "--bind_ip", "localhost,mongo"]
|
13
13
|
volumes:
|
14
14
|
- mongo:/data/db
|
@@ -18,7 +18,7 @@ services:
|
|
18
18
|
- hekenga-net
|
19
19
|
|
20
20
|
mongosetup:
|
21
|
-
image: mongo:
|
21
|
+
image: mongo:6
|
22
22
|
depends_on:
|
23
23
|
- mongo
|
24
24
|
restart: "no"
|
data/lib/hekenga/base_iterator.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Hekenga
|
2
|
+
class BaseIterator
|
3
|
+
include Enumerable
|
4
|
+
DEFAULT_TIMEOUT = 86_400 # 1 day in seconds
|
5
|
+
|
6
|
+
attr_reader :cursor_timeout
|
7
|
+
|
8
|
+
def initialize(scope:, cursor_timeout: DEFAULT_TIMEOUT)
|
9
|
+
@scope = scope
|
10
|
+
@cursor_timeout = cursor_timeout
|
11
|
+
end
|
12
|
+
|
13
|
+
private
|
14
|
+
|
15
|
+
def iteration_scope
|
16
|
+
if @scope.selector.blank? && @scope.options.blank?
|
17
|
+
# Apply a default _id sort, it works the best
|
18
|
+
@scope.asc(:_id)
|
19
|
+
else
|
20
|
+
@scope
|
21
|
+
end.max_time_ms(cursor_timeout * 1000) # convert to ms
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/hekenga/document_task.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
require 'hekenga/irreversible'
|
2
|
+
require 'hekenga/base_iterator'
|
2
3
|
module Hekenga
|
3
4
|
class DocumentTask
|
4
5
|
attr_reader :ups, :downs, :setups, :filters, :after_callbacks
|
5
|
-
attr_accessor :parallel, :scope, :timeless, :batch_size
|
6
|
+
attr_accessor :parallel, :scope, :timeless, :batch_size, :cursor_timeout
|
6
7
|
attr_accessor :description, :invalid_strategy, :skip_prepare, :write_strategy
|
7
8
|
attr_accessor :always_write, :use_transaction
|
8
9
|
|
@@ -18,6 +19,7 @@ module Hekenga
|
|
18
19
|
@batch_size = nil
|
19
20
|
@always_write = false
|
20
21
|
@use_transaction = false
|
22
|
+
@cursor_timeout = Hekenga::BaseIterator::DEFAULT_TIMEOUT
|
21
23
|
end
|
22
24
|
|
23
25
|
def validate!
|
data/lib/hekenga/id_iterator.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require "hekenga/base_iterator"
|
2
|
+
module Hekenga
|
3
|
+
class IdIterator < BaseIterator
|
4
|
+
DEFAULT_ID = "_id".freeze
|
5
|
+
|
6
|
+
attr_reader :id_property
|
7
|
+
|
8
|
+
def initialize(id_property: DEFAULT_ID, **kwargs)
|
9
|
+
super(**kwargs)
|
10
|
+
@id_property = id_property
|
11
|
+
end
|
12
|
+
|
13
|
+
def each
|
14
|
+
with_view do |view|
|
15
|
+
view.each do |doc|
|
16
|
+
yield doc[id_property]
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def with_view
|
24
|
+
view = iteration_scope.view
|
25
|
+
yield view
|
26
|
+
ensure
|
27
|
+
view.close_query
|
28
|
+
end
|
29
|
+
|
30
|
+
def iteration_scope
|
31
|
+
super.only(id_property)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/hekenga/migration.rb
CHANGED
@@ -2,6 +2,7 @@ require 'hekenga/invalid'
|
|
2
2
|
require 'hekenga/context'
|
3
3
|
require 'hekenga/parallel_job'
|
4
4
|
require 'hekenga/parallel_task'
|
5
|
+
require 'hekenga/mongoid_iterator'
|
5
6
|
require 'hekenga/master_process'
|
6
7
|
require 'hekenga/document_task_record'
|
7
8
|
require 'hekenga/document_task_executor'
|
@@ -132,18 +133,18 @@ module Hekenga
|
|
132
133
|
records = []
|
133
134
|
task_records(task_idx).delete_all unless recover
|
134
135
|
executor_key = BSON::ObjectId.new
|
135
|
-
task.scope.
|
136
|
+
Hekenga::MongoidIterator.new(scope: task.scope, cursor_timeout: task.cursor_timeout).each do |record|
|
136
137
|
records.push(record)
|
137
138
|
next unless records.length == (task.batch_size || batch_size)
|
138
139
|
|
139
|
-
records = filter_out_processed(task, task_idx, records)
|
140
|
+
records = filter_out_processed(task, task_idx, records)
|
140
141
|
next unless records.length == (task.batch_size || batch_size)
|
141
142
|
|
142
143
|
execute_document_task(task_idx, executor_key, records)
|
143
144
|
records = []
|
144
145
|
return if log.cancel
|
145
146
|
end
|
146
|
-
records = filter_out_processed(task, task_idx, records)
|
147
|
+
records = filter_out_processed(task, task_idx, records)
|
147
148
|
execute_document_task(task_idx, executor_key, records) if records.any?
|
148
149
|
return if log.cancel
|
149
150
|
log_done!
|
data/lib/hekenga/parallel_task.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'hekenga/iterator'
|
1
|
+
require 'hekenga/id_iterator'
|
2
2
|
require 'hekenga/document_task_executor'
|
3
3
|
require 'hekenga/task_splitter'
|
4
4
|
|
@@ -15,13 +15,13 @@ module Hekenga
|
|
15
15
|
|
16
16
|
def start!
|
17
17
|
clear_task_records!
|
18
|
-
|
18
|
+
regenerate_executor_key
|
19
19
|
generate_for_scope(task.scope)
|
20
20
|
check_for_completion!
|
21
21
|
end
|
22
22
|
|
23
23
|
def resume!
|
24
|
-
|
24
|
+
regenerate_executor_key
|
25
25
|
task_records.set(executor_key: @executor_key)
|
26
26
|
queue_jobs!(task_records.incomplete)
|
27
27
|
generate_new_records!
|
@@ -41,16 +41,43 @@ module Hekenga
|
|
41
41
|
|
42
42
|
private
|
43
43
|
|
44
|
+
def regenerate_executor_key
|
45
|
+
@executor_key = BSON::ObjectId.new
|
46
|
+
end
|
47
|
+
|
44
48
|
def generate_for_scope(scope)
|
45
|
-
Hekenga::
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
+
Hekenga::IdIterator.new(
|
50
|
+
scope: scope,
|
51
|
+
cursor_timeout: task.cursor_timeout
|
52
|
+
# Batch Batches of IDs
|
53
|
+
).each_slice(batch_size).each_slice(enqueue_size) do |id_block|
|
54
|
+
sanitize_id_block!(id_block)
|
55
|
+
task_records = id_block.reject(&:empty?).map(&method(:generate_task_record!))
|
49
56
|
write_task_records!(task_records)
|
50
57
|
queue_jobs!(task_records)
|
51
58
|
end
|
52
59
|
end
|
53
60
|
|
61
|
+
def enqueue_size
|
62
|
+
500 # task records written + enqueued at a time
|
63
|
+
end
|
64
|
+
|
65
|
+
def sanitize_id_block!(id_block)
|
66
|
+
return if task.scope.options.blank? && task.scope.selector.blank?
|
67
|
+
|
68
|
+
# Custom ordering on cursor with parallel updates may result in the same
|
69
|
+
# ID getting yielded into the migration multiple times. Detect this +
|
70
|
+
# remove
|
71
|
+
doubleups = task_records.in(ids: id_block.flatten).pluck(:ids).flatten.to_set
|
72
|
+
return if doubleups.empty?
|
73
|
+
|
74
|
+
id_block.each do |id_slice|
|
75
|
+
id_slice.reject! do |id|
|
76
|
+
doubleups.include?(id)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
54
81
|
def generate_new_records!
|
55
82
|
last_record = task_records.desc(:_id).first
|
56
83
|
last_id = last_record&.ids&.last
|
@@ -83,7 +110,7 @@ module Hekenga
|
|
83
110
|
migration.task_records(task_idx)
|
84
111
|
end
|
85
112
|
|
86
|
-
def
|
113
|
+
def generate_task_record!(id_slice)
|
87
114
|
Hekenga::DocumentTaskRecord.new(
|
88
115
|
migration_key: migration.to_key,
|
89
116
|
task_idx: task_idx,
|
data/lib/hekenga/scaffold.rb
CHANGED
@@ -48,6 +48,7 @@ module Hekenga
|
|
48
48
|
# #skip_prepare!
|
49
49
|
# #batch_size 25
|
50
50
|
# #write_strategy :update # :delete_then_insert
|
51
|
+
# #cursor_timeout 86_400 # max allowed time for the cursor to survive, in seconds
|
51
52
|
#
|
52
53
|
# # Called once per batch, instance variables will be accessible
|
53
54
|
# # in the filter, up and after blocks
|
data/lib/hekenga/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hekenga
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tapio Saarinen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-07-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -159,6 +159,7 @@ files:
|
|
159
159
|
- hekenga.gemspec
|
160
160
|
- lib/hekenga.rb
|
161
161
|
- lib/hekenga/base_error.rb
|
162
|
+
- lib/hekenga/base_iterator.rb
|
162
163
|
- lib/hekenga/config.rb
|
163
164
|
- lib/hekenga/context.rb
|
164
165
|
- lib/hekenga/document_task.rb
|
@@ -173,12 +174,13 @@ files:
|
|
173
174
|
- lib/hekenga/failure/error.rb
|
174
175
|
- lib/hekenga/failure/validation.rb
|
175
176
|
- lib/hekenga/failure/write.rb
|
177
|
+
- lib/hekenga/id_iterator.rb
|
176
178
|
- lib/hekenga/invalid.rb
|
177
179
|
- lib/hekenga/irreversible.rb
|
178
|
-
- lib/hekenga/iterator.rb
|
179
180
|
- lib/hekenga/log.rb
|
180
181
|
- lib/hekenga/master_process.rb
|
181
182
|
- lib/hekenga/migration.rb
|
183
|
+
- lib/hekenga/mongoid_iterator.rb
|
182
184
|
- lib/hekenga/parallel_job.rb
|
183
185
|
- lib/hekenga/parallel_task.rb
|
184
186
|
- lib/hekenga/scaffold.rb
|
data/lib/hekenga/iterator.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Hekenga
|
2
|
-
class Iterator
|
3
|
-
include Enumerable
|
4
|
-
|
5
|
-
SMALLEST_ID = BSON::ObjectId.from_string('0'*24)
|
6
|
-
|
7
|
-
attr_reader :scope, :size
|
8
|
-
|
9
|
-
def initialize(scope, size:)
|
10
|
-
@scope = scope
|
11
|
-
@size = size
|
12
|
-
end
|
13
|
-
|
14
|
-
def each(&block)
|
15
|
-
current_id = SMALLEST_ID
|
16
|
-
base_scope = scope.asc(:_id).limit(size)
|
17
|
-
|
18
|
-
loop do
|
19
|
-
ids = base_scope.and(_id: {'$gt': current_id}).pluck(:_id)
|
20
|
-
break if ids.empty?
|
21
|
-
yield ids
|
22
|
-
current_id = ids.sort.last
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|