hekenga 1.1.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +0 -3
- data/docker-compose.yml +2 -2
- data/lib/hekenga/base_iterator.rb +24 -0
- data/lib/hekenga/document_task.rb +3 -1
- data/lib/hekenga/dsl/document_task.rb +4 -0
- data/lib/hekenga/id_iterator.rb +34 -0
- data/lib/hekenga/migration.rb +4 -3
- data/lib/hekenga/mongoid_iterator.rb +8 -0
- data/lib/hekenga/parallel_task.rb +35 -8
- data/lib/hekenga/scaffold.rb +1 -0
- data/lib/hekenga/version.rb +1 -1
- metadata +5 -3
- data/lib/hekenga/iterator.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f21c3c1cb0e45c3b9eb2627fc960d032ff981e01b239dc69c02d2aebc1f7b539
|
4
|
+
data.tar.gz: 62fbbd8a65bae8bacc537bcc40cc4a3960b795ebb19a13c34b8ebd539eb102d0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6317b298a05085564cfaaee1beef6b5749e83f8bc877538b448db66b50c3dde50f0380a5fa7f178dff7feca6840564c486f263891a838d4ed51617b7ff4f8698
|
7
|
+
data.tar.gz: 12ba1c564acca2f7d43c80a384a0767b00592abf5126d0016a0bf7cadf610e6d342890cd54597a602654e8449491135a3dda488bb7c1c8aaeb72d035bdc7c5d6
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,18 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## v2.0.0
|
4
|
+
|
5
|
+
- `Hekenga::Iterator` has been replaced by `Hekenga::IdIterator`. If any
|
6
|
+
selector or sort is set on a document task migration scope, Hekenga no longer forces an
|
7
|
+
ascending ID sort. This should help to prevent index misses, though there is a
|
8
|
+
tradeoff that documents being concurrently updated may be skipped or
|
9
|
+
processed multiple times. Hekenga tries to guard against processing a document multiple
|
10
|
+
times. Manually specifying an `asc(:_id)` on your scope will continue to
|
11
|
+
process documents in ID order.
|
12
|
+
- Document tasks now support a new option, `cursor_timeout`. This is the maximum
|
13
|
+
time allowed for iterating a document task's `scope` and queueing its jobs. The
|
14
|
+
default is one day.
|
15
|
+
|
3
16
|
## v1.1.0
|
4
17
|
|
5
18
|
- `setup` is now passed the current batch of documents so it can be used to
|
data/README.md
CHANGED
@@ -3,9 +3,6 @@
|
|
3
3
|
An attempt at a migration framework for MongoDB that supports parallel document
|
4
4
|
processing via ActiveJob, chained jobs and error recovery.
|
5
5
|
|
6
|
-
**Note that this gem is currently in pre-alpha - assume most things have a high
|
7
|
-
chance of being broken.**
|
8
|
-
|
9
6
|
## Installation
|
10
7
|
|
11
8
|
Add this line to your application's Gemfile:
|
data/docker-compose.yml
CHANGED
@@ -8,7 +8,7 @@ networks:
|
|
8
8
|
|
9
9
|
services:
|
10
10
|
mongo:
|
11
|
-
image: mongo:
|
11
|
+
image: mongo:6
|
12
12
|
command: ["--replSet", "rs0", "--bind_ip", "localhost,mongo"]
|
13
13
|
volumes:
|
14
14
|
- mongo:/data/db
|
@@ -18,7 +18,7 @@ services:
|
|
18
18
|
- hekenga-net
|
19
19
|
|
20
20
|
mongosetup:
|
21
|
-
image: mongo:
|
21
|
+
image: mongo:6
|
22
22
|
depends_on:
|
23
23
|
- mongo
|
24
24
|
restart: "no"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Hekenga
|
2
|
+
class BaseIterator
|
3
|
+
include Enumerable
|
4
|
+
DEFAULT_TIMEOUT = 86_400 # 1 day in seconds
|
5
|
+
|
6
|
+
attr_reader :cursor_timeout
|
7
|
+
|
8
|
+
def initialize(scope:, cursor_timeout: DEFAULT_TIMEOUT)
|
9
|
+
@scope = scope
|
10
|
+
@cursor_timeout = cursor_timeout
|
11
|
+
end
|
12
|
+
|
13
|
+
private
|
14
|
+
|
15
|
+
def iteration_scope
|
16
|
+
if @scope.selector.blank? && @scope.options.blank?
|
17
|
+
# Apply a default ascending _id sort, which works best
|
18
|
+
@scope.asc(:_id)
|
19
|
+
else
|
20
|
+
@scope
|
21
|
+
end.max_time_ms(cursor_timeout * 1000) # convert to ms
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -1,8 +1,9 @@
|
|
1
1
|
require 'hekenga/irreversible'
|
2
|
+
require 'hekenga/base_iterator'
|
2
3
|
module Hekenga
|
3
4
|
class DocumentTask
|
4
5
|
attr_reader :ups, :downs, :setups, :filters, :after_callbacks
|
5
|
-
attr_accessor :parallel, :scope, :timeless, :batch_size
|
6
|
+
attr_accessor :parallel, :scope, :timeless, :batch_size, :cursor_timeout
|
6
7
|
attr_accessor :description, :invalid_strategy, :skip_prepare, :write_strategy
|
7
8
|
attr_accessor :always_write, :use_transaction
|
8
9
|
|
@@ -18,6 +19,7 @@ module Hekenga
|
|
18
19
|
@batch_size = nil
|
19
20
|
@always_write = false
|
20
21
|
@use_transaction = false
|
22
|
+
@cursor_timeout = Hekenga::BaseIterator::DEFAULT_TIMEOUT
|
21
23
|
end
|
22
24
|
|
23
25
|
def validate!
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require "hekenga/base_iterator"
|
2
|
+
module Hekenga
|
3
|
+
class IdIterator < BaseIterator
|
4
|
+
DEFAULT_ID = "_id".freeze
|
5
|
+
|
6
|
+
attr_reader :id_property
|
7
|
+
|
8
|
+
def initialize(id_property: DEFAULT_ID, **kwargs)
|
9
|
+
super(**kwargs)
|
10
|
+
@id_property = id_property
|
11
|
+
end
|
12
|
+
|
13
|
+
def each
|
14
|
+
with_view do |view|
|
15
|
+
view.each do |doc|
|
16
|
+
yield doc[id_property]
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def with_view
|
24
|
+
view = iteration_scope.view
|
25
|
+
yield view
|
26
|
+
ensure
|
27
|
+
view.close_query
|
28
|
+
end
|
29
|
+
|
30
|
+
def iteration_scope
|
31
|
+
super.only(id_property)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/hekenga/migration.rb
CHANGED
@@ -2,6 +2,7 @@ require 'hekenga/invalid'
|
|
2
2
|
require 'hekenga/context'
|
3
3
|
require 'hekenga/parallel_job'
|
4
4
|
require 'hekenga/parallel_task'
|
5
|
+
require 'hekenga/mongoid_iterator'
|
5
6
|
require 'hekenga/master_process'
|
6
7
|
require 'hekenga/document_task_record'
|
7
8
|
require 'hekenga/document_task_executor'
|
@@ -132,18 +133,18 @@ module Hekenga
|
|
132
133
|
records = []
|
133
134
|
task_records(task_idx).delete_all unless recover
|
134
135
|
executor_key = BSON::ObjectId.new
|
135
|
-
task.scope.
|
136
|
+
Hekenga::MongoidIterator.new(scope: task.scope, cursor_timeout: task.cursor_timeout).each do |record|
|
136
137
|
records.push(record)
|
137
138
|
next unless records.length == (task.batch_size || batch_size)
|
138
139
|
|
139
|
-
records = filter_out_processed(task, task_idx, records)
|
140
|
+
records = filter_out_processed(task, task_idx, records)
|
140
141
|
next unless records.length == (task.batch_size || batch_size)
|
141
142
|
|
142
143
|
execute_document_task(task_idx, executor_key, records)
|
143
144
|
records = []
|
144
145
|
return if log.cancel
|
145
146
|
end
|
146
|
-
records = filter_out_processed(task, task_idx, records)
|
147
|
+
records = filter_out_processed(task, task_idx, records)
|
147
148
|
execute_document_task(task_idx, executor_key, records) if records.any?
|
148
149
|
return if log.cancel
|
149
150
|
log_done!
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require 'hekenga/
|
1
|
+
require 'hekenga/id_iterator'
|
2
2
|
require 'hekenga/document_task_executor'
|
3
3
|
require 'hekenga/task_splitter'
|
4
4
|
|
@@ -15,13 +15,13 @@ module Hekenga
|
|
15
15
|
|
16
16
|
def start!
|
17
17
|
clear_task_records!
|
18
|
-
|
18
|
+
regenerate_executor_key
|
19
19
|
generate_for_scope(task.scope)
|
20
20
|
check_for_completion!
|
21
21
|
end
|
22
22
|
|
23
23
|
def resume!
|
24
|
-
|
24
|
+
regenerate_executor_key
|
25
25
|
task_records.set(executor_key: @executor_key)
|
26
26
|
queue_jobs!(task_records.incomplete)
|
27
27
|
generate_new_records!
|
@@ -41,16 +41,43 @@ module Hekenga
|
|
41
41
|
|
42
42
|
private
|
43
43
|
|
44
|
+
def regenerate_executor_key
|
45
|
+
@executor_key = BSON::ObjectId.new
|
46
|
+
end
|
47
|
+
|
44
48
|
def generate_for_scope(scope)
|
45
|
-
Hekenga::
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
+
Hekenga::IdIterator.new(
|
50
|
+
scope: scope,
|
51
|
+
cursor_timeout: task.cursor_timeout
|
52
|
+
# Slice IDs into batches, then group those batches into enqueue-sized blocks
|
53
|
+
).each_slice(batch_size).each_slice(enqueue_size) do |id_block|
|
54
|
+
sanitize_id_block!(id_block)
|
55
|
+
task_records = id_block.reject(&:empty?).map(&method(:generate_task_record!))
|
49
56
|
write_task_records!(task_records)
|
50
57
|
queue_jobs!(task_records)
|
51
58
|
end
|
52
59
|
end
|
53
60
|
|
61
|
+
def enqueue_size
|
62
|
+
500 # task records written + enqueued at a time
|
63
|
+
end
|
64
|
+
|
65
|
+
def sanitize_id_block!(id_block)
|
66
|
+
return if task.scope.options.blank? && task.scope.selector.blank?
|
67
|
+
|
68
|
+
# Custom ordering on cursor with parallel updates may result in the same
|
69
|
+
# ID getting yielded into the migration multiple times. Detect such
|
70
|
+
# duplicates and remove them.
|
71
|
+
doubleups = task_records.in(ids: id_block.flatten).pluck(:ids).flatten.to_set
|
72
|
+
return if doubleups.empty?
|
73
|
+
|
74
|
+
id_block.each do |id_slice|
|
75
|
+
id_slice.reject! do |id|
|
76
|
+
doubleups.include?(id)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
54
81
|
def generate_new_records!
|
55
82
|
last_record = task_records.desc(:_id).first
|
56
83
|
last_id = last_record&.ids&.last
|
@@ -83,7 +110,7 @@ module Hekenga
|
|
83
110
|
migration.task_records(task_idx)
|
84
111
|
end
|
85
112
|
|
86
|
-
def
|
113
|
+
def generate_task_record!(id_slice)
|
87
114
|
Hekenga::DocumentTaskRecord.new(
|
88
115
|
migration_key: migration.to_key,
|
89
116
|
task_idx: task_idx,
|
data/lib/hekenga/scaffold.rb
CHANGED
@@ -48,6 +48,7 @@ module Hekenga
|
|
48
48
|
# #skip_prepare!
|
49
49
|
# #batch_size 25
|
50
50
|
# #write_strategy :update # :delete_then_insert
|
51
|
+
# #cursor_timeout 86_400 # max allowed time for the cursor to survive, in seconds
|
51
52
|
#
|
52
53
|
# # Called once per batch, instance variables will be accessible
|
53
54
|
# # in the filter, up and after blocks
|
data/lib/hekenga/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hekenga
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tapio Saarinen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-07-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -159,6 +159,7 @@ files:
|
|
159
159
|
- hekenga.gemspec
|
160
160
|
- lib/hekenga.rb
|
161
161
|
- lib/hekenga/base_error.rb
|
162
|
+
- lib/hekenga/base_iterator.rb
|
162
163
|
- lib/hekenga/config.rb
|
163
164
|
- lib/hekenga/context.rb
|
164
165
|
- lib/hekenga/document_task.rb
|
@@ -173,12 +174,13 @@ files:
|
|
173
174
|
- lib/hekenga/failure/error.rb
|
174
175
|
- lib/hekenga/failure/validation.rb
|
175
176
|
- lib/hekenga/failure/write.rb
|
177
|
+
- lib/hekenga/id_iterator.rb
|
176
178
|
- lib/hekenga/invalid.rb
|
177
179
|
- lib/hekenga/irreversible.rb
|
178
|
-
- lib/hekenga/iterator.rb
|
179
180
|
- lib/hekenga/log.rb
|
180
181
|
- lib/hekenga/master_process.rb
|
181
182
|
- lib/hekenga/migration.rb
|
183
|
+
- lib/hekenga/mongoid_iterator.rb
|
182
184
|
- lib/hekenga/parallel_job.rb
|
183
185
|
- lib/hekenga/parallel_task.rb
|
184
186
|
- lib/hekenga/scaffold.rb
|
data/lib/hekenga/iterator.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Hekenga
|
2
|
-
class Iterator
|
3
|
-
include Enumerable
|
4
|
-
|
5
|
-
SMALLEST_ID = BSON::ObjectId.from_string('0'*24)
|
6
|
-
|
7
|
-
attr_reader :scope, :size
|
8
|
-
|
9
|
-
def initialize(scope, size:)
|
10
|
-
@scope = scope
|
11
|
-
@size = size
|
12
|
-
end
|
13
|
-
|
14
|
-
def each(&block)
|
15
|
-
current_id = SMALLEST_ID
|
16
|
-
base_scope = scope.asc(:_id).limit(size)
|
17
|
-
|
18
|
-
loop do
|
19
|
-
ids = base_scope.and(_id: {'$gt': current_id}).pluck(:_id)
|
20
|
-
break if ids.empty?
|
21
|
-
yield ids
|
22
|
-
current_id = ids.sort.last
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|