job-iteration 1.3.6 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +42 -22
- data/.github/workflows/cla.yml +22 -0
- data/.rubocop.yml +3 -3
- data/CHANGELOG.md +23 -1
- data/Gemfile +0 -1
- data/Gemfile.lock +64 -67
- data/README.md +26 -8
- data/dev.yml +2 -2
- data/guides/argument-semantics.md +128 -0
- data/guides/best-practices.md +72 -32
- data/guides/custom-enumerator.md +76 -28
- data/guides/iteration-how-it-works.md +2 -18
- data/{railgun.yml → isogun.yml} +0 -4
- data/lib/job-iteration/active_record_batch_enumerator.rb +1 -1
- data/lib/job-iteration/active_record_cursor.rb +6 -3
- data/lib/job-iteration/active_record_enumerator.rb +5 -1
- data/lib/job-iteration/csv_enumerator.rb +1 -1
- data/lib/job-iteration/enumerator_builder.rb +47 -9
- data/lib/job-iteration/iteration.rb +64 -35
- data/lib/job-iteration/log_subscriber.rb +38 -0
- data/lib/job-iteration/nested_enumerator.rb +48 -0
- data/lib/job-iteration/version.rb +1 -1
- data/lib/job-iteration.rb +25 -0
- metadata +8 -4
data/guides/best-practices.md
CHANGED
@@ -1,20 +1,67 @@
|
|
1
1
|
# Best practices
|
2
2
|
|
3
|
-
##
|
3
|
+
## Batch iteration
|
4
4
|
|
5
|
-
|
5
|
+
Regardless of the active record enumerator used in the task, the `job-iteration` gem loads records in batches of 100 (by default).
|
6
|
+
The following two tasks produce equivalent database queries,
|
7
|
+
however `RecordsJob` task allows for more frequent interruptions by doing just one thing in the `each_iteration` method.
|
6
8
|
|
7
9
|
```ruby
|
8
|
-
#
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
# bad
|
11
|
+
class BatchesJob < ApplicationJob
|
12
|
+
include JobIteration::Iteration
|
13
|
+
|
14
|
+
def build_enumerator(product_id, cursor:)
|
15
|
+
enumerator_builder.active_record_on_batches(
|
16
|
+
Comment.where(product_id: product_id),
|
17
|
+
cursor: cursor,
|
18
|
+
batch_size: 5,
|
19
|
+
)
|
20
|
+
end
|
21
|
+
|
22
|
+
def each_iteration(batch_of_comments, product_id)
|
23
|
+
batch_of_comments.each(&:destroy)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# good
|
28
|
+
class RecordsJob < ApplicationJob
|
29
|
+
include JobIteration::Iteration
|
30
|
+
|
31
|
+
def build_enumerator(product_id, cursor:)
|
32
|
+
enumerator_builder.active_record_on_records(
|
33
|
+
Comment.where(product_id: product_id),
|
34
|
+
cursor: cursor,
|
35
|
+
batch_size: 5,
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
def each_iteration(comment, product_id)
|
40
|
+
comment.destroy
|
41
|
+
end
|
15
42
|
end
|
43
|
+
```
|
16
44
|
|
17
|
-
|
45
|
+
## Instrumentation
|
46
|
+
|
47
|
+
Iteration leverages [`ActiveSupport::Notifications`](https://guides.rubyonrails.org/active_support_instrumentation.html)
|
48
|
+
to notify you what it's doing. You can subscribe to the following events (listed in order of job lifecycle):
|
49
|
+
|
50
|
+
- `build_enumerator.iteration`
|
51
|
+
- `throttled.iteration` (when using ThrottleEnumerator)
|
52
|
+
- `nil_enumerator.iteration`
|
53
|
+
- `resumed.iteration`
|
54
|
+
- `each_iteration.iteration`
|
55
|
+
- `not_found.iteration`
|
56
|
+
- `interrupted.iteration`
|
57
|
+
- `completed.iteration`
|
58
|
+
|
59
|
+
All events have tags including the job class name and cursor position; some add the number of times interrupted and/or
|
60
|
+
total time the job spent running across interruptions.
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
# config/initializers/instrumentation.rb
|
64
|
+
ActiveSupport::Notifications.monotonic_subscribe("each_iteration.iteration") do |_, started, finished, _, tags|
|
18
65
|
elapsed = finished - started
|
19
66
|
StatsD.distribution(
|
20
67
|
"iteration.each_iteration",
|
@@ -27,28 +74,6 @@ ActiveSupport::Notifications.subscribe('each_iteration.iteration') do |_, starte
|
|
27
74
|
"each_iteration runtime exceeded limit of #{BackgroundQueue.max_iteration_runtime}s"
|
28
75
|
end
|
29
76
|
end
|
30
|
-
|
31
|
-
ActiveSupport::Notifications.subscribe('resumed.iteration') do |_, _, _, _, tags|
|
32
|
-
StatsD.increment(
|
33
|
-
"iteration.resumed",
|
34
|
-
tags: { job_class: tags[:job_class]&.underscore }
|
35
|
-
)
|
36
|
-
end
|
37
|
-
|
38
|
-
ActiveSupport::Notifications.subscribe('interrupted.iteration') do |_, _, _, _, tags|
|
39
|
-
StatsD.increment(
|
40
|
-
"iteration.interrupted",
|
41
|
-
tags: { job_class: tags[:job_class]&.underscore }
|
42
|
-
)
|
43
|
-
end
|
44
|
-
|
45
|
-
# If you're using ThrottleEnumerator
|
46
|
-
ActiveSupport::Notifications.subscribe('throttled.iteration') do |_, _, _, _, tags|
|
47
|
-
StatsD.increment(
|
48
|
-
"iteration.throttled",
|
49
|
-
tags: { job_class: tags[:job_class]&.underscore }
|
50
|
-
)
|
51
|
-
end
|
52
77
|
```
|
53
78
|
|
54
79
|
## Max iteration time
|
@@ -66,3 +91,18 @@ JobIteration.max_job_runtime = 5.minutes # nil by default
|
|
66
91
|
```
|
67
92
|
|
68
93
|
Use this accessor to tweak how often you'd like the job to interrupt itself.
|
94
|
+
|
95
|
+
### Per job max job runtime
|
96
|
+
|
97
|
+
For more granular control, `job_iteration_max_job_runtime` can be set **per-job class**. This allows both incremental adoption and the combination of a conservative global setting with a more aggressive setting on a per-job basis.
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
class MyJob < ApplicationJob
|
101
|
+
include JobIteration::Iteration
|
102
|
+
|
103
|
+
self.job_iteration_max_job_runtime = 3.minutes
|
104
|
+
|
105
|
+
# ...
|
106
|
+
```
|
107
|
+
|
108
|
+
This setting will be inherited by any child classes, although it can be further overridden. Note that no class can **increase** the `max_job_runtime` it has inherited; it can only be **decreased**. No job can increase its `max_job_runtime` beyond the global limit.
|
data/guides/custom-enumerator.md
CHANGED
@@ -1,38 +1,34 @@
|
|
1
|
-
Iteration leverages the [Enumerator](
|
1
|
+
Iteration leverages the [Enumerator](https://ruby-doc.org/3.2.1/Enumerator.html) pattern from the Ruby standard library,
|
2
|
+
which allows us to use almost any resource as a collection to iterate.
|
2
3
|
|
3
|
-
|
4
|
+
Before writing an enumerator, it is important to understand [how Iteration works](iteration-how-it-works.md) and how
|
5
|
+
your enumerator will be used by it. An enumerator must `yield` two things in the following order as positional
|
6
|
+
arguments:
|
7
|
+
- An object to be processed in a job `each_iteration` method
|
8
|
+
- A cursor position, which Iteration will persist if `each_iteration` returns successfully and the job is forced to shut
|
9
|
+
down. It can be any data type your job backend can serialize and deserialize correctly.
|
4
10
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
def build_enumerator(*)
|
10
|
-
@redis = Redis.new
|
11
|
-
Enumerator.new do |yielder|
|
12
|
-
yielder.yield @redis.lpop(key), nil
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
def each_iteration(item_from_redis)
|
17
|
-
# ...
|
18
|
-
end
|
19
|
-
end
|
20
|
-
```
|
11
|
+
A job that includes Iteration is first started with `nil` as the cursor. When resuming an interrupted job, Iteration
|
12
|
+
will deserialize the persisted cursor and pass it to the job's `build_enumerator` method, which your enumerator uses to
|
13
|
+
find objects that come _after_ the last successfully processed object. The [array enumerator](https://github.com/Shopify/job-iteration/blob/v1.3.6/lib/job-iteration/enumerator_builder.rb#L50-L67)
|
14
|
+
is a simple example which uses the array index as the cursor position.
|
21
15
|
|
22
|
-
|
16
|
+
For a more complex example, consider this Enumerator that wraps a third party API (Stripe) for paginated iteration and
|
17
|
+
stores a string as the cursor position:
|
23
18
|
|
24
19
|
```ruby
|
25
20
|
class StripeListEnumerator
|
21
|
+
# @see https://stripe.com/docs/api/pagination
|
26
22
|
# @param resource [Stripe::APIResource] The type of Stripe object to request
|
27
23
|
# @param params [Hash] Query parameters for the request
|
28
24
|
# @param options [Hash] Request options, such as API key or version
|
29
|
-
# @param cursor [String]
|
25
|
+
# @param cursor [nil, String] The Stripe ID of the last item iterated over
|
30
26
|
def initialize(resource, params: {}, options: {}, cursor:)
|
31
27
|
pagination_params = {}
|
32
28
|
pagination_params[:starting_after] = cursor unless cursor.nil?
|
33
29
|
|
30
|
+
# The following line makes a request, consider adding your rate limiter here.
|
34
31
|
@list = resource.public_send(:list, params.merge(pagination_params), options)
|
35
|
-
.auto_paging_each.lazy
|
36
32
|
end
|
37
33
|
|
38
34
|
def to_enumerator
|
@@ -45,27 +41,75 @@ class StripeListEnumerator
|
|
45
41
|
# as the cursor on the job. This allows us to properly set the
|
46
42
|
# `starting_after` parameter for the API request when resuming.
|
47
43
|
def each
|
48
|
-
|
49
|
-
|
44
|
+
loop do
|
45
|
+
@list.each do |item, _index|
|
46
|
+
# The first argument is what gets passed to `each_iteration`.
|
47
|
+
# The second argument (item.id) is going to be persisted as the cursor,
|
48
|
+
# it doesn't get passed to `each_iteration`.
|
49
|
+
yield item, item.id
|
50
|
+
end
|
51
|
+
|
52
|
+
# The following line makes a request, consider adding your rate limiter here.
|
53
|
+
@list = @list.next_page
|
54
|
+
|
55
|
+
break if @list.empty?
|
50
56
|
end
|
51
57
|
end
|
52
58
|
end
|
53
59
|
```
|
54
60
|
|
61
|
+
Here we leverage the Stripe cursor pagination where the cursor is an ID of a specific item in the collection. The job
|
62
|
+
which uses such an `Enumerator` would then look like so:
|
63
|
+
|
55
64
|
```ruby
|
56
|
-
class
|
65
|
+
class LoadRefundsForChargeJob < ActiveJob::Base
|
57
66
|
include JobIteration::Iteration
|
58
67
|
|
59
|
-
|
68
|
+
# If you added your own rate limiting above, handle it here. For example:
|
69
|
+
# retry_on(MyRateLimiter::LimitExceededError, wait: 30.seconds, attempts: :unlimited)
|
70
|
+
# Use an exponential back-off strategy when Stripe's API returns errors.
|
71
|
+
|
72
|
+
def build_enumerator(charge_id, cursor:)
|
60
73
|
StripeListEnumerator.new(
|
61
74
|
Stripe::Refund,
|
62
|
-
params: { charge: "
|
75
|
+
params: { charge: charge_id}, # "charge_id" will be a prefixed Stripe ID such as "chrg_123"
|
63
76
|
options: { api_key: "sk_test_123", stripe_version: "2018-01-18" },
|
64
77
|
cursor: cursor
|
65
78
|
).to_enumerator
|
66
79
|
end
|
67
80
|
|
68
|
-
|
81
|
+
# Note that in this case `each_iteration` will only receive one positional argument per iteration.
|
82
|
+
# If what your enumerator yields is a composite object you will need to unpack it yourself
|
83
|
+
# inside the `each_iteration`.
|
84
|
+
def each_iteration(stripe_refund, charge_id)
|
85
|
+
# ...
|
86
|
+
end
|
87
|
+
end
|
88
|
+
```
|
89
|
+
|
90
|
+
and you initiate the job with
|
91
|
+
|
92
|
+
```ruby
|
93
|
+
LoadRefundsForChargeJob.perform_later(_charge_id = "chrg_345")
|
94
|
+
```
|
95
|
+
|
96
|
+
Sometimes you can ignore the cursor. Consider the following custom Enumerator that takes items from a Redis list, which
|
97
|
+
is essentially a queue. Even if this job doesn't need to persist a cursor in order to resume, it can still use
|
98
|
+
Iteration's signal handling to finish `each_iteration` and gracefully terminate.
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
class RedisPopListJob < ActiveJob::Base
|
102
|
+
include JobIteration::Iteration
|
103
|
+
|
104
|
+
# @see https://redis.io/commands/lpop/
|
105
|
+
def build_enumerator(*)
|
106
|
+
@redis = Redis.new
|
107
|
+
Enumerator.new do |yielder|
|
108
|
+
yielder.yield @redis.lpop(key), nil
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
def each_iteration(item_from_redis)
|
69
113
|
# ...
|
70
114
|
end
|
71
115
|
end
|
@@ -73,4 +117,8 @@ end
|
|
73
117
|
|
74
118
|
We recommend that you read the implementation of the other enumerators that come with the library (`CsvEnumerator`, `ActiveRecordEnumerator`) to gain a better understanding of building Enumerator objects.
|
75
119
|
|
76
|
-
Code that is written after the `yield` in a custom enumerator is not guaranteed to execute. In the case that a job is
|
120
|
+
Code that is written after the `yield` in a custom enumerator is not guaranteed to execute. In the case that a job is
|
121
|
+
forced to exit (i.e. `job_should_exit?` is true), the job is re-enqueued during the yield and the rest of the code in
|
122
|
+
the enumerator does not run. You can follow that logic
|
123
|
+
[here](https://github.com/Shopify/job-iteration/blob/v1.3.6/lib/job-iteration/iteration.rb#L161-L165) and
|
124
|
+
[here](https://github.com/Shopify/job-iteration/blob/v1.3.6/lib/job-iteration/iteration.rb#L131-L143)
|
@@ -34,22 +34,6 @@ Further reading: [Sidekiq signals](https://github.com/mperham/sidekiq/wiki/Signa
|
|
34
34
|
|
35
35
|
In the early versions of Iteration, `build_enumerator` used to return ActiveRecord relations directly, and we would infer the Enumerator based on the type of object. We used to support ActiveRecord relations, arrays and CSVs. This made it hard to add support for other types of enumerations, and it was easy for developers to make mistakes and return an array of ActiveRecord objects, and for us to then treat that as an array instead of as an ActiveRecord relation.
|
36
36
|
|
37
|
-
The current version of Iteration supports _any_ Enumerator. We expose helpers to build enumerators conveniently (`enumerator_builder.active_record_on_records`), but it's up
|
38
|
-
|
39
|
-
```ruby
|
40
|
-
class MyJob < ActiveJob::Base
|
41
|
-
include JobIteration::Iteration
|
42
|
-
|
43
|
-
def build_enumerator(cursor:)
|
44
|
-
Enumerator.new do
|
45
|
-
Redis.lpop("mylist") # or: Kafka.poll(timeout: 10.seconds)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def each_iteration(element_from_redis)
|
50
|
-
# ...
|
51
|
-
end
|
52
|
-
end
|
53
|
-
```
|
37
|
+
The current version of Iteration supports _any_ Enumerator. We expose helpers to build common enumerators conveniently (`enumerator_builder.active_record_on_records`), but it's up to a developer to implement [a custom Enumerator](custom-enumerator.md).
|
54
38
|
|
55
|
-
Further reading: [ruby-doc](
|
39
|
+
Further reading: [ruby-doc](https://ruby-doc.org/3.2.1/Enumerator.html), [a great post about Enumerators](http://blog.arkency.com/2014/01/ruby-to-enum-for-enumerator/).
|
data/{railgun.yml → isogun.yml}
RENAMED
@@ -26,7 +26,7 @@ module JobIteration
|
|
26
26
|
end
|
27
27
|
|
28
28
|
if relation.arel.orders.present? || relation.arel.taken.present?
|
29
|
-
raise ConditionNotSupportedError
|
29
|
+
raise JobIteration::ActiveRecordCursor::ConditionNotSupportedError
|
30
30
|
end
|
31
31
|
|
32
32
|
@base_relation = relation.reorder(@columns.join(","))
|
@@ -19,8 +19,11 @@ module JobIteration
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def initialize(relation, columns = nil, position = nil)
|
22
|
-
columns
|
23
|
-
|
22
|
+
@columns = if columns
|
23
|
+
Array(columns)
|
24
|
+
else
|
25
|
+
Array(relation.primary_key).map { |pk| "#{relation.table_name}.#{pk}" }
|
26
|
+
end
|
24
27
|
self.position = Array.wrap(position)
|
25
28
|
raise ArgumentError, "Must specify at least one column" if columns.empty?
|
26
29
|
if relation.joins_values.present? && !@columns.all? { |column| column.to_s.include?(".") }
|
@@ -57,7 +60,7 @@ module JobIteration
|
|
57
60
|
end
|
58
61
|
|
59
62
|
def next_batch(batch_size)
|
60
|
-
return
|
63
|
+
return if @reached_end
|
61
64
|
|
62
65
|
relation = @base_relation.limit(batch_size)
|
63
66
|
|
@@ -10,7 +10,11 @@ module JobIteration
|
|
10
10
|
def initialize(relation, columns: nil, batch_size: 100, cursor: nil)
|
11
11
|
@relation = relation
|
12
12
|
@batch_size = batch_size
|
13
|
-
@columns =
|
13
|
+
@columns = if columns
|
14
|
+
Array(columns)
|
15
|
+
else
|
16
|
+
Array(relation.primary_key).map { |pk| "#{relation.table_name}.#{pk}" }
|
17
|
+
end
|
14
18
|
@cursor = cursor
|
15
19
|
end
|
16
20
|
|
@@ -4,6 +4,7 @@ require_relative "./active_record_batch_enumerator"
|
|
4
4
|
require_relative "./active_record_enumerator"
|
5
5
|
require_relative "./csv_enumerator"
|
6
6
|
require_relative "./throttle_enumerator"
|
7
|
+
require_relative "./nested_enumerator"
|
7
8
|
require "forwardable"
|
8
9
|
|
9
10
|
module JobIteration
|
@@ -19,10 +20,12 @@ module JobIteration
|
|
19
20
|
# compatibility with raw calls to EnumeratorBuilder. Think of these wrappers
|
20
21
|
# the way you should a middleware.
|
21
22
|
class Wrapper < Enumerator
|
22
|
-
|
23
|
-
|
24
|
-
enum.
|
25
|
-
|
23
|
+
class << self
|
24
|
+
def wrap(_builder, enum)
|
25
|
+
new(-> { enum.size }) do |yielder|
|
26
|
+
enum.each do |*val|
|
27
|
+
yielder.yield(*val)
|
28
|
+
end
|
26
29
|
end
|
27
30
|
end
|
28
31
|
end
|
@@ -99,7 +102,7 @@ module JobIteration
|
|
99
102
|
enum = build_active_record_enumerator(
|
100
103
|
scope,
|
101
104
|
cursor: cursor,
|
102
|
-
**args
|
105
|
+
**args,
|
103
106
|
).records
|
104
107
|
wrap(self, enum)
|
105
108
|
end
|
@@ -114,7 +117,7 @@ module JobIteration
|
|
114
117
|
enum = build_active_record_enumerator(
|
115
118
|
scope,
|
116
119
|
cursor: cursor,
|
117
|
-
**args
|
120
|
+
**args,
|
118
121
|
).batches
|
119
122
|
wrap(self, enum)
|
120
123
|
end
|
@@ -125,7 +128,7 @@ module JobIteration
|
|
125
128
|
enum = JobIteration::ActiveRecordBatchEnumerator.new(
|
126
129
|
scope,
|
127
130
|
cursor: cursor,
|
128
|
-
**args
|
131
|
+
**args,
|
129
132
|
).each
|
130
133
|
enum = wrap(self, enum) if wrap
|
131
134
|
enum
|
@@ -136,7 +139,7 @@ module JobIteration
|
|
136
139
|
enum,
|
137
140
|
@job,
|
138
141
|
throttle_on: throttle_on,
|
139
|
-
backoff: backoff
|
142
|
+
backoff: backoff,
|
140
143
|
).to_enum
|
141
144
|
end
|
142
145
|
|
@@ -144,6 +147,40 @@ module JobIteration
|
|
144
147
|
CsvEnumerator.new(enumerable).rows(cursor: cursor)
|
145
148
|
end
|
146
149
|
|
150
|
+
# Builds Enumerator for nested iteration.
|
151
|
+
#
|
152
|
+
# @param enums [Array<Proc>] an Array of Procs, each should return an Enumerator.
|
153
|
+
# Each proc from enums should accept the yielded items from the parent enumerators
|
154
|
+
# and the `cursor` as its arguments.
|
155
|
+
# Each proc's `cursor` argument is its part from the `build_enumerator`'s `cursor` array.
|
156
|
+
# @param cursor [Array<Object>] array of offsets for each of the enums to start iteration from
|
157
|
+
#
|
158
|
+
# @example
|
159
|
+
# def build_enumerator(cursor:)
|
160
|
+
# enumerator_builder.nested(
|
161
|
+
# [
|
162
|
+
# ->(cursor) {
|
163
|
+
# enumerator_builder.active_record_on_records(Shop.all, cursor: cursor)
|
164
|
+
# },
|
165
|
+
# ->(shop, cursor) {
|
166
|
+
# enumerator_builder.active_record_on_records(shop.products, cursor: cursor)
|
167
|
+
# },
|
168
|
+
# ->(_shop, product, cursor) {
|
169
|
+
# enumerator_builder.active_record_on_batch_relations(product.product_variants, cursor: cursor)
|
170
|
+
# }
|
171
|
+
# ],
|
172
|
+
# cursor: cursor
|
173
|
+
# )
|
174
|
+
# end
|
175
|
+
#
|
176
|
+
# def each_iteration(product_variants_relation)
|
177
|
+
# # do something
|
178
|
+
# end
|
179
|
+
#
|
180
|
+
def build_nested_enumerator(enums, cursor:)
|
181
|
+
NestedEnumerator.new(enums, cursor: cursor).each
|
182
|
+
end
|
183
|
+
|
147
184
|
alias_method :once, :build_once_enumerator
|
148
185
|
alias_method :times, :build_times_enumerator
|
149
186
|
alias_method :array, :build_array_enumerator
|
@@ -152,6 +189,7 @@ module JobIteration
|
|
152
189
|
alias_method :active_record_on_batch_relations, :build_active_record_enumerator_on_batch_relations
|
153
190
|
alias_method :throttle, :build_throttle_enumerator
|
154
191
|
alias_method :csv, :build_csv_enumerator
|
192
|
+
alias_method :nested, :build_nested_enumerator
|
155
193
|
|
156
194
|
private
|
157
195
|
|
@@ -163,7 +201,7 @@ module JobIteration
|
|
163
201
|
JobIteration::ActiveRecordEnumerator.new(
|
164
202
|
scope,
|
165
203
|
cursor: cursor,
|
166
|
-
**args
|
204
|
+
**args,
|
167
205
|
)
|
168
206
|
end
|
169
207
|
end
|