pluck_in_batches 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +7 -3
- data/lib/pluck_in_batches/extensions.rb +13 -3
- data/lib/pluck_in_batches/iterator.rb +39 -27
- data/lib/pluck_in_batches/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2100a773ebc1bbfd8f51101298b35ddc7152dfac6810a443d89d78236de21b2
|
4
|
+
data.tar.gz: 0d4e03fb76a2d33822ab4a78ee3d43d464f4183e8485afba7a34e6ec276c2a48
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f4fb12334c280f32979eb3a3c4cac9670c39c29e4811f4e0f5bcf35951b804bf0bb2ce99604acd2b8188766436e4cbfe774d49b6a68f70c731de5d745e30c25b
|
7
|
+
data.tar.gz: 91c216a345b096137287ccec020f9a1942777d873476751e8f31b9b0f3c3dd8185c6997d49c3e03d72239b77af96f41b13f9385cf44d87770be91312b5b76273
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
## master (unreleased)
|
2
2
|
|
3
|
+
## 0.2.0 (2023-07-24)
|
4
|
+
|
5
|
+
- Support specifying a separate ordering for each cursor column when batching
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
Book.pluck_in_batches(:title, cursor_column: [:author_id, :version], order: [:asc, :desc])
|
9
|
+
```
|
10
|
+
|
11
|
+
- Add `:of` as an alias for `:batch_size` option
|
12
|
+
|
3
13
|
## 0.1.0 (2023-05-16)
|
4
14
|
|
5
15
|
- First release
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
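[![Build Status](https://github.com/fatkodima/pluck_in_batches/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/fatkodima/pluck_in_batches/actions/workflows/ci.yml)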
|
4
4
|
|
5
|
-
ActiveRecord comes with `find_each`
|
5
|
+
ActiveRecord comes with `find_each` / `find_in_batches` / `in_batches` methods to batch process records from a database.
|
6
6
|
ActiveRecord also has the `pluck` method which allows the selection of a set of fields without pulling
|
7
7
|
the entire record into memory.
|
8
8
|
|
@@ -14,7 +14,7 @@ It performs half of the number of SQL queries, allocates up to half of the memor
|
|
14
14
|
|
15
15
|
```ruby
|
16
16
|
# Before
|
17
|
-
User.in_batches do |batch|
|
17
|
+
User.in_batches do |batch| # or .find_in_batches, or .select(:email).find_each etc
|
18
18
|
emails = batch.pluck(:email)
|
19
19
|
# do something with emails
|
20
20
|
end
|
@@ -25,6 +25,8 @@ User.pluck_in_batches(:email) do |emails|
|
|
25
25
|
end
|
26
26
|
```
|
27
27
|
|
28
|
+
**Note**: You may also find [`sidekiq-iteration`](https://github.com/fatkodima/sidekiq-iteration) useful when iterating over large collections in Sidekiq jobs.
|
29
|
+
|
28
30
|
## Requirements
|
29
31
|
|
30
32
|
- Ruby 2.7+
|
@@ -94,13 +96,15 @@ end
|
|
94
96
|
Both methods support the following configuration options:
|
95
97
|
|
96
98
|
* `:batch_size` - Specifies the size of the batch. Defaults to 1000.
|
99
|
+
Also aliased as `:of`.
|
97
100
|
* `:start` - Specifies the primary key value to start from, inclusive of the value.
|
98
101
|
* `:finish` - Specifies the primary key value to end at, inclusive of the value.
|
99
102
|
* `:error_on_ignore` - Overrides the application config to specify if an error should be raised when
|
100
103
|
an order is present in the relation.
|
101
104
|
* `:cursor_column` - Specifies the column(s) on which the iteration should be done.
|
102
105
|
This column(s) should be orderable (e.g. an integer or string). Defaults to primary key.
|
103
|
-
* `:order` - Specifies the primary key order (can be `:asc` or `:desc`
|
106
|
+
* `:order` - Specifies the cursor column(s) order (can be `:asc` or `:desc` or
|
107
|
+
an array consisting of `:asc` or `:desc`). Defaults to `:asc`.
|
104
108
|
|
105
109
|
## Development
|
106
110
|
|
@@ -13,7 +13,7 @@ module PluckInBatches
|
|
13
13
|
#
|
14
14
|
# See #pluck_in_batches for all the details.
|
15
15
|
#
|
16
|
-
def pluck_each(*columns, start: nil, finish: nil,
|
16
|
+
def pluck_each(*columns, start: nil, finish: nil, of: 1000, batch_size: of, error_on_ignore: nil, order: :asc, cursor_column: primary_key, &block)
|
17
17
|
iterator = Iterator.new(self)
|
18
18
|
iterator.each(*columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order, &block)
|
19
19
|
end
|
@@ -37,13 +37,23 @@ module PluckInBatches
|
|
37
37
|
#
|
38
38
|
# ==== Options
|
39
39
|
# * <tt>:batch_size</tt> - Specifies the size of the batch. Defaults to 1000.
|
40
|
+
# * <tt>:of</tt> - Same as +:batch_size+.
|
40
41
|
# * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value.
|
41
42
|
# * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value.
|
42
43
|
# * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
|
43
44
|
# an order is present in the relation.
|
44
45
|
# * <tt>:cursor_column</tt> - Specifies the column(s) on which the iteration should be done.
|
45
46
|
# This column(s) should be orderable (e.g. an integer or string). Defaults to primary key.
|
46
|
-
# * <tt>:order</tt> - Specifies the cursor column(s) order (can be +:asc+ or +:desc+
|
47
|
+
# * <tt>:order</tt> - Specifies the cursor column(s) order (can be +:asc+ or +:desc+ or an array consisting
|
48
|
+
# of +:asc+ or +:desc+). Defaults to +:asc+.
|
49
|
+
#
|
50
|
+
# class Book < ActiveRecord::Base
|
51
|
+
# self.primary_key = [:author_id, :version]
|
52
|
+
# end
|
53
|
+
#
|
54
|
+
# Book.pluck_in_batches(:title, order: [:asc, :desc])
|
55
|
+
#
|
56
|
+
# In the above code, +author_id+ is sorted in ascending order and +version+ in descending order.
|
47
57
|
#
|
48
58
|
# Limits are honored, and if present there is no requirement for the batch
|
49
59
|
# size: it can be less than, equal to, or greater than the limit.
|
@@ -68,7 +78,7 @@ module PluckInBatches
|
|
68
78
|
# NOTE: By its nature, batch processing is subject to race conditions if
|
69
79
|
# other processes are modifying the database.
|
70
80
|
#
|
71
|
-
def pluck_in_batches(*columns, start: nil, finish: nil,
|
81
|
+
def pluck_in_batches(*columns, start: nil, finish: nil, of: 1000, batch_size: of, error_on_ignore: nil, cursor_column: primary_key, order: :asc, &block)
|
72
82
|
iterator = Iterator.new(self)
|
73
83
|
iterator.each_batch(*columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order, &block)
|
74
84
|
end
|
@@ -2,12 +2,15 @@
|
|
2
2
|
|
3
3
|
module PluckInBatches
|
4
4
|
class Iterator # :nodoc:
|
5
|
+
VALID_ORDERS = [:asc, :desc].freeze
|
6
|
+
DEFAULT_ORDER = :asc
|
7
|
+
|
5
8
|
def initialize(relation)
|
6
9
|
@relation = relation
|
7
10
|
@klass = relation.klass
|
8
11
|
end
|
9
12
|
|
10
|
-
def each(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order:
|
13
|
+
def each(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: DEFAULT_ORDER, &block)
|
11
14
|
if columns.empty?
|
12
15
|
raise ArgumentError, "Call `pluck_each' with at least one column."
|
13
16
|
end
|
@@ -18,18 +21,18 @@ module PluckInBatches
|
|
18
21
|
end
|
19
22
|
else
|
20
23
|
enum_for(__callee__, *columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order) do
|
21
|
-
apply_limits(@relation, start, finish, order).size
|
24
|
+
apply_limits(@relation, start, finish, build_batch_orders(order)).size
|
22
25
|
end
|
23
26
|
end
|
24
27
|
end
|
25
28
|
|
26
|
-
def each_batch(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order:
|
29
|
+
def each_batch(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: DEFAULT_ORDER)
|
27
30
|
if columns.empty?
|
28
31
|
raise ArgumentError, "Call `pluck_in_batches' with at least one column."
|
29
32
|
end
|
30
33
|
|
31
|
-
unless order
|
32
|
-
raise ArgumentError, ":order must be :asc or :desc, got #{order.inspect}"
|
34
|
+
unless Array(order).all? { |ord| VALID_ORDERS.include?(ord) }
|
35
|
+
raise ArgumentError, ":order must be :asc or :desc or an array consisting of :asc or :desc, got #{order.inspect}"
|
33
36
|
end
|
34
37
|
|
35
38
|
pluck_columns = columns.map(&:to_s)
|
@@ -44,10 +47,11 @@ module PluckInBatches
|
|
44
47
|
end
|
45
48
|
|
46
49
|
relation = @relation
|
50
|
+
batch_orders = build_batch_orders(cursor_columns, order)
|
47
51
|
|
48
52
|
unless block_given?
|
49
53
|
return to_enum(__callee__, *columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order) do
|
50
|
-
total = apply_limits(relation, cursor_columns, start, finish,
|
54
|
+
total = apply_limits(relation, cursor_columns, start, finish, batch_orders).size
|
51
55
|
(total - 1).div(batch_size) + 1
|
52
56
|
end
|
53
57
|
end
|
@@ -62,8 +66,8 @@ module PluckInBatches
|
|
62
66
|
batch_limit = remaining if remaining < batch_limit
|
63
67
|
end
|
64
68
|
|
65
|
-
relation = relation.reorder(
|
66
|
-
relation = apply_limits(relation, cursor_columns, start, finish,
|
69
|
+
relation = relation.reorder(batch_orders.to_h).limit(batch_limit)
|
70
|
+
relation = apply_limits(relation, cursor_columns, start, finish, batch_orders)
|
67
71
|
relation.skip_query_cache! # Retaining the results in the query cache would undermine the point of batching
|
68
72
|
batch_relation = relation
|
69
73
|
|
@@ -99,9 +103,13 @@ module PluckInBatches
|
|
99
103
|
end
|
100
104
|
end
|
101
105
|
|
102
|
-
|
103
|
-
|
104
|
-
|
106
|
+
_last_column, last_order = batch_orders.last
|
107
|
+
operators = batch_orders.map do |_column, order| # rubocop:disable Lint/ShadowingOuterLocalVariable
|
108
|
+
order == :desc ? :lteq : :gteq
|
109
|
+
end
|
110
|
+
operators[-1] = (last_order == :desc ? :lt : :gt)
|
111
|
+
|
112
|
+
batch_relation = batch_condition(relation, cursor_columns, cursor_column_offsets, operators)
|
105
113
|
end
|
106
114
|
end
|
107
115
|
|
@@ -135,29 +143,33 @@ module PluckInBatches
|
|
135
143
|
end
|
136
144
|
end
|
137
145
|
|
138
|
-
def apply_limits(relation, columns, start, finish,
|
139
|
-
relation = apply_start_limit(relation, columns, start,
|
140
|
-
relation = apply_finish_limit(relation, columns, finish,
|
146
|
+
def apply_limits(relation, columns, start, finish, batch_orders)
|
147
|
+
relation = apply_start_limit(relation, columns, start, batch_orders) if start
|
148
|
+
relation = apply_finish_limit(relation, columns, finish, batch_orders) if finish
|
141
149
|
relation
|
142
150
|
end
|
143
151
|
|
144
|
-
def apply_start_limit(relation, columns, start,
|
145
|
-
|
152
|
+
def apply_start_limit(relation, columns, start, batch_orders)
|
153
|
+
operators = batch_orders.map do |_column, order|
|
154
|
+
order == :desc ? :lteq : :gteq
|
155
|
+
end
|
156
|
+
batch_condition(relation, columns, start, operators)
|
146
157
|
end
|
147
158
|
|
148
|
-
def apply_finish_limit(relation, columns, finish,
|
149
|
-
|
159
|
+
def apply_finish_limit(relation, columns, finish, batch_orders)
|
160
|
+
operators = batch_orders.map do |_column, order|
|
161
|
+
order == :desc ? :gteq : :lteq
|
162
|
+
end
|
163
|
+
batch_condition(relation, columns, finish, operators)
|
150
164
|
end
|
151
165
|
|
152
|
-
def batch_condition(relation, columns, values,
|
153
|
-
|
154
|
-
values = Array(values)
|
155
|
-
cursor_positions = columns.zip(values)
|
166
|
+
def batch_condition(relation, columns, values, operators)
|
167
|
+
cursor_positions = Array(columns).zip(Array(values), operators)
|
156
168
|
|
157
|
-
first_clause_column, first_clause_value = cursor_positions.pop
|
169
|
+
first_clause_column, first_clause_value, operator = cursor_positions.pop
|
158
170
|
where_clause = build_attribute_predicate(first_clause_column, first_clause_value, operator)
|
159
171
|
|
160
|
-
cursor_positions.reverse_each do |column_name, value|
|
172
|
+
cursor_positions.reverse_each do |column_name, value, operator| # rubocop:disable Lint/ShadowingOuterLocalVariable
|
161
173
|
where_clause = build_attribute_predicate(column_name, value, operator == :lteq ? :lt : :gt).or(
|
162
174
|
build_attribute_predicate(column_name, value, :eq).and(where_clause)
|
163
175
|
)
|
@@ -170,9 +182,9 @@ module PluckInBatches
|
|
170
182
|
@relation.bind_attribute(column, value) { |attr, bind| attr.public_send(operator, bind) }
|
171
183
|
end
|
172
184
|
|
173
|
-
def
|
174
|
-
cursor_columns.map do |column|
|
175
|
-
|
185
|
+
def build_batch_orders(cursor_columns, order)
|
186
|
+
cursor_columns.zip(Array(order)).map do |column, ord|
|
187
|
+
[column, ord || DEFAULT_ORDER]
|
176
188
|
end
|
177
189
|
end
|
178
190
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pluck_in_batches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- fatkodima
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -60,8 +60,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
62
|
requirements: []
|
63
|
-
rubygems_version: 3.4.
|
63
|
+
rubygems_version: 3.4.6
|
64
64
|
signing_key:
|
65
65
|
specification_version: 4
|
66
|
-
summary:
|
66
|
+
summary: A faster alternative to the custom use of `in_batches` with `pluck`.
|
67
67
|
test_files: []
|