pluck_in_batches 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +7 -3
- data/lib/pluck_in_batches/extensions.rb +13 -3
- data/lib/pluck_in_batches/iterator.rb +39 -27
- data/lib/pluck_in_batches/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2100a773ebc1bbfd8f51101298b35ddc7152dfac6810a443d89d78236de21b2
|
4
|
+
data.tar.gz: 0d4e03fb76a2d33822ab4a78ee3d43d464f4183e8485afba7a34e6ec276c2a48
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f4fb12334c280f32979eb3a3c4cac9670c39c29e4811f4e0f5bcf35951b804bf0bb2ce99604acd2b8188766436e4cbfe774d49b6a68f70c731de5d745e30c25b
|
7
|
+
data.tar.gz: 91c216a345b096137287ccec020f9a1942777d873476751e8f31b9b0f3c3dd8185c6997d49c3e03d72239b77af96f41b13f9385cf44d87770be91312b5b76273
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
## master (unreleased)
|
2
2
|
|
3
|
+
## 0.2.0 (2023-07-24)
|
4
|
+
|
5
|
+
- Support specifying per cursor column ordering when batching
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
Book.pluck_in_batches(:title, cursor_column: [:author_id, :version], order: [:asc, :desc])
|
9
|
+
```
|
10
|
+
|
11
|
+
- Add `:of` as an alias for `:batch_size` option
|
12
|
+
|
3
13
|
## 0.1.0 (2023-05-16)
|
4
14
|
|
5
15
|
- First release
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://github.com/fatkodima/pluck_in_batches/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/fatkodima/pluck_in_batches/actions/workflows/ci.yml)
|
4
4
|
|
5
|
-
ActiveRecord comes with `find_each`
|
5
|
+
ActiveRecord comes with `find_each` / `find_in_batches` / `in_batches` methods to batch process records from a database.
|
6
6
|
ActiveRecord also has the `pluck` method which allows the selection of a set of fields without pulling
|
7
7
|
the entire record into memory.
|
8
8
|
|
@@ -14,7 +14,7 @@ It performs half of the number of SQL queries, allocates up to half of the memor
|
|
14
14
|
|
15
15
|
```ruby
|
16
16
|
# Before
|
17
|
-
User.in_batches do |batch|
|
17
|
+
User.in_batches do |batch| # or .find_in_batches, or .select(:email).find_each etc
|
18
18
|
emails = batch.pluck(:email)
|
19
19
|
# do something with emails
|
20
20
|
end
|
@@ -25,6 +25,8 @@ User.pluck_in_batches(:email) do |emails|
|
|
25
25
|
end
|
26
26
|
```
|
27
27
|
|
28
|
+
**Note**: You may also find [`sidekiq-iteration`](https://github.com/fatkodima/sidekiq-iteration) useful when iterating over large collections in Sidekiq jobs.
|
29
|
+
|
28
30
|
## Requirements
|
29
31
|
|
30
32
|
- Ruby 2.7+
|
@@ -94,13 +96,15 @@ end
|
|
94
96
|
Both methods support the following configuration options:
|
95
97
|
|
96
98
|
* `:batch_size` - Specifies the size of the batch. Defaults to 1000.
|
99
|
+
Also aliased as `:of`.
|
97
100
|
* `:start` - Specifies the primary key value to start from, inclusive of the value.
|
98
101
|
* `:finish` - Specifies the primary key value to end at, inclusive of the value.
|
99
102
|
* `:error_on_ignore` - Overrides the application config to specify if an error should be raised when
|
100
103
|
an order is present in the relation.
|
101
104
|
* `:cursor_column` - Specifies the column(s) on which the iteration should be done.
|
102
105
|
This column(s) should be orderable (e.g. an integer or string). Defaults to primary key.
|
103
|
-
* `:order` - Specifies the primary key order (can be `:asc` or `:desc`
|
106
|
+
* `:order` - Specifies the cursor column(s) order (can be `:asc` or `:desc` or
|
107
|
+
an array consisting of `:asc` or `:desc`). Defaults to `:asc`.
|
104
108
|
|
105
109
|
## Development
|
106
110
|
|
@@ -13,7 +13,7 @@ module PluckInBatches
|
|
13
13
|
#
|
14
14
|
# See #pluck_in_batches for all the details.
|
15
15
|
#
|
16
|
-
def pluck_each(*columns, start: nil, finish: nil,
|
16
|
+
def pluck_each(*columns, start: nil, finish: nil, of: 1000, batch_size: of, error_on_ignore: nil, order: :asc, cursor_column: primary_key, &block)
|
17
17
|
iterator = Iterator.new(self)
|
18
18
|
iterator.each(*columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order, &block)
|
19
19
|
end
|
@@ -37,13 +37,23 @@ module PluckInBatches
|
|
37
37
|
#
|
38
38
|
# ==== Options
|
39
39
|
# * <tt>:batch_size</tt> - Specifies the size of the batch. Defaults to 1000.
|
40
|
+
# * <tt>:of</tt> - Same as +:batch_size+.
|
40
41
|
# * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value.
|
41
42
|
# * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value.
|
42
43
|
# * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
|
43
44
|
# an order is present in the relation.
|
44
45
|
# * <tt>:cursor_column</tt> - Specifies the column(s) on which the iteration should be done.
|
45
46
|
# This column(s) should be orderable (e.g. an integer or string). Defaults to primary key.
|
46
|
-
# * <tt>:order</tt> - Specifies the cursor column(s) order (can be +:asc+ or +:desc+
|
47
|
+
# * <tt>:order</tt> - Specifies the cursor column(s) order (can be +:asc+ or +:desc+ or an array consisting
|
48
|
+
# of +:asc+ or +:desc+). Defaults to +:asc+.
|
49
|
+
#
|
50
|
+
# class Book < ActiveRecord::Base
|
51
|
+
# self.primary_key = [:author_id, :version]
|
52
|
+
# end
|
53
|
+
#
|
54
|
+
# Book.pluck_in_batches(:title, order: [:asc, :desc])
|
55
|
+
#
|
56
|
+
# In the above code, +author_id+ is sorted in ascending order and +version+ in descending order.
|
47
57
|
#
|
48
58
|
# Limits are honored, and if present there is no requirement for the batch
|
49
59
|
# size: it can be less than, equal to, or greater than the limit.
|
@@ -68,7 +78,7 @@ module PluckInBatches
|
|
68
78
|
# NOTE: By its nature, batch processing is subject to race conditions if
|
69
79
|
# other processes are modifying the database.
|
70
80
|
#
|
71
|
-
def pluck_in_batches(*columns, start: nil, finish: nil,
|
81
|
+
def pluck_in_batches(*columns, start: nil, finish: nil, of: 1000, batch_size: of, error_on_ignore: nil, cursor_column: primary_key, order: :asc, &block)
|
72
82
|
iterator = Iterator.new(self)
|
73
83
|
iterator.each_batch(*columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order, &block)
|
74
84
|
end
|
@@ -2,12 +2,15 @@
|
|
2
2
|
|
3
3
|
module PluckInBatches
|
4
4
|
class Iterator # :nodoc:
|
5
|
+
VALID_ORDERS = [:asc, :desc].freeze
|
6
|
+
DEFAULT_ORDER = :asc
|
7
|
+
|
5
8
|
def initialize(relation)
|
6
9
|
@relation = relation
|
7
10
|
@klass = relation.klass
|
8
11
|
end
|
9
12
|
|
10
|
-
def each(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order:
|
13
|
+
def each(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: DEFAULT_ORDER, &block)
|
11
14
|
if columns.empty?
|
12
15
|
raise ArgumentError, "Call `pluck_each' with at least one column."
|
13
16
|
end
|
@@ -18,18 +21,18 @@ module PluckInBatches
|
|
18
21
|
end
|
19
22
|
else
|
20
23
|
enum_for(__callee__, *columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order) do
|
21
|
-
apply_limits(@relation, start, finish, order).size
|
24
|
+
apply_limits(@relation, start, finish, build_batch_orders(order)).size
|
22
25
|
end
|
23
26
|
end
|
24
27
|
end
|
25
28
|
|
26
|
-
def each_batch(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order:
|
29
|
+
def each_batch(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: DEFAULT_ORDER)
|
27
30
|
if columns.empty?
|
28
31
|
raise ArgumentError, "Call `pluck_in_batches' with at least one column."
|
29
32
|
end
|
30
33
|
|
31
|
-
unless order
|
32
|
-
raise ArgumentError, ":order must be :asc or :desc, got #{order.inspect}"
|
34
|
+
unless Array(order).all? { |ord| VALID_ORDERS.include?(ord) }
|
35
|
+
raise ArgumentError, ":order must be :asc or :desc or an array consisting of :asc or :desc, got #{order.inspect}"
|
33
36
|
end
|
34
37
|
|
35
38
|
pluck_columns = columns.map(&:to_s)
|
@@ -44,10 +47,11 @@ module PluckInBatches
|
|
44
47
|
end
|
45
48
|
|
46
49
|
relation = @relation
|
50
|
+
batch_orders = build_batch_orders(cursor_columns, order)
|
47
51
|
|
48
52
|
unless block_given?
|
49
53
|
return to_enum(__callee__, *columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order) do
|
50
|
-
total = apply_limits(relation, cursor_columns, start, finish,
|
54
|
+
total = apply_limits(relation, cursor_columns, start, finish, batch_orders).size
|
51
55
|
(total - 1).div(batch_size) + 1
|
52
56
|
end
|
53
57
|
end
|
@@ -62,8 +66,8 @@ module PluckInBatches
|
|
62
66
|
batch_limit = remaining if remaining < batch_limit
|
63
67
|
end
|
64
68
|
|
65
|
-
relation = relation.reorder(
|
66
|
-
relation = apply_limits(relation, cursor_columns, start, finish,
|
69
|
+
relation = relation.reorder(batch_orders.to_h).limit(batch_limit)
|
70
|
+
relation = apply_limits(relation, cursor_columns, start, finish, batch_orders)
|
67
71
|
relation.skip_query_cache! # Retaining the results in the query cache would undermine the point of batching
|
68
72
|
batch_relation = relation
|
69
73
|
|
@@ -99,9 +103,13 @@ module PluckInBatches
|
|
99
103
|
end
|
100
104
|
end
|
101
105
|
|
102
|
-
|
103
|
-
|
104
|
-
|
106
|
+
_last_column, last_order = batch_orders.last
|
107
|
+
operators = batch_orders.map do |_column, order| # rubocop:disable Lint/ShadowingOuterLocalVariable
|
108
|
+
order == :desc ? :lteq : :gteq
|
109
|
+
end
|
110
|
+
operators[-1] = (last_order == :desc ? :lt : :gt)
|
111
|
+
|
112
|
+
batch_relation = batch_condition(relation, cursor_columns, cursor_column_offsets, operators)
|
105
113
|
end
|
106
114
|
end
|
107
115
|
|
@@ -135,29 +143,33 @@ module PluckInBatches
|
|
135
143
|
end
|
136
144
|
end
|
137
145
|
|
138
|
-
def apply_limits(relation, columns, start, finish,
|
139
|
-
relation = apply_start_limit(relation, columns, start,
|
140
|
-
relation = apply_finish_limit(relation, columns, finish,
|
146
|
+
def apply_limits(relation, columns, start, finish, batch_orders)
|
147
|
+
relation = apply_start_limit(relation, columns, start, batch_orders) if start
|
148
|
+
relation = apply_finish_limit(relation, columns, finish, batch_orders) if finish
|
141
149
|
relation
|
142
150
|
end
|
143
151
|
|
144
|
-
def apply_start_limit(relation, columns, start,
|
145
|
-
|
152
|
+
def apply_start_limit(relation, columns, start, batch_orders)
|
153
|
+
operators = batch_orders.map do |_column, order|
|
154
|
+
order == :desc ? :lteq : :gteq
|
155
|
+
end
|
156
|
+
batch_condition(relation, columns, start, operators)
|
146
157
|
end
|
147
158
|
|
148
|
-
def apply_finish_limit(relation, columns, finish,
|
149
|
-
|
159
|
+
def apply_finish_limit(relation, columns, finish, batch_orders)
|
160
|
+
operators = batch_orders.map do |_column, order|
|
161
|
+
order == :desc ? :gteq : :lteq
|
162
|
+
end
|
163
|
+
batch_condition(relation, columns, finish, operators)
|
150
164
|
end
|
151
165
|
|
152
|
-
def batch_condition(relation, columns, values,
|
153
|
-
|
154
|
-
values = Array(values)
|
155
|
-
cursor_positions = columns.zip(values)
|
166
|
+
def batch_condition(relation, columns, values, operators)
|
167
|
+
cursor_positions = Array(columns).zip(Array(values), operators)
|
156
168
|
|
157
|
-
first_clause_column, first_clause_value = cursor_positions.pop
|
169
|
+
first_clause_column, first_clause_value, operator = cursor_positions.pop
|
158
170
|
where_clause = build_attribute_predicate(first_clause_column, first_clause_value, operator)
|
159
171
|
|
160
|
-
cursor_positions.reverse_each do |column_name, value|
|
172
|
+
cursor_positions.reverse_each do |column_name, value, operator| # rubocop:disable Lint/ShadowingOuterLocalVariable
|
161
173
|
where_clause = build_attribute_predicate(column_name, value, operator == :lteq ? :lt : :gt).or(
|
162
174
|
build_attribute_predicate(column_name, value, :eq).and(where_clause)
|
163
175
|
)
|
@@ -170,9 +182,9 @@ module PluckInBatches
|
|
170
182
|
@relation.bind_attribute(column, value) { |attr, bind| attr.public_send(operator, bind) }
|
171
183
|
end
|
172
184
|
|
173
|
-
def
|
174
|
-
cursor_columns.map do |column|
|
175
|
-
|
185
|
+
def build_batch_orders(cursor_columns, order)
|
186
|
+
cursor_columns.zip(Array(order)).map do |column, ord|
|
187
|
+
[column, ord || DEFAULT_ORDER]
|
176
188
|
end
|
177
189
|
end
|
178
190
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pluck_in_batches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- fatkodima
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -60,8 +60,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
62
|
requirements: []
|
63
|
-
rubygems_version: 3.4.
|
63
|
+
rubygems_version: 3.4.6
|
64
64
|
signing_key:
|
65
65
|
specification_version: 4
|
66
|
-
summary:
|
66
|
+
summary: A faster alternative to the custom use of `in_batches` with `pluck`.
|
67
67
|
test_files: []
|