pluck_in_batches 0.1.0 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: d86f3ce009db02836e820ec434ae17fa43685651a74e85e2260675d7f2aeb945
-   data.tar.gz: e27703d5c07b89db1d75adc5331ce597c8b95669575a65596e0e141a44287412
+   metadata.gz: f2100a773ebc1bbfd8f51101298b35ddc7152dfac6810a443d89d78236de21b2
+   data.tar.gz: 0d4e03fb76a2d33822ab4a78ee3d43d464f4183e8485afba7a34e6ec276c2a48
  SHA512:
-   metadata.gz: f96ef381074b16ab8cd5fa48ca6f38245cae06ae0213fcb3c1ebb159b7c6712a441bbdf08c5ae36bb63a12bc46d04daa70c38ef2f5bd04462397bf254ab610f0
-   data.tar.gz: bd5e5382cbd444f4cc0d9b81bad56792c6a6f436cad3376064d1b1cfe46699c2b129e36b4311cedda26aec827b5a140d7f344ebeb959bb1e22594414e1577d6b
+   metadata.gz: f4fb12334c280f32979eb3a3c4cac9670c39c29e4811f4e0f5bcf35951b804bf0bb2ce99604acd2b8188766436e4cbfe774d49b6a68f70c731de5d745e30c25b
+   data.tar.gz: 91c216a345b096137287ccec020f9a1942777d873476751e8f31b9b0f3c3dd8185c6997d49c3e03d72239b77af96f41b13f9385cf44d87770be91312b5b76273
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
  ## master (unreleased)
 
+ ## 0.2.0 (2023-07-24)
+
+ - Support specifying per cursor column ordering when batching
+
+ ```ruby
+ Book.pluck_in_batches(:title, cursor_columns: [:author_id, :version], order: [:asc, :desc])
+ ```
+
+ - Add `:of` as an alias for `:batch_size` option
+
  ## 0.1.0 (2023-05-16)
 
  - First release
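For the new `:of` alias, a minimal usage sketch (the `User` model and `:email` column are illustrative, borrowed from the README example):

```ruby
# `of:` is simply another way to spell `batch_size:`; at most 500 emails per batch here.
User.pluck_in_batches(:email, of: 500) do |emails|
  # do something with up to 500 emails at a time
end
```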
data/README.md CHANGED
@@ -2,7 +2,7 @@
 
  [![Build Status](https://github.com/fatkodima/pluck_in_batches/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/fatkodima/pluck_in_batches/actions/workflows/ci.yml)
 
- ActiveRecord comes with `find_each` and `find_in_batches` methods to batch process records from a database.
+ ActiveRecord comes with `find_each` / `find_in_batches` / `in_batches` methods to batch process records from a database.
  ActiveRecord also has the `pluck` method which allows the selection of a set of fields without pulling
  the entire record into memory.
 
@@ -14,7 +14,7 @@ It performs half of the number of SQL queries, allocates up to half of the memor
 
  ```ruby
  # Before
- User.in_batches do |batch|
+ User.in_batches do |batch| # or .find_in_batches, or .select(:email).find_each etc
    emails = batch.pluck(:emails)
    # do something with emails
  end
@@ -25,6 +25,8 @@ User.pluck_in_batches(:email) do |emails|
  end
  ```
 
+ **Note**: You may also find [`sidekiq-iteration`](https://github.com/fatkodima/sidekiq-iteration) useful when iterating over large collections in Sidekiq jobs.
+
  ## Requirements
 
  - Ruby 2.7+
@@ -94,13 +96,15 @@ end
  Both methods support the following configuration options:
 
  * `:batch_size` - Specifies the size of the batch. Defaults to 1000.
+   Also aliased as `:of`.
  * `:start` - Specifies the primary key value to start from, inclusive of the value.
  * `:finish` - Specifies the primary key value to end at, inclusive of the value.
  * `:error_on_ignore` - Overrides the application config to specify if an error should be raised when
    an order is present in the relation.
  * `:cursor_column` - Specifies the column(s) on which the iteration should be done.
    This column(s) should be orderable (e.g. an integer or string). Defaults to primary key.
- * `:order` - Specifies the primary key order (can be `:asc` or `:desc`). Defaults to `:asc`.
+ * `:order` - Specifies the primary key order (can be `:asc` or `:desc` or
+   an array consisting of :asc or :desc). Defaults to `:asc`.
 
  ## Development
 
@@ -13,7 +13,7 @@ module PluckInBatches
    #
    # See #pluck_in_batches for all the details.
    #
-   def pluck_each(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, order: :asc, cursor_column: primary_key, &block)
+   def pluck_each(*columns, start: nil, finish: nil, of: 1000, batch_size: of, error_on_ignore: nil, order: :asc, cursor_column: primary_key, &block)
      iterator = Iterator.new(self)
      iterator.each(*columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order, &block)
    end
@@ -37,13 +37,23 @@ module PluckInBatches
    #
    # ==== Options
    # * <tt>:batch_size</tt> - Specifies the size of the batch. Defaults to 1000.
+   # * <tt>:of</tt> - Same as +:batch_size+.
    # * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value.
    # * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value.
    # * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
    #   an order is present in the relation.
    # * <tt>:cursor_column</tt> - Specifies the column(s) on which the iteration should be done.
    #   This column(s) should be orderable (e.g. an integer or string). Defaults to primary key.
-   # * <tt>:order</tt> - Specifies the cursor column(s) order (can be +:asc+ or +:desc+). Defaults to +:asc+.
+   # * <tt>:order</tt> - Specifies the cursor column(s) order (can be +:asc+ or +:desc+ or an array consisting
+   #   of :asc or :desc). Defaults to +:asc+.
+   #
+   #     class Book < ActiveRecord::Base
+   #       self.primary_key = [:author_id, :version]
+   #     end
+   #
+   #     Book.pluck_in_batches(:title, order: [:asc, :desc])
+   #
+   #   In the above code, +author_id+ is sorted in ascending order and +version+ in descending order.
    #
    # Limits are honored, and if present there is no requirement for the batch
    # size: it can be less than, equal to, or greater than the limit.
@@ -68,7 +78,7 @@ module PluckInBatches
    # NOTE: By its nature, batch processing is subject to race conditions if
    # other processes are modifying the database.
    #
-   def pluck_in_batches(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: primary_key, order: :asc, &block)
+   def pluck_in_batches(*columns, start: nil, finish: nil, of: 1000, batch_size: of, error_on_ignore: nil, cursor_column: primary_key, order: :asc, &block)
      iterator = Iterator.new(self)
      iterator.each_batch(*columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order, &block)
    end
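The `:of` alias in both signatures is implemented purely with keyword-argument defaults: `batch_size:` defaults to whatever `of:` was given. A standalone sketch of the pattern (hypothetical method name):

```ruby
# If only `of:` is passed, `batch_size:` picks it up via its default;
# an explicit `batch_size:` wins over `of:`.
def demo(of: 1000, batch_size: of)
  batch_size
end

demo                   # => 1000
demo(of: 200)          # => 200
demo(batch_size: 50)   # => 50
```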
@@ -2,12 +2,15 @@
 
  module PluckInBatches
    class Iterator # :nodoc:
+     VALID_ORDERS = [:asc, :desc].freeze
+     DEFAULT_ORDER = :asc
+
      def initialize(relation)
        @relation = relation
        @klass = relation.klass
      end
 
-     def each(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: :asc, &block)
+     def each(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: DEFAULT_ORDER, &block)
        if columns.empty?
          raise ArgumentError, "Call `pluck_each' with at least one column."
        end
@@ -18,18 +21,18 @@ module PluckInBatches
          end
        else
          enum_for(__callee__, *columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order) do
-           apply_limits(@relation, start, finish, order).size
+           apply_limits(@relation, start, finish, build_batch_orders(order)).size
          end
        end
      end
 
-     def each_batch(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: :asc)
+     def each_batch(*columns, start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil, cursor_column: @relation.primary_key, order: DEFAULT_ORDER)
        if columns.empty?
          raise ArgumentError, "Call `pluck_in_batches' with at least one column."
        end
 
-       unless order == :asc || order == :desc
-         raise ArgumentError, ":order must be :asc or :desc, got #{order.inspect}"
+       unless Array(order).all? { |ord| VALID_ORDERS.include?(ord) }
+         raise ArgumentError, ":order must be :asc or :desc or an array consisting of :asc or :desc, got #{order.inspect}"
        end
 
        pluck_columns = columns.map(&:to_s)
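The relaxed guard above wraps `order` in `Array()` before checking every element against `VALID_ORDERS`, so both the old single-symbol form and the new array form pass; roughly:

```ruby
VALID_ORDERS = [:asc, :desc].freeze

Array(:desc).all? { |ord| VALID_ORDERS.include?(ord) }          # => true
Array([:asc, :desc]).all? { |ord| VALID_ORDERS.include?(ord) }  # => true
Array([:asc, :up]).all? { |ord| VALID_ORDERS.include?(ord) }    # => false, so each_batch raises ArgumentError
```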
@@ -44,10 +47,11 @@ module PluckInBatches
        end
 
        relation = @relation
+       batch_orders = build_batch_orders(cursor_columns, order)
 
        unless block_given?
          return to_enum(__callee__, *columns, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore, cursor_column: cursor_column, order: order) do
-           total = apply_limits(relation, cursor_columns, start, finish, order).size
+           total = apply_limits(relation, cursor_columns, start, finish, batch_orders).size
            (total - 1).div(batch_size) + 1
          end
        end
@@ -62,8 +66,8 @@ module PluckInBatches
          batch_limit = remaining if remaining < batch_limit
        end
 
-       relation = relation.reorder(*batch_order(cursor_columns, order)).limit(batch_limit)
-       relation = apply_limits(relation, cursor_columns, start, finish, order)
+       relation = relation.reorder(batch_orders.to_h).limit(batch_limit)
+       relation = apply_limits(relation, cursor_columns, start, finish, batch_orders)
        relation.skip_query_cache! # Retaining the results in the query cache would undermine the point of batching
        batch_relation = relation
 
@@ -99,9 +103,13 @@ module PluckInBatches
          end
        end
 
-       batch_relation = batch_condition(
-         relation, cursor_columns, cursor_column_offsets, order == :desc ? :lt : :gt
-       )
+       _last_column, last_order = batch_orders.last
+       operators = batch_orders.map do |_column, order| # rubocop:disable Lint/ShadowingOuterLocalVariable
+         order == :desc ? :lteq : :gteq
+       end
+       operators[-1] = (last_order == :desc ? :lt : :gt)
+
+       batch_relation = batch_condition(relation, cursor_columns, cursor_column_offsets, operators)
      end
    end
 
@@ -135,29 +143,33 @@ module PluckInBatches
        end
      end
 
-     def apply_limits(relation, columns, start, finish, order)
-       relation = apply_start_limit(relation, columns, start, order) if start
-       relation = apply_finish_limit(relation, columns, finish, order) if finish
+     def apply_limits(relation, columns, start, finish, batch_orders)
+       relation = apply_start_limit(relation, columns, start, batch_orders) if start
+       relation = apply_finish_limit(relation, columns, finish, batch_orders) if finish
        relation
      end
 
-     def apply_start_limit(relation, columns, start, order)
-       batch_condition(relation, columns, start, order == :desc ? :lteq : :gteq)
+     def apply_start_limit(relation, columns, start, batch_orders)
+       operators = batch_orders.map do |_column, order|
+         order == :desc ? :lteq : :gteq
+       end
+       batch_condition(relation, columns, start, operators)
      end
 
-     def apply_finish_limit(relation, columns, finish, order)
-       batch_condition(relation, columns, finish, order == :desc ? :gteq : :lteq)
+     def apply_finish_limit(relation, columns, finish, batch_orders)
+       operators = batch_orders.map do |_column, order|
+         order == :desc ? :gteq : :lteq
+       end
+       batch_condition(relation, columns, finish, operators)
      end
 
-     def batch_condition(relation, columns, values, operator)
-       columns = Array(columns)
-       values = Array(values)
-       cursor_positions = columns.zip(values)
+     def batch_condition(relation, columns, values, operators)
+       cursor_positions = Array(columns).zip(Array(values), operators)
 
-       first_clause_column, first_clause_value = cursor_positions.pop
+       first_clause_column, first_clause_value, operator = cursor_positions.pop
        where_clause = build_attribute_predicate(first_clause_column, first_clause_value, operator)
 
-       cursor_positions.reverse_each do |column_name, value|
+       cursor_positions.reverse_each do |column_name, value, operator| # rubocop:disable Lint/ShadowingOuterLocalVariable
          where_clause = build_attribute_predicate(column_name, value, operator == :lteq ? :lt : :gt).or(
            build_attribute_predicate(column_name, value, :eq).and(where_clause)
          )
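To make the per-column operators concrete: for a two-column cursor ordered `[:asc, :desc]` (column names borrowed from the docstring example), the code above derives `[:gteq, :lt]`, and `batch_condition` turns that into a keyset predicate. A rough sketch of the values involved:

```ruby
# Assumed cursor: [:author_id, :version] with order [:asc, :desc].
batch_orders = [[:author_id, :asc], [:version, :desc]]

operators = batch_orders.map { |_column, order| order == :desc ? :lteq : :gteq }
operators[-1] = (batch_orders.last.last == :desc ? :lt : :gt)
operators # => [:gteq, :lt]

# batch_condition then builds, approximately, in SQL terms:
#   author_id > last_author_id
#   OR (author_id = last_author_id AND version < last_version)
```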
@@ -170,9 +182,9 @@ module PluckInBatches
        @relation.bind_attribute(column, value) { |attr, bind| attr.public_send(operator, bind) }
      end
 
-     def batch_order(cursor_columns, order)
-       cursor_columns.map do |column|
-         @relation.arel_table[column].public_send(order)
+     def build_batch_orders(cursor_columns, order)
+       cursor_columns.zip(Array(order)).map do |column, ord|
+         [column, ord || DEFAULT_ORDER]
        end
      end
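The renamed `build_batch_orders` zips the cursor columns with `Array(order)` and falls back to `DEFAULT_ORDER` for any column without an explicit direction; for illustration (hypothetical column names):

```ruby
DEFAULT_ORDER = :asc

[:author_id, :version].zip(Array(:desc)).map { |column, ord| [column, ord || DEFAULT_ORDER] }
# => [[:author_id, :desc], [:version, :asc]]

[:author_id, :version].zip(Array([:asc, :desc])).map { |column, ord| [column, ord || DEFAULT_ORDER] }
# => [[:author_id, :asc], [:version, :desc]]
```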
178
190
 
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module PluckInBatches
-   VERSION = "0.1.0"
+   VERSION = "0.2.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: pluck_in_batches
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.2.0
  platform: ruby
  authors:
  - fatkodima
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2023-05-16 00:00:00.000000000 Z
+ date: 2023-07-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: activerecord
@@ -60,8 +60,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
    - !ruby/object:Gem::Version
      version: '0'
  requirements: []
- rubygems_version: 3.4.12
+ rubygems_version: 3.4.6
  signing_key:
  specification_version: 4
- summary: Change
+ summary: A faster alternative to the custom use of `in_batches` with `pluck`.
  test_files: []