sidekiq-iteration 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
module SidekiqIteration
  # Batch Enumerator based on ActiveRecord Relation.
  #
  # Walks the relation in +batch_size+ steps ordered by +columns+. Each step plucks
  # the cursor columns (and the primary key) of the next rows, remembers the values
  # of the last row as the new cursor and yields a relation restricted to the primary
  # keys of that batch, together with the cursor value.
  # @private
  class ActiveRecordBatchEnumerator
    include Enumerable

    # strftime format applied to Time values before they are stored in the cursor.
    SQL_DATETIME_WITH_NSEC = "%Y-%m-%d %H:%M:%S.%N"

    # @param relation [ActiveRecord::Relation] relation to iterate; must not carry ORDER BY or LIMIT
    # @param columns [String, Symbol, Array<String, Symbol>, nil] column(s) that define the iteration
    #   order and the cursor; defaults to the fully-qualified primary key
    # @param batch_size [Integer] maximum number of rows per yielded relation
    # @param cursor [Object, Array, nil] value(s) of +columns+ to resume after
    # @raise [ArgumentError] when no column is given, when a joined relation is used with
    #   unqualified columns, or when the relation already has ORDER BY / LIMIT
    def initialize(relation, columns: nil, batch_size: 100, cursor: nil)
      @primary_key = "#{relation.table_name}.#{relation.primary_key}"
      # Array() accepts a single column name as well as a list, matching what
      # ActiveRecordEnumerator accepts; only nil falls back to the primary key.
      @columns = columns.nil? ? [@primary_key] : Array(columns).map(&:to_s)
      raise ArgumentError, "Must specify at least one column" if @columns.empty?

      if relation.joins_values.present? && !@columns.all?(/\./)
        raise ArgumentError, "You need to specify fully-qualified columns if you join a table"
      end

      if relation.arel.orders.present? || relation.arel.taken.present?
        raise ArgumentError,
          "The relation cannot use ORDER BY or LIMIT due to the way how iteration with a cursor is designed. " \
          "You can use other ways to limit the number of rows, e.g. a WHERE condition on the primary key column."
      end

      @primary_key_index = @columns.index(@primary_key) || @columns.index(relation.primary_key)
      # The primary key is always plucked because it is needed to build the yielded relation.
      @pluck_columns = @primary_key_index ? @columns : @columns + [@primary_key]
      @batch_size = batch_size
      @cursor = Array.wrap(cursor)
      @initial_cursor = @cursor
      @base_relation = relation.reorder(@columns.join(", "))
    end

    # Yields each batch as +(relation, cursor_value)+.
    # Without a block, returns an Enumerator whose lazy size is {#size}.
    def each
      return to_enum { size } unless block_given?

      while (relation = next_batch)
        yield relation, cursor_value
      end
    end

    # @return [Integer] number of batches needed to cover the whole base relation
    def size
      (@base_relation.count(:all) + @batch_size - 1) / @batch_size # ceiling division
    end

    private

    # Loads the next batch and advances the cursor.
    #
    # @return [ActiveRecord::Relation, nil] relation matching the batch's primary keys,
    #   or nil once no further rows are found (the cursor is then reset to its initial value)
    def next_batch
      relation = @base_relation.limit(@batch_size)
      where_args = conditions
      relation = relation.where(*where_args) if where_args.any?

      cursor_values, ids = relation.uncached { pluck_columns(relation) }

      cursor = cursor_values.last
      unless cursor.present?
        @cursor = @initial_cursor
        return
      end
      # The primary key was plucked, but original cursor did not include it, so we should remove it
      cursor.pop unless @primary_key_index
      @cursor = Array.wrap(cursor)

      # Yields relations by selecting the primary keys of records in the batch.
      # Post.where(published: nil) results in an enumerator of relations like:
      #   Post.where(published: nil, ids: batch_of_ids)
      @base_relation.where(@primary_key => ids)
    end

    # Plucks the cursor columns for +relation+.
    #
    # @return [Array(Array, Array)] the (serialized) cursor values per row and the primary keys per row
    def pluck_columns(relation)
      column_values = relation.pluck(*@pluck_columns)
      # Only the primary key was plucked: the flat list is both the cursor values and the ids.
      return [column_values, column_values] if @pluck_columns.size == 1

      primary_key_index = @primary_key_index || -1 # appended as the last column when not requested
      primary_key_values = column_values.map { |values| values[primary_key_index] }

      serialize_column_values!(column_values)
      [column_values, primary_key_values]
    end

    # @return [Object, Array] the bare value for a one-element cursor, the whole array otherwise
    def cursor_value
      @cursor.size == 1 ? @cursor.first : @cursor
    end

    # Builds the arguments for +where+ that select the rows following the current cursor.
    #
    # @return [Array] SQL fragment followed by its bind values, or an empty array without a cursor
    def conditions
      return [] if @cursor.empty?

      column_index = @cursor.size - 1
      column = @columns[column_index]
      # A cursor covering every column is strictly exceeded; a shorter cursor is matched inclusively.
      where_clause = @columns.size == @cursor.size ? "#{column} > ?" : "#{column} >= ?"

      while column_index > 0
        column_index -= 1
        column = @columns[column_index]
        where_clause = "#{column} > ? OR (#{column} = ? AND (#{where_clause}))"
      end

      # Every column binds its value twice except the innermost one, which has a single placeholder.
      bind_values = @cursor.flat_map { |value| [value, value] }
      bind_values.pop
      [where_clause, *bind_values]
    end

    # Replaces, in place, every plucked value with its cursor representation.
    def serialize_column_values!(column_values)
      column_values.map! { |values| values.map! { |value| column_value(value) } }
    end

    # Formats Time values with SQL_DATETIME_WITH_NSEC; other values are returned unchanged.
    def column_value(value)
      value.is_a?(Time) ? value.strftime(SQL_DATETIME_WITH_NSEC) : value
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
module SidekiqIteration
  # Tracks a position inside an ordered relation and loads the records that follow it.
  # Cursors are comparable: one that reached the end sorts after one that did not,
  # otherwise positions are compared.
  # @private
  class ActiveRecordCursor
    include Comparable

    attr_reader :position, :reached_end

    # @param relation [ActiveRecord::Relation] relation to walk; must not carry ORDER BY or LIMIT
    # @param columns [String, Symbol, Array, nil] ordering columns, defaults to the qualified primary key
    # @param position [Object, Array, nil] values of +columns+ to start after
    # @raise [ArgumentError] on empty columns, nil position values, unqualified columns on a
    #   joined relation, or a relation that already has ORDER BY / LIMIT
    def initialize(relation, columns = nil, position = nil)
      @columns = Array.wrap(columns || "#{relation.table_name}.#{relation.primary_key}")
      raise ArgumentError, "Must specify at least one column" if @columns.empty?

      self.position = Array.wrap(position)

      joined = relation.joins_values.present?
      if joined && !@columns.all?(/\./)
        raise ArgumentError, "You need to specify fully-qualified columns if you join a table"
      end

      arel = relation.arel
      if arel.orders.present? || arel.taken.present?
        raise ArgumentError,
          "The relation cannot use ORDER BY or LIMIT due to the way how iteration with a cursor is designed. " \
          "You can use other ways to limit the number of rows, e.g. a WHERE condition on the primary key column."
      end

      @base_relation = relation.reorder(@columns.join(", "))
      @reached_end = false
    end

    # Orders by position when both cursors agree on +reached_end+; otherwise the
    # cursor that reached the end is the greater one.
    def <=>(other)
      return position <=> other.position if reached_end == other.reached_end

      reached_end ? 1 : -1
    end

    # @param position [Array] new position values
    # @raise [ArgumentError] if any value is nil
    def position=(position)
      if position.any?(&:nil?)
        raise ArgumentError, "Cursor position cannot contain nil values"
      end

      @position = position
    end

    # Loads up to +batch_size+ records after the current position and moves the
    # position to the last loaded record.
    #
    # @return [Array, nil] the loaded records, or nil when the end was already reached
    #   or nothing was found
    def next_batch(batch_size)
      return if @reached_end

      scope = @base_relation.limit(batch_size)
      filter = conditions
      scope = scope.where(*filter) if filter.any?

      records = scope.uncached { scope.to_a }
      found = records.any?

      update_from_record(records.last) if found
      # A short batch means there is nothing left to fetch afterwards.
      @reached_end = records.size < batch_size

      found ? records : nil
    end

    private

    # Arguments for +where+ selecting the rows after the current position:
    # the SQL fragment followed by its bind values, or an empty array without a position.
    def conditions
      return [] if @position.empty?

      last_index = @position.size - 1
      innermost = @columns[last_index]
      # A position covering every column is strictly exceeded; a shorter one is matched inclusively.
      operator = @columns.size == @position.size ? ">" : ">="
      sql = "#{innermost} #{operator} ?"

      (last_index - 1).downto(0) do |index|
        outer = @columns[index]
        sql = "#{outer} > ? OR (#{outer} = ? AND (#{sql}))"
      end

      # Each value is bound twice, except the innermost one which has a single placeholder.
      binds = @position.flat_map { |value| [value, value] }
      binds.pop
      [sql, *binds]
    end

    # Reads the cursor columns from +record+ (table prefix stripped) into the position.
    def update_from_record(record)
      self.position = @columns.map { |column| record.send(column.to_s.split(".").last) }
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "active_record_cursor"
4
+
5
module SidekiqIteration
  # Builds Enumerator based on ActiveRecord Relation. Supports enumerating on rows and batches.
  # @private
  class ActiveRecordEnumerator
    # strftime format applied to datetime column values before they are used as cursor values.
    SQL_DATETIME_WITH_NSEC = "%Y-%m-%d %H:%M:%S.%N"

    # @param relation [ActiveRecord::Relation] relation to iterate
    # @param columns [String, Symbol, Array<String, Symbol>, nil] ordering/cursor column(s),
    #   defaults to the fully-qualified primary key
    # @param batch_size [Integer] number of records requested from the cursor per batch
    # @param cursor [Object, Array, nil] position to resume after
    # @raise [ArgumentError] if +relation+ is not an ActiveRecord::Relation
    def initialize(relation, columns: nil, batch_size: 100, cursor: nil)
      unless relation.is_a?(ActiveRecord::Relation)
        raise ArgumentError, "relation must be an ActiveRecord::Relation"
      end

      @relation = relation
      @batch_size = batch_size
      @columns = Array(columns || "#{relation.table_name}.#{relation.primary_key}")
      @cursor = cursor
    end

    # @return [Enumerator] yields +(record, cursor_value)+ for every record;
    #   its lazy size is the number of records in the relation
    def records
      Enumerator.new(-> { size }) do |yielder|
        batches.each do |batch, _|
          batch.each do |record|
            yielder.yield(record, cursor_value(record))
          end
        end
      end
    end

    # @return [Enumerator] yields +(records, cursor_value_of_last_record)+ for every batch;
    #   its lazy size is the number of batches (not the number of records)
    def batches
      cursor = ActiveRecordCursor.new(@relation, @columns, @cursor)
      Enumerator.new(-> { batches_size }) do |yielder|
        while (records = cursor.next_batch(@batch_size))
          yielder.yield(records, cursor_value(records.last)) if records.any?
        end
      end
    end

    # @return [Integer] number of records in the relation
    def size
      @relation.count(:all)
    end

    private

    # Number of batches needed to cover {#size} records.
    def batches_size
      (size + @batch_size - 1) / @batch_size # ceiling division
    end

    # Cursor value for +record+: the bare value for a single column, an array otherwise.
    def cursor_value(record)
      positions = @columns.map do |column|
        attribute_name = column.to_s.split(".").last # drop the table prefix
        column_value(record, attribute_name)
      end

      positions.size == 1 ? positions.first : positions
    end

    # Reads +attribute+ from +record+, formatting datetime columns with SQL_DATETIME_WITH_NSEC.
    def column_value(record, attribute)
      value = record.read_attribute(attribute.to_sym)
      case record.class.columns_hash.fetch(attribute).type
      when :datetime
        value.strftime(SQL_DATETIME_WITH_NSEC)
      else
        value
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
module SidekiqIteration
  # @private
  # CsvEnumerator makes it possible to write an Iteration job
  # that uses CSV file as a collection to iterate.
  #
  # @example Enumerator to iterate on rows
  #   def build_enumerator(cursor:)
  #     csv = CSV.open('tmp/files', converters: :integer, headers: true)
  #     csv_enumerator(csv, cursor: cursor)
  #   end
  #
  #   def each_iteration(row)
  #     ...
  #   end
  #
  # @example Enumerator to iterate on batches of rows
  #   def build_enumerator(cursor:)
  #     csv = CSV.open('tmp/files', converters: :integer, headers: true)
  #     csv_batches_enumerator(csv, cursor: cursor)
  #   end
  #
  #   def each_iteration(batch_of_rows)
  #     ...
  #   end
  #
  class CsvEnumerator
    # Constructs CsvEnumerator instance based on a CSV file.
    #
    # @param [CSV] csv An instance of CSV object
    # @return [SidekiqIteration::CsvEnumerator]
    # @raise [ArgumentError] if +csv+ is not exactly a CSV instance
    #
    # @example
    #   csv = CSV.open('tmp/files', converters: :integer, headers: true)
    #   SidekiqIteration::CsvEnumerator.new(csv).rows(cursor: cursor)
    #
    def initialize(csv)
      unless csv.instance_of?(CSV)
        raise ArgumentError, "CsvEnumerator.new takes CSV object"
      end

      @csv = csv
    end

    # Constructs a enumerator on CSV rows
    #
    # @param cursor [Integer, nil] index of the last processed row; rows up to and including it are skipped
    # @return [Enumerator] Enumerator instance yielding +[row, index]+
    #
    def rows(cursor:)
      @csv.lazy
        .each_with_index
        .drop(count_of_processed_rows(cursor))
        .to_enum { count_of_rows_in_file }
    end

    # Constructs a enumerator on batches of CSV rows
    #
    # @param cursor [Integer, nil] index of the last processed batch; batches up to and including it are skipped
    # @param batch_size [Integer] number of rows per batch
    # @return [Enumerator] Enumerator instance yielding +[rows, batch_index]+
    #
    def batches(cursor:, batch_size: 100)
      @csv.lazy
        .each_slice(batch_size)
        .with_index
        .drop(count_of_processed_rows(cursor))
        .to_enum { (count_of_rows_in_file.to_f / batch_size).ceil }
    end

    private

    # Counts the lines of the underlying file, minus the header line when headers are in use.
    #
    # The file is read directly instead of shelling out to `wc -l`, so paths containing
    # spaces or shell metacharacters are counted correctly and cannot inject commands.
    # A final line without a trailing newline is counted as a line.
    #
    # @return [Integer, nil] nil when the CSV has no path or the path cannot be read
    def count_of_rows_in_file
      filepath = @csv.path
      return unless filepath && File.readable?(filepath)

      count = File.foreach(filepath).count
      count -= 1 if @csv.headers
      count
    end

    # Number of leading items to skip: everything up to and including +cursor+.
    def count_of_processed_rows(cursor)
      cursor ? cursor + 1 : 0
    end
  end
end
@@ -0,0 +1,187 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "active_record_enumerator"
4
+ require_relative "active_record_batch_enumerator"
5
+ require_relative "csv_enumerator"
6
+ require_relative "nested_enumerator"
7
+
8
module SidekiqIteration
  # Helper methods that build the enumerators returned from a job's +build_enumerator+.
  module Enumerators
    # Builds an Enumerator over +array+ that yields +[element, index]+ pairs,
    # skipping every element up to and including the index given by +cursor+.
    #
    # @param array [Array]
    # @param cursor [Integer, nil] index of the last processed element
    # @raise [ArgumentError] if +array+ is not an Array or contains ActiveRecord objects
    #
    # @example
    #   array_enumerator(['build', 'enumerator', 'from', 'any', 'array'], cursor: cursor)
    #
    def array_enumerator(array, cursor:)
      unless array.is_a?(Array)
        raise ArgumentError, "array must be an Array"
      end

      holds_records = defined?(ActiveRecord) && array.any?(ActiveRecord::Base)
      raise ArgumentError, "array cannot contain ActiveRecord objects" if holds_records

      skipped = 0
      skipped = cursor + 1 if cursor

      indexed = array.each_with_index
      indexed.drop(skipped).to_enum { array.size }
    end

    # Builds an Enumerator over the records of an Active Record Relation.
    # Every tick of the Enumerator advances the cursor by one row.
    #
    # @param scope [ActiveRecord::Relation] scope to iterate
    # @param cursor [Object] offset to start iteration from, usually an id
    # @option options :columns [Array<String, Symbol>] used to build the actual query for iteration,
    #   defaults to primary key
    # @option options :batch_size [Integer] (100) size of the batch
    #
    # The +columns:+ option determines the query used for iteration. With the default
    # (the primary key), the first query is:
    #
    #   1) SELECT * FROM users ORDER BY id LIMIT 100
    #
    # When iteration is resumed, the +cursor:+ and +columns:+ values are used to continue
    # from the point where iteration stopped:
    #
    #   2) SELECT * FROM users WHERE id > $CURSOR ORDER BY id LIMIT 100
    #
    # +columns:+ may name more than one column. In that case +cursor+ holds the serialized
    # values of all these columns at the point where iteration stopped.
    #
    # For example, with +columns: [:created_at, :id]+ the query used on the first iteration is:
    #
    #   1) SELECT * FROM "products" ORDER BY created_at, id LIMIT 100
    #
    # And the query on the next iteration:
    #
    #   2) SELECT * FROM "products"
    #        WHERE (created_at > '$LAST_CREATED_AT_CURSOR'
    #          OR (created_at = '$LAST_CREATED_AT_CURSOR' AND (id > '$LAST_ID_CURSOR')))
    #        ORDER BY created_at, id LIMIT 100
    #
    # Because of this query pattern, records whose values in these columns change during
    # iteration may be skipped or yielded more than once, depending on the update and on the
    # cursor's value. A value updated to something greater than the cursor's value is yielded
    # again; a value updated to something less than the cursor's value is skipped.
    #
    # @example
    #   def build_enumerator(cursor:)
    #     active_record_records_enumerator(User.all, cursor: cursor)
    #   end
    #
    #   def each_iteration(user)
    #     user.notify_about_something
    #   end
    #
    def active_record_records_enumerator(scope, cursor:, **options)
      builder = ActiveRecordEnumerator.new(scope, cursor: cursor, **options)
      builder.records
    end

    # Builds an Enumerator over an Active Record Relation that yields batches of records.
    # Every tick of the Enumerator advances the cursor by +batch_size+ rows.
    # @see #active_record_records_enumerator
    #
    # @example
    #   def build_enumerator(product_id, cursor:)
    #     active_record_batches_enumerator(
    #       Comment.where(product_id: product_id).select(:id),
    #       cursor: cursor,
    #       batch_size: 100
    #     )
    #   end
    #
    #   def each_iteration(batch_of_comments, product_id)
    #     comment_ids = batch_of_comments.map(&:id)
    #     CommentService.call(comment_ids: comment_ids)
    #   end
    #
    def active_record_batches_enumerator(scope, cursor:, **options)
      builder = ActiveRecordEnumerator.new(scope, cursor: cursor, **options)
      builder.batches
    end

    # Builds an Enumerator over an Active Record Relation that yields one
    # Active Record Relation per batch.
    # @see #active_record_records_enumerator
    #
    # @example
    #   def build_enumerator(product_id, cursor:)
    #     active_record_relations_enumerator(
    #       Product.find(product_id).comments,
    #       cursor: cursor,
    #       batch_size: 100,
    #     )
    #   end
    #
    #   def each_iteration(batch_of_comments, product_id)
    #     # batch_of_comments will be a Comment::ActiveRecord_Relation
    #     batch_of_comments.update_all(deleted: true)
    #   end
    #
    def active_record_relations_enumerator(scope, cursor:, **options)
      builder = ActiveRecordBatchEnumerator.new(scope, cursor: cursor, **options)
      builder.each
    end

    # Builds an Enumerator over the rows of a CSV file.
    #
    # @param csv [CSV] an instance of CSV object
    # @param cursor [Integer] offset to start iteration from
    #
    # @example
    #   def build_enumerator(import_id, cursor:)
    #     import = Import.find(import_id)
    #     csv_enumerator(import.csv, cursor: cursor)
    #   end
    #
    #   def each_iteration(csv_row)
    #     # insert csv_row to database
    #   end
    #
    def csv_enumerator(csv, cursor:)
      builder = CsvEnumerator.new(csv)
      builder.rows(cursor: cursor)
    end

    # Builds an Enumerator over a CSV file that yields batches of rows.
    #
    # @param csv [CSV] an instance of CSV object
    # @param cursor [Integer] offset to start iteration from
    # @option options :batch_size [Integer] (100) size of the batch
    #
    # @example
    #   def build_enumerator(import_id, cursor:)
    #     import = Import.find(import_id)
    #     csv_batches_enumerator(import.csv, cursor: cursor)
    #   end
    #
    #   def each_iteration(batch_of_csv_rows)
    #     # ...
    #   end
    #
    def csv_batches_enumerator(csv, cursor:, **options)
      builder = CsvEnumerator.new(csv)
      builder.batches(cursor: cursor, **options)
    end

    # Builds an Enumerator for nested iteration.
    #
    # @param enums [Array<Proc>] an Array of Procs, each of which should return an Enumerator.
    #   Each proc should accept the items yielded by the parent enumerators, followed by
    #   the `cursor`, as its arguments.
    #   Each proc's `cursor` argument is its own part of the `build_enumerator`'s `cursor` array.
    # @param cursor [Array<Object>] array of offsets for each of the enums to start iteration from
    #
    # @example
    #   def build_enumerator(cursor:)
    #     nested_enumerator(
    #       [
    #         ->(cursor) { active_record_records_enumerator(Shop.all, cursor: cursor) },
    #         ->(shop, cursor) { active_record_records_enumerator(shop.products, cursor: cursor) },
    #         ->(_shop, product, cursor) { active_record_relations_enumerator(product.product_variants, cursor: cursor) }
    #       ],
    #       cursor: cursor
    #     )
    #   end
    #
    #   def each_iteration(product_variants_relation)
    #     # do something
    #   end
    #
    def nested_enumerator(enums, cursor:)
      builder = NestedEnumerator.new(enums, cursor: cursor)
      builder.each
    end
  end
end