sidekiq-iteration 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
module SidekiqIteration
  # Batch enumerator based on an ActiveRecord relation.
  #
  # Each tick plucks the cursor columns (plus the primary key) of the next +batch_size+ rows and
  # yields a relation scoped to the primary keys of those rows, together with the cursor value
  # of the last row in the batch.
  # @private
  class ActiveRecordBatchEnumerator
    include Enumerable

    # strftime format used to serialize Time values that end up in the cursor.
    SQL_DATETIME_WITH_NSEC = "%Y-%m-%d %H:%M:%S.%N"

    # @param relation [ActiveRecord::Relation] relation to iterate; must not use ORDER BY or LIMIT
    # @param columns [Array<String, Symbol>, String, Symbol, nil] columns used for ordering and
    #   for the cursor; defaults to the fully-qualified primary key
    # @param batch_size [Integer] maximum number of rows per batch
    # @param cursor [Object, Array, nil] value(s) of +columns+ to resume iteration after
    # @raise [ArgumentError] if +columns+ is empty, if the relation joins a table while a column
    #   is not fully-qualified, or if the relation already has ORDER BY or LIMIT
    def initialize(relation, columns: nil, batch_size: 100, cursor: nil)
      @primary_key = "#{relation.table_name}.#{relation.primary_key}"
      # Array() lets callers pass a single column name without wrapping it in an array;
      # calling #map directly on +columns+ would raise NoMethodError for a String or Symbol.
      @columns = columns.nil? ? [@primary_key] : Array(columns).map(&:to_s)
      @primary_key_index = @columns.index(@primary_key) || @columns.index(relation.primary_key)
      # The primary key is always plucked because it is needed to build the yielded relation.
      @pluck_columns = @primary_key_index ? @columns : @columns + [@primary_key]
      @batch_size = batch_size
      @cursor = Array.wrap(cursor)
      @initial_cursor = @cursor
      raise ArgumentError, "Must specify at least one column" if @columns.empty?

      if relation.joins_values.present? && !@columns.all?(/\./)
        raise ArgumentError, "You need to specify fully-qualified columns if you join a table"
      end

      if relation.arel.orders.present? || relation.arel.taken.present?
        raise ArgumentError,
          "The relation cannot use ORDER BY or LIMIT due to the way how iteration with a cursor is designed. " \
          "You can use other ways to limit the number of rows, e.g. a WHERE condition on the primary key column."
      end

      @base_relation = relation.reorder(@columns.join(", "))
    end

    # Yields every batch as +(relation, cursor_value)+.
    #
    # @return [Enumerator] when no block is given; its size is computed lazily by {#size}
    def each
      return to_enum { size } unless block_given?

      while (relation = next_batch)
        yield relation, cursor_value
      end
    end

    # @return [Integer] number of batches, i.e. the row count divided by the batch size, rounded up
    def size
      (@base_relation.count(:all) + @batch_size - 1) / @batch_size # ceiling division
    end

    private

    # Loads the next batch and advances the cursor.
    #
    # @return [ActiveRecord::Relation, nil] relation restricted to the primary keys of the batch,
    #   or nil when no rows are left (the cursor is then reset to its initial value)
    def next_batch
      relation = @base_relation.limit(@batch_size)
      filter = conditions
      relation = relation.where(*filter) if filter.any?

      cursor_values, ids = relation.uncached { pluck_columns(relation) }

      cursor = cursor_values.last
      unless cursor.present?
        @cursor = @initial_cursor
        return
      end

      # The primary key was plucked, but original cursor did not include it, so we should remove it
      cursor.pop unless @primary_key_index
      @cursor = Array.wrap(cursor)

      # Yields relations by selecting the primary keys of records in the batch.
      # Post.where(published: nil) results in an enumerator of relations like:
      # Post.where(published: nil, ids: batch_of_ids)
      @base_relation.where(@primary_key => ids)
    end

    # Plucks the cursor columns (and the primary key) from +relation+.
    #
    # @return [Array(Array, Array)] the cursor values per row and the primary key per row
    def pluck_columns(relation)
      column_values = relation.pluck(*@pluck_columns)
      # A single plucked column is the primary key: pluck returns a flat list that serves as both.
      return [column_values, column_values] if @pluck_columns.size == 1

      # When the primary key is not one of the cursor columns it was appended last.
      primary_key_index = @primary_key_index || -1
      primary_key_values = column_values.map { |values| values[primary_key_index] }

      serialize_column_values!(column_values)
      [column_values, primary_key_values]
    end

    # @return [Object, Array] the sole cursor value for a single-value cursor, the whole array otherwise
    def cursor_value
      @cursor.size == 1 ? @cursor.first : @cursor
    end

    # Builds the arguments for +where+ that select the rows after the current cursor.
    #
    # For columns (a, b) and cursor (x, y) the result is
    #   ["a > ? OR (a = ? AND (b > ?))", x, x, y]
    # When the cursor has fewer values than there are columns, the innermost comparison is +>=+.
    #
    # @return [Array] SQL fragment followed by its bind values; empty when there is no cursor
    def conditions
      return [] if @cursor.empty?

      last_index = @cursor.size - 1
      operator = @columns.size == @cursor.size ? ">" : ">="
      where_clause = "#{@columns[last_index]} #{operator} ?"

      (last_index - 1).downto(0) do |column_index|
        column = @columns[column_index]
        where_clause = "#{column} > ? OR (#{column} = ? AND (#{where_clause}))"
      end

      # Every value except the last one is bound twice (once for ">", once for "=").
      binds = @cursor[0...-1].flat_map { |value| [value, value] } << @cursor.last
      [where_clause, *binds]
    end

    # Serializes, in place, every value of every plucked row.
    def serialize_column_values!(column_values)
      column_values.map! { |values| values.map! { |value| column_value(value) } }
    end

    # @return [Object] +value+ formatted with SQL_DATETIME_WITH_NSEC when it is a Time, +value+ otherwise
    def column_value(value)
      value.is_a?(Time) ? value.strftime(SQL_DATETIME_WITH_NSEC) : value
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
module SidekiqIteration
  # Cursor over an ActiveRecord relation ordered by one or more columns.
  # Tracks the column values of the last loaded record and loads the batch that follows them.
  # @private
  class ActiveRecordCursor
    include Comparable

    attr_reader :position, :reached_end

    # @param relation [ActiveRecord::Relation] relation to iterate; must not use ORDER BY or LIMIT
    # @param columns [Array<String, Symbol>, String, Symbol, nil] ordering columns; defaults to the
    #   fully-qualified primary key
    # @param position [Object, Array, nil] values of +columns+ to resume iteration after
    # @raise [ArgumentError] if +columns+ is empty, if +position+ contains nil, if the relation
    #   joins a table while a column is not fully-qualified, or if the relation already has
    #   ORDER BY or LIMIT
    def initialize(relation, columns = nil, position = nil)
      columns ||= "#{relation.table_name}.#{relation.primary_key}"
      @columns = Array.wrap(columns)
      raise ArgumentError, "Must specify at least one column" if @columns.empty?

      self.position = Array.wrap(position)
      if relation.joins_values.present? && !@columns.all?(/\./)
        raise ArgumentError, "You need to specify fully-qualified columns if you join a table"
      end

      if relation.arel.orders.present? || relation.arel.taken.present?
        raise ArgumentError,
          "The relation cannot use ORDER BY or LIMIT due to the way how iteration with a cursor is designed. " \
          "You can use other ways to limit the number of rows, e.g. a WHERE condition on the primary key column."
      end

      @base_relation = relation.reorder(@columns.join(", "))
      @reached_end = false
    end

    # Orders cursors by position; a cursor that reached the end sorts after one that did not.
    #
    # @param other [Object]
    # @return [Integer, nil] nil when +other+ is not a cursor, so that Comparable treats the two
    #   as unequal / incomparable instead of failing with NoMethodError
    def <=>(other)
      return unless other.is_a?(self.class)

      if reached_end == other.reached_end
        position <=> other.position
      else
        reached_end ? 1 : -1
      end
    end

    # @param position [Array] column values of the last processed record
    # @raise [ArgumentError] if any of the values is nil
    def position=(position)
      raise ArgumentError, "Cursor position cannot contain nil values" if position.any?(&:nil?)

      @position = position
    end

    # Loads up to +batch_size+ records located after the current position and moves the
    # position to the last loaded record.
    #
    # @param batch_size [Integer]
    # @return [Array, nil] loaded records, or nil when there are none or the end was already reached
    def next_batch(batch_size)
      return if @reached_end

      relation = @base_relation.limit(batch_size)
      filter = conditions
      relation = relation.where(*filter) if filter.any?

      records = relation.uncached { relation.to_a }

      update_from_record(records.last) if records.any?
      # A short batch means there is nothing left to load after it.
      @reached_end = records.size < batch_size

      records if records.any?
    end

    private

    # Builds the arguments for +where+ that select the rows after the current position.
    #
    # For columns (a, b) and position (x, y) the result is
    #   ["a > ? OR (a = ? AND (b > ?))", x, x, y]
    # When the position has fewer values than there are columns, the innermost comparison is +>=+.
    #
    # @return [Array] SQL fragment followed by its bind values; empty when there is no position
    def conditions
      return [] if @position.empty?

      last_index = @position.size - 1
      operator = @columns.size == @position.size ? ">" : ">="
      clause = "#{@columns[last_index]} #{operator} ?"

      (last_index - 1).downto(0) do |i|
        column = @columns[i]
        clause = "#{column} > ? OR (#{column} = ? AND (#{clause}))"
      end

      # Every value except the last one is bound twice (once for ">", once for "=").
      binds = @position[0...-1].flat_map { |value| [value, value] } << @position.last
      [clause, *binds]
    end

    # Sets the position to the values +record+ returns for each cursor column.
    # The table qualifier of a column ("table.column") is dropped to obtain the reader name.
    def update_from_record(record)
      self.position = @columns.map do |column|
        reader = column.to_s.split(".").last
        record.public_send(reader)
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "active_record_cursor"
4
+
5
module SidekiqIteration
  # Builds Enumerator based on ActiveRecord Relation. Supports enumerating on rows and batches.
  # @private
  class ActiveRecordEnumerator
    # strftime format used to serialize datetime column values placed into the cursor.
    SQL_DATETIME_WITH_NSEC = "%Y-%m-%d %H:%M:%S.%N"

    # @param relation [ActiveRecord::Relation] relation to iterate
    # @param columns [Array<String, Symbol>, String, Symbol, nil] ordering/cursor columns;
    #   defaults to the fully-qualified primary key
    # @param batch_size [Integer] number of records loaded per query
    # @param cursor [Object, Array, nil] value(s) of +columns+ to resume iteration after
    # @raise [ArgumentError] if +relation+ is not an ActiveRecord::Relation
    def initialize(relation, columns: nil, batch_size: 100, cursor: nil)
      unless relation.is_a?(ActiveRecord::Relation)
        raise ArgumentError, "relation must be an ActiveRecord::Relation"
      end

      @relation = relation
      @batch_size = batch_size
      @columns = Array(columns || "#{relation.table_name}.#{relation.primary_key}")
      @cursor = cursor
    end

    # @return [Enumerator] yields +(record, cursor_value)+ for every record;
    #   its size is the number of records in the relation
    def records
      Enumerator.new(-> { size }) do |yielder|
        batches.each do |batch, _|
          batch.each do |record|
            yielder.yield(record, cursor_value(record))
          end
        end
      end
    end

    # @return [Enumerator] yields +(records, cursor_value_of_last_record)+ for every batch;
    #   its size is the number of batches (not the number of records), because that is
    #   how many elements it yields
    def batches
      cursor = ActiveRecordCursor.new(@relation, @columns, @cursor)
      Enumerator.new(-> { batches_size }) do |yielder|
        while (records = cursor.next_batch(@batch_size))
          yielder.yield(records, cursor_value(records.last)) if records.any?
        end
      end
    end

    # @return [Integer] number of records in the relation
    def size
      @relation.count(:all)
    end

    private

    # @return [Integer] number of batches needed to cover all records
    def batches_size
      (size + @batch_size - 1) / @batch_size # ceiling division
    end

    # @return [Object, Array] serialized value of the cursor column for +record+, or an array
    #   of such values when there are several cursor columns
    def cursor_value(record)
      positions = @columns.map do |column|
        # Drop the table qualifier of "table.column" to get the attribute name.
        attribute_name = column.to_s.split(".").last
        column_value(record, attribute_name)
      end

      positions.size == 1 ? positions.first : positions
    end

    # Reads +attribute+ from +record+; values of :datetime columns are formatted with
    # SQL_DATETIME_WITH_NSEC.
    def column_value(record, attribute)
      value = record.read_attribute(attribute.to_sym)
      case record.class.columns_hash.fetch(attribute).type
      when :datetime
        value.strftime(SQL_DATETIME_WITH_NSEC)
      else
        value
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
module SidekiqIteration
  # @private
  # CsvEnumerator makes it possible to write an Iteration job
  # that uses CSV file as a collection to iterate.
  #
  # @example Enumerator to iterate on rows
  #   def build_enumerator(cursor:)
  #     csv = CSV.open('tmp/files', converters: :integer, headers: true)
  #     csv_enumerator(csv, cursor: cursor)
  #   end
  #
  #   def each_iteration(row)
  #     ...
  #   end
  #
  # @example Enumerator to iterate on batches of rows
  #   def build_enumerator(cursor:)
  #     csv = CSV.open('tmp/files', converters: :integer, headers: true)
  #     csv_batches_enumerator(csv, cursor: cursor)
  #   end
  #
  #   def each_iteration(batch_of_rows)
  #     ...
  #   end
  #
  class CsvEnumerator
    # Constructs CsvEnumerator instance based on a CSV file.
    #
    # @param [CSV] csv An instance of CSV object (or of a subclass of CSV)
    # @return [SidekiqIteration::CsvEnumerator]
    # @raise [ArgumentError] if +csv+ is not a CSV object
    #
    # @example
    #   csv = CSV.open('tmp/files', converters: :integer, headers: true)
    #   SidekiqIteration::CsvEnumerator.new(csv).rows(cursor: cursor)
    #
    def initialize(csv)
      raise ArgumentError, "CsvEnumerator.new takes CSV object" unless csv.is_a?(CSV)

      @csv = csv
    end

    # Constructs an enumerator on CSV rows. Each element is a +[row, index]+ pair;
    # rows up to and including index +cursor+ are skipped.
    #
    # @param cursor [Integer, nil] index of the last processed row
    # @return [Enumerator] lazy Enumerator instance; its size is the number of rows in the
    #   file, or nil when the CSV is not backed by a file path
    #
    def rows(cursor:)
      @csv.lazy
        .each_with_index
        .drop(count_of_processed_rows(cursor))
        .to_enum { count_of_rows_in_file }
    end

    # Constructs an enumerator on batches of CSV rows. Each element is a +[rows, index]+ pair;
    # batches up to and including index +cursor+ are skipped.
    #
    # @param cursor [Integer, nil] index of the last processed batch
    # @param batch_size [Integer] number of rows per batch
    # @return [Enumerator] lazy Enumerator instance; its size is the number of batches in the
    #   file, or nil when the CSV is not backed by a file path
    #
    def batches(cursor:, batch_size: 100)
      @csv.lazy
        .each_slice(batch_size)
        .with_index
        .drop(count_of_processed_rows(cursor))
        .to_enum do
          total = count_of_rows_in_file
          # Keep "unknown" as nil instead of coercing it to 0 batches.
          (total.to_f / batch_size).ceil if total
        end
    end

    private

    # Counts the lines of the underlying file, minus one when the CSV uses headers.
    #
    # The file is read from Ruby rather than through a shell command, so paths containing
    # spaces or shell metacharacters are handled safely. A last line without a trailing
    # newline is counted as well.
    #
    # @return [Integer, nil] nil when the CSV has no file path
    def count_of_rows_in_file
      filepath = @csv.path
      return unless filepath

      count = File.foreach(filepath).count
      count -= 1 if @csv.headers
      count
    end

    # @return [Integer] how many leading elements to skip: everything up to and including +cursor+
    def count_of_processed_rows(cursor)
      cursor ? cursor + 1 : 0
    end
  end
end
@@ -0,0 +1,187 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "active_record_enumerator"
4
+ require_relative "active_record_batch_enumerator"
5
+ require_relative "csv_enumerator"
6
+ require_relative "nested_enumerator"
7
+
8
module SidekiqIteration
  # Helper methods that build the enumerators used by iteration jobs.
  module Enumerators
    # Builds an Enumerator over the elements of an array, using +cursor+ as an offset.
    # Every element is yielded together with its index.
    #
    # @param array [Array]
    # @param cursor [Integer] index of the last processed element; iteration starts right after it
    # @raise [ArgumentError] if +array+ is not an Array or contains ActiveRecord objects
    #
    # @example
    #   array_enumerator(['build', 'enumerator', 'from', 'any', 'array'], cursor: cursor)
    #
    def array_enumerator(array, cursor:)
      raise ArgumentError, "array must be an Array" unless array.is_a?(Array)

      contains_records = defined?(ActiveRecord) && array.any?(ActiveRecord::Base)
      raise ArgumentError, "array cannot contain ActiveRecord objects" if contains_records

      already_processed = if cursor
        cursor + 1
      else
        0
      end

      remaining = array.each_with_index.drop(already_processed)
      remaining.to_enum { array.size }
    end

    # Builds an Enumerator from an Active Record Relation. Each Enumerator tick moves the cursor
    # one row forward.
    #
    # @param scope [ActiveRecord::Relation] scope to iterate
    # @param cursor [Object] offset to start iteration from, usually an id
    # @option options :columns [Array<String, Symbol>] used to build the actual query for iteration,
    #   defaults to primary key
    # @option options :batch_size [Integer] (100) size of the batch
    #
    # The +columns:+ argument is used to build the actual query for iteration. With the default
    # (the primary key) the first query is:
    #
    #   1) SELECT * FROM users ORDER BY id LIMIT 100
    #
    # When iteration is resumed, the +cursor:+ and +columns:+ values are used to continue from the
    # point where iteration stopped:
    #
    #   2) SELECT * FROM users WHERE id > $CURSOR ORDER BY id LIMIT 100
    #
    # +columns:+ can also take more than one column. In that case, +cursor+ will contain serialized
    # values of all columns at the point where iteration stopped.
    #
    # Consider this example with +columns: [:created_at, :id]+. Here is the query used on the
    # first iteration:
    #
    #   1) SELECT * FROM "products" ORDER BY created_at, id LIMIT 100
    #
    # And the query on the next iteration:
    #
    #   2) SELECT * FROM "products"
    #        WHERE (created_at > '$LAST_CREATED_AT_CURSOR'
    #          OR (created_at = '$LAST_CREATED_AT_CURSOR' AND (id > '$LAST_ID_CURSOR')))
    #        ORDER BY created_at, id LIMIT 100
    #
    # As a result of this query pattern, if the values in these columns change for the records in
    # scope during iteration, they may be skipped or yielded multiple times depending on the nature
    # of the update and the cursor's value. If the value gets updated to a greater value than the
    # cursor's value, it will get yielded again. Similarly, if the value gets updated to a lesser
    # value than the cursor's value, it will get skipped.
    #
    # @example
    #   def build_enumerator(cursor:)
    #     active_record_records_enumerator(User.all, cursor: cursor)
    #   end
    #
    #   def each_iteration(user)
    #     user.notify_about_something
    #   end
    #
    def active_record_records_enumerator(scope, cursor:, **options)
      builder = ActiveRecordEnumerator.new(scope, cursor: cursor, **options)
      builder.records
    end

    # Builds an Enumerator from an Active Record Relation and enumerates on batches of records.
    # Each Enumerator tick moves the cursor +batch_size+ rows forward.
    # @see #active_record_records_enumerator
    #
    # @example
    #   def build_enumerator(product_id, cursor:)
    #     active_record_batches_enumerator(
    #       Comment.where(product_id: product_id).select(:id),
    #       cursor: cursor,
    #       batch_size: 100
    #     )
    #   end
    #
    #   def each_iteration(batch_of_comments, product_id)
    #     comment_ids = batch_of_comments.map(&:id)
    #     CommentService.call(comment_ids: comment_ids)
    #   end
    #
    def active_record_batches_enumerator(scope, cursor:, **options)
      builder = ActiveRecordEnumerator.new(scope, cursor: cursor, **options)
      builder.batches
    end

    # Builds an Enumerator from an Active Record Relation and enumerates on batches, yielding
    # Active Record Relations.
    # @see #active_record_records_enumerator
    #
    # @example
    #   def build_enumerator(product_id, cursor:)
    #     active_record_relations_enumerator(
    #       Product.find(product_id).comments,
    #       cursor: cursor,
    #       batch_size: 100,
    #     )
    #   end
    #
    #   def each_iteration(batch_of_comments, product_id)
    #     # batch_of_comments will be a Comment::ActiveRecord_Relation
    #     batch_of_comments.update_all(deleted: true)
    #   end
    #
    def active_record_relations_enumerator(scope, cursor:, **options)
      builder = ActiveRecordBatchEnumerator.new(scope, cursor: cursor, **options)
      builder.each
    end

    # Builds an Enumerator from a CSV file.
    #
    # @param csv [CSV] an instance of CSV object
    # @param cursor [Integer] offset to start iteration from
    #
    # @example
    #   def build_enumerator(import_id, cursor:)
    #     import = Import.find(import_id)
    #     csv_enumerator(import.csv, cursor: cursor)
    #   end
    #
    #   def each_iteration(csv_row)
    #     # insert csv_row to database
    #   end
    #
    def csv_enumerator(csv, cursor:)
      builder = CsvEnumerator.new(csv)
      builder.rows(cursor: cursor)
    end

    # Builds an Enumerator from a CSV file and enumerates on batches of records.
    #
    # @param csv [CSV] an instance of CSV object
    # @param cursor [Integer] offset to start iteration from
    # @option options :batch_size [Integer] (100) size of the batch
    #
    # @example
    #   def build_enumerator(import_id, cursor:)
    #     import = Import.find(import_id)
    #     csv_batches_enumerator(import.csv, cursor: cursor)
    #   end
    #
    #   def each_iteration(batch_of_csv_rows)
    #     # ...
    #   end
    #
    def csv_batches_enumerator(csv, cursor:, **options)
      builder = CsvEnumerator.new(csv)
      builder.batches(cursor: cursor, **options)
    end

    # Builds an Enumerator for nested iteration.
    #
    # @param enums [Array<Proc>] an Array of Procs, each should return an Enumerator.
    #   Each proc from enums should accept the yielded items from the parent enumerators
    #   and the `cursor` as its arguments.
    #   Each proc's `cursor` argument is its part from the `build_enumerator`'s `cursor` array.
    # @param cursor [Array<Object>] array of offsets for each of the enums to start iteration from
    #
    # @example
    #   def build_enumerator(cursor:)
    #     nested_enumerator(
    #       [
    #         ->(cursor) { active_record_records_enumerator(Shop.all, cursor: cursor) },
    #         ->(shop, cursor) { active_record_records_enumerator(shop.products, cursor: cursor) },
    #         ->(_shop, product, cursor) { active_record_relations_enumerator(product.product_variants, cursor: cursor) }
    #       ],
    #       cursor: cursor
    #     )
    #   end
    #
    #   def each_iteration(product_variants_relation)
    #     # do something
    #   end
    #
    def nested_enumerator(enums, cursor:)
      builder = NestedEnumerator.new(enums, cursor: cursor)
      builder.each
    end
  end
end