cdc-solid-queue 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a31a7915a2b6ba29afe3497045f7be40477b0388b89a0357a18bb77428f5701c
4
- data.tar.gz: 4485d2cdf5c8137dc7a11503364c3f05a1d00691d83d0be18f3c5a194ba6c968
3
+ metadata.gz: abd5d503ed994dcf6881b7636c24a2669d358e8e29ab3f0c88aef1e0236ed998
4
+ data.tar.gz: 36235e6ab5e44b9b40f75b5a6ca3cfc41fd03746dca852074ee14509c1ad4c07
5
5
  SHA512:
6
- metadata.gz: 9f3a59fef64e5227fcf6ddab9ce45363221058baa8dbed1932e7eedd75620fe2f1b4f0382c97232ad6cfc4f15ac30044f3427e05c21aebec2b4283aafb255f27
7
- data.tar.gz: 6aacc4886cd318f2ce12dfcb6164d3c059c8a37064ef0daced35dc016d40a4047e2054e11b4f0a6d84a3af01447b060452f9e1c6e3dae5f1f187c53df1532035
6
+ metadata.gz: e9d3c198e816c337815e2f7a840365b9176a789b9ab142f46ec9bc65f2414a9024c639f2e7d4b06ff2ce6e357d7e3b9376292997a435d9779f07f156ed8f7a3a
7
+ data.tar.gz: e7f0efe0daa73c5410eb52ea4b1076ec9c64b8022e80c5793bfccbd8b8540517c8ac8fe8d29b5cc0f6199400857a228661923e1918c0e3627035364ec347a106
data/CHANGELOG.md CHANGED
@@ -2,6 +2,11 @@
2
2
 
3
3
  ## Unreleased
4
4
 
5
+ ## 0.3.0
6
+
7
+ - Batch-sized enqueueing now feeds arrays into downstream `process_many`, so
8
+ the Solid Queue job path matches batch-style downstream runtimes.
9
+
5
10
  ## 0.2.0
6
11
 
7
12
  - Optional downstream processor delegation to `cdc-concurrent` and `cdc-parallel`.
data/README.md CHANGED
@@ -52,7 +52,8 @@ end
52
52
  `config.queue` is applied through Active Job's `set(queue:)` API when the job
53
53
  class supports it. When `preserve_order` is enabled, the enqueued payload also
54
54
  includes cdc-solid-queue metadata with the configured ordering key and computed
55
- ordering value.
55
+ ordering value. Set `config.batch_size` above `1` to enqueue multiple CDC
56
+ events in one Solid Queue job and hand the batch to downstream `process_many`.
56
57
 
57
58
  ## Downstream Processing
58
59
 
@@ -90,7 +91,8 @@ config.downstream_options = { size: 4, timeout: 5 }
90
91
  Both runtime gems are optional. Add `cdc-concurrent` or `cdc-parallel` to the
91
92
  application Gemfile when selecting that runtime. Without a configured
92
93
  `downstream_processor`, `CDC::SolidQueue::ProcessorJob` falls back to the job's
93
- own `#process(event)` method.
94
+ own `#process(event)` method, or `#process_many(events)` when a batched payload
95
+ reaches a job that defines it.
94
96
 
95
97
  ## Rails Task
96
98
 
@@ -136,6 +138,9 @@ bundle exec rake benchmark:enqueue
136
138
  Set `CDC_SOLID_QUEUE_BENCH_EVENTS` to control the event count.
137
139
  Set `CDC_SOLID_QUEUE_BENCH_MODE=downstream_direct` to measure direct downstream
138
140
  processor delegation overhead without Solid Queue enqueue translation.
141
+ Set `CDC_SOLID_QUEUE_BENCH_MODE=downstream_batch` to measure batched downstream
142
+ delegation overhead. Set `CDC_SOLID_QUEUE_BENCH_BATCH_SIZE` to control the batch
143
+ width.
139
144
 
140
145
  Example local result on Ruby 3.4.9:
141
146
 
@@ -175,6 +180,17 @@ The result means the direct downstream adapter can dispatch about 6.1M to 6.3M
175
180
  prebuilt synthetic events per second on that machine, making the adapter layer
176
181
  negligible compared with real persistence, CDC source, and processor costs.
177
182
 
183
+ Batch mode example:
184
+
185
+ ```text
186
+ mode=downstream_batch events=100000000 elapsed=... rate=... events/s
187
+ ```
188
+
189
+ Batch mode measures one more layer: batch deserialization plus `process_many`
190
+ dispatch through the downstream adapter. When a downstream runtime such as
191
+ `cdc-concurrent` or `cdc-parallel` is configured, that batch is handed to the
192
+ runtime pool in one call instead of event-by-event.
193
+
178
194
  ## MVP Checkpoint Rule
179
195
 
180
196
  A checkpoint advances after the Solid Queue job is durably inserted. Job execution success is handled by Solid Queue retry semantics.
@@ -27,6 +27,8 @@ module CDC
27
27
  private
28
28
 
29
29
  def position_for(event)
30
+ return event.map { |item| position_for(item) }.compact.last if event.is_a?(Array)
31
+
30
32
  payload = EventSerializer.dump(event)
31
33
  payload['source_position'] || payload['commit_lsn'] || payload.dig('metadata', 'wal_end_lsn')
32
34
  rescue SerializationError
@@ -16,7 +16,7 @@ module CDC
16
16
  DOWNSTREAM_RUNTIMES = %i[concurrent parallel direct].freeze
17
17
 
18
18
  attr_accessor :processor_job, :queue, :preserve_order, :ordering_key, :postgresql, :checkpoint,
19
- :downstream_processor, :downstream_runtime, :downstream_options
19
+ :downstream_processor, :downstream_runtime, :downstream_options, :batch_size
20
20
 
21
21
  # Build a configuration with safe defaults.
22
22
  def initialize
@@ -29,6 +29,7 @@ module CDC
29
29
  @downstream_processor = nil
30
30
  @downstream_runtime = :concurrent
31
31
  @downstream_options = {}
32
+ @batch_size = 1
32
33
  end
33
34
 
34
35
  # Validate this configuration.
@@ -44,6 +45,7 @@ module CDC
44
45
  validate_postgresql!
45
46
  validate_checkpoint!
46
47
  validate_downstream!
48
+ validate_batch_size!
47
49
  true
48
50
  end
49
51
  # rubocop:enable Naming/PredicateMethod
@@ -98,6 +100,12 @@ module CDC
98
100
 
99
101
  raise ConfigurationError, 'downstream_processor must respond to process'
100
102
  end
103
+
104
+ def validate_batch_size!
105
+ return if @batch_size.is_a?(Integer) && @batch_size.positive?
106
+
107
+ raise ConfigurationError, 'batch_size must be a positive Integer'
108
+ end
101
109
  end
102
110
  end
103
111
  end
@@ -14,16 +14,26 @@ module CDC
14
14
 
15
15
  # Process one normalized CDC work item, or an Array of items as one batch.
16
16
  #
17
- # @param item [Object]
17
+ # @param item [Object, Array<Object>]
18
18
  # @return [Object]
19
19
  def process(item)
20
+ return process_many(item) if item.is_a?(Array)
21
+
22
+ process_one(item)
23
+ end
24
+
25
+ # Process many normalized CDC work items.
26
+ #
27
+ # @param items [Array<Object>]
28
+ # @return [Object]
29
+ def process_many(items)
20
30
  case configuration.downstream_runtime
21
31
  when :direct
22
- processor.process(item)
32
+ process_many_direct(items)
23
33
  when :concurrent
24
- process_with_runtime(concurrent_runtime, item)
34
+ process_with_runtime(concurrent_runtime, items)
25
35
  when :parallel
26
- process_with_runtime(parallel_runtime, item)
36
+ process_with_runtime(parallel_runtime, items)
27
37
  else
28
38
  raise ConfigurationError, "unsupported downstream_runtime: #{configuration.downstream_runtime.inspect}"
29
39
  end
@@ -35,12 +45,37 @@ module CDC
35
45
  configuration.downstream_processor || raise(ConfigurationError, 'downstream_processor is required')
36
46
  end
37
47
 
38
- def process_with_runtime(runtime, item)
39
- runtime.process(item)
48
+ def process_one(item)
49
+ case configuration.downstream_runtime
50
+ when :direct
51
+ processor.process(item)
52
+ when :concurrent
53
+ unwrap_single_result(process_with_runtime(concurrent_runtime, [item]))
54
+ when :parallel
55
+ unwrap_single_result(process_with_runtime(parallel_runtime, [item]))
56
+ else
57
+ raise ConfigurationError, "unsupported downstream_runtime: #{configuration.downstream_runtime.inspect}"
58
+ end
59
+ end
60
+
61
+ def process_many_direct(items)
62
+ return processor.process_many(items) if processor.respond_to?(:process_many)
63
+
64
+ items.map { |item| processor.process(item) }
65
+ end
66
+ private :process_many_direct
67
+
68
+ def process_with_runtime(runtime, items)
69
+ runtime.process_many(items)
40
70
  ensure
41
71
  runtime.shutdown
42
72
  end
43
73
 
74
+ def unwrap_single_result(result)
75
+ result.is_a?(Array) && result.length == 1 ? result.first : result
76
+ end
77
+ private :unwrap_single_result
78
+
44
79
  def concurrent_runtime
45
80
  require_runtime('cdc/concurrent', 'cdc-concurrent') unless defined?(CDC::Concurrent::Runtime)
46
81
  CDC::Concurrent::Runtime.new(processor:, **configuration.downstream_options)
@@ -15,10 +15,10 @@ module CDC
15
15
 
16
16
  # Enqueue one CDC event, or an Array of events as a single batched job.
17
17
  #
18
- # @param event [Object, Hash]
18
+ # @param event [Object, Hash, Array<Object>]
19
19
  # @return [Object] Active Job return value
20
20
  def enqueue(event)
21
- payload = EventSerializer.dump(event)
21
+ payload = payload_for(event)
22
22
  payload = EventSerializer.with_enqueue_metadata(payload, enqueue_metadata(payload))
23
23
  job = configuration.processor_job
24
24
  return async_job(job).perform_later(payload) if job.respond_to?(:perform_later)
@@ -39,14 +39,25 @@ module CDC
39
39
  'queue' => configuration.queue,
40
40
  'preserve_order' => configuration.preserve_order,
41
41
  'ordering_key' => configuration.ordering_key,
42
- 'ordering_value' => ordering_value(payload)
42
+ 'ordering_value' => ordering_value(payload),
43
+ 'batch_size' => configuration.batch_size
43
44
  }
44
45
  end
45
46
 
46
47
  def ordering_value(payload)
47
48
  return nil unless configuration.preserve_order
48
49
 
49
- EventSerializer.ordering_value(payload, configuration.ordering_key)
50
+ if payload.is_a?(Array)
51
+ payload.map { |event| EventSerializer.ordering_value(event, configuration.ordering_key) }
52
+ else
53
+ EventSerializer.ordering_value(payload, configuration.ordering_key)
54
+ end
55
+ end
56
+
57
+ def payload_for(event)
58
+ return EventSerializer.dump_batch(event) if event.is_a?(Array)
59
+
60
+ EventSerializer.dump(event)
50
61
  end
51
62
  end
52
63
  end
@@ -6,9 +6,18 @@ module CDC
6
6
  #
7
7
  # Payloads are plain hashes so Active Job can serialize them without needing
8
8
  # to load the original event object in the queue database.
9
+ # rubocop:disable Metrics/ClassLength
9
10
  class EventSerializer
10
11
  # Reserved payload key for cdc-solid-queue enqueue metadata.
11
12
  INTERNAL_METADATA_KEY = '_cdc_solid_queue'
13
+ # Lookup table for ordering value extraction by ordering key.
14
+ ORDERING_VALUE_FETCHERS = {
15
+ identity: ->(payload) { payload['identity'] || payload['primary_key'] },
16
+ primary_key: ->(payload) { payload['identity'] || payload['primary_key'] },
17
+ relation: ->(payload) { [payload['namespace'] || payload['schema'], payload['entity'] || payload['table']] },
18
+ transaction: ->(payload) { payload['transaction_id'] },
19
+ global: ->(payload) { payload['source_position'] || payload['commit_lsn'] }
20
+ }.freeze
12
21
 
13
22
  # Serialize an event-like object.
14
23
  #
@@ -27,6 +36,16 @@ module CDC
27
36
  normalize_hash(payload)
28
37
  end
29
38
 
39
+ # Serialize a batch of event-like objects.
40
+ #
41
+ # @param events [Array<Object>]
42
+ # @return [Array<Hash>]
43
+ def self.dump_batch(events)
44
+ raise SerializationError, 'events must be an Array' unless events.is_a?(Array)
45
+
46
+ events.map { |event| dump(event) }
47
+ end
48
+
30
49
  # Load a serialized event payload.
31
50
  #
32
51
  # @param payload [Hash]
@@ -38,11 +57,23 @@ module CDC
38
57
  strip_internal_metadata(normalize_hash(payload))
39
58
  end
40
59
 
60
+ # Load a batch of serialized event payloads.
61
+ #
62
+ # @param payloads [Array<Hash>]
63
+ # @return [Array<Hash>]
64
+ def self.load_batch(payloads)
65
+ raise SerializationError, 'payloads must be an Array' unless payloads.is_a?(Array)
66
+
67
+ payloads.map { |payload| load(payload) }
68
+ end
69
+
41
70
  # Load a serialized event payload into a CDC event when possible.
42
71
  #
43
72
  # @param payload [Hash, Array<Hash>]
44
73
  # @return [CDC::Core::ChangeEvent, Hash, Array<CDC::Core::ChangeEvent, Hash>]
45
74
  def self.load_event(payload)
75
+ return load_batch(payload).map { |item| load_event(item) } if payload.is_a?(Array)
76
+
46
77
  normalized = load(payload)
47
78
  return normalized unless change_event_payload?(normalized)
48
79
 
@@ -55,6 +86,12 @@ module CDC
55
86
  # @param metadata [Hash]
56
87
  # @return [Hash, Array<Hash>]
57
88
  def self.with_enqueue_metadata(payload, metadata)
89
+ if payload.is_a?(Array)
90
+ return payload.each_with_index.map do |child, index|
91
+ with_enqueue_metadata(child, metadata_for_batch_item(metadata, index))
92
+ end
93
+ end
94
+
58
95
  normalized = normalize_hash(payload)
59
96
  normalized.merge(INTERNAL_METADATA_KEY => normalize_hash(metadata))
60
97
  end
@@ -64,6 +101,8 @@ module CDC
64
101
  # @param payload [Hash, Array<Hash>]
65
102
  # @return [Hash, Array<Hash>]
66
103
  def self.enqueue_metadata(payload)
104
+ return enqueue_metadata_for_batch(payload) if payload.is_a?(Array)
105
+
67
106
  normalized = normalize_hash(payload)
68
107
  metadata = normalized[INTERNAL_METADATA_KEY]
69
108
  metadata.is_a?(Hash) ? metadata : {}
@@ -75,19 +114,13 @@ module CDC
75
114
  # @param key [Symbol]
76
115
  # @return [Object, nil]
77
116
  def self.ordering_value(payload, key)
78
- normalized = load(payload)
79
- case key
80
- when :identity, :primary_key
81
- normalized['identity'] || normalized['primary_key']
82
- when :relation
83
- [normalized['namespace'] || normalized['schema'], normalized['entity'] || normalized['table']]
84
- when :transaction
85
- normalized['transaction_id']
86
- when :global
87
- normalized['source_position'] || normalized['commit_lsn']
88
- when :none
89
- nil
90
- end
117
+ return payload.map { |item| ordering_value(item, key) } if payload.is_a?(Array)
118
+ return nil if key == :none
119
+
120
+ fetcher = ORDERING_VALUE_FETCHERS[key]
121
+ return nil unless fetcher
122
+
123
+ fetcher.call(load(payload))
91
124
  end
92
125
 
93
126
  # Normalize hash keys to strings recursively.
@@ -138,6 +171,25 @@ module CDC
138
171
  payload.key?('operation') && payload.key?('schema') && payload.key?('table')
139
172
  end
140
173
  private_class_method :change_event_payload?
174
+
175
+ def self.enqueue_metadata_for_batch(payloads)
176
+ payloads.each_with_index.map do |payload, index|
177
+ enqueue_metadata(payload).merge(
178
+ 'batch_size' => payloads.length,
179
+ 'batch_index' => index
180
+ )
181
+ end
182
+ end
183
+ private_class_method :enqueue_metadata_for_batch
184
+
185
+ def self.metadata_for_batch_item(metadata, index)
186
+ normalize_hash(metadata).merge(
187
+ 'batch_size' => normalize_hash(metadata).fetch('batch_size'),
188
+ 'batch_index' => index
189
+ )
190
+ end
191
+ private_class_method :metadata_for_batch_item
141
192
  end
193
+ # rubocop:enable Metrics/ClassLength
142
194
  end
143
195
  end
@@ -17,7 +17,7 @@ module CDC
17
17
 
18
18
  # Active Job entrypoint.
19
19
  #
20
- # @param payload [Hash]
20
+ # @param payload [Hash, Array<Hash>]
21
21
  # @return [Object] process return value
22
22
  def perform(payload)
23
23
  event = EventSerializer.load_event(payload)
@@ -25,12 +25,14 @@ module CDC
25
25
  return DownstreamProcessor.new(SolidQueue.configuration).process(event)
26
26
  end
27
27
 
28
+ return process_many(event) if event.is_a?(Array) && respond_to?(:process_many)
29
+
28
30
  process(event)
29
31
  end
30
32
 
31
33
  # Process a normalized CDC event payload, or an Array of payloads when the job does not define #process_many.
32
34
  #
33
- # @param event [Hash]
35
+ # @param event [Hash, Array<Hash>]
34
36
  # @raise [NotImplementedError] when the including job does not override it
35
37
  def process(event)
36
38
  raise NotImplementedError, "#{self.class} must implement #process"
@@ -21,17 +21,31 @@ module CDC
21
21
  #
22
22
  # @return [Integer] number of enqueued events
23
23
  def start
24
+ # @type var batch: Array[untyped]
25
+ batch = []
24
26
  count = 0
27
+
25
28
  @stream.each do |event|
26
- result = @enqueuer.enqueue(event)
27
- checkpoint(event, result)
28
- count += 1
29
+ batch << event
30
+ next unless batch.length >= @enqueuer.configuration.batch_size
31
+
32
+ count += flush_batch(batch)
33
+ batch = []
29
34
  end
30
- count
35
+
36
+ count + flush_batch(batch)
31
37
  end
32
38
 
33
39
  private
34
40
 
41
+ def flush_batch(batch)
42
+ return 0 if batch.empty?
43
+
44
+ result = @enqueuer.enqueue(batch.length == 1 ? batch.first : batch.dup)
45
+ checkpoint(batch, result)
46
+ batch.length
47
+ end
48
+
35
49
  def checkpoint(event, result)
36
50
  store = @enqueuer.configuration.checkpoint
37
51
  store&.advance(event, result)
@@ -3,6 +3,6 @@
3
3
  module CDC
4
4
  module SolidQueue
5
5
  # Current cdc-solid-queue gem version.
6
- VERSION = '0.2.0'
6
+ VERSION = '0.3.0'
7
7
  end
8
8
  end
@@ -81,6 +81,7 @@ module CDC
81
81
  attr_accessor downstream_processor: untyped
82
82
  attr_accessor downstream_runtime: Symbol
83
83
  attr_accessor downstream_options: Hash[Symbol, untyped]
84
+ attr_accessor batch_size: Integer
84
85
 
85
86
  def initialize: () -> void
86
87
  def validate!: () -> true
@@ -94,17 +95,21 @@ module CDC
94
95
  def validate_postgresql!: () -> nil
95
96
  def validate_checkpoint!: () -> nil
96
97
  def validate_downstream!: () -> nil
98
+ def validate_batch_size!: () -> nil
97
99
  end
98
100
 
99
101
  class EventSerializer
100
102
  INTERNAL_METADATA_KEY: String
103
+ ORDERING_VALUE_FETCHERS: Hash[Symbol, Proc]
101
104
 
102
105
  def self.dump: (untyped event) -> Hash[String, untyped]
106
+ def self.dump_batch: (::Array[untyped] events) -> ::Array[Hash[String, untyped]]
103
107
  def self.load: (Hash[untyped, untyped] payload) -> Hash[String, untyped]
104
- def self.load_event: (Hash[untyped, untyped] payload) -> untyped
105
- def self.with_enqueue_metadata: (Hash[untyped, untyped] payload, Hash[untyped, untyped] metadata) -> Hash[String, untyped]
106
- def self.enqueue_metadata: (Hash[untyped, untyped] payload) -> Hash[String, untyped]
107
- def self.ordering_value: (Hash[untyped, untyped] payload, Symbol key) -> untyped
108
+ def self.load_batch: (::Array[Hash[untyped, untyped]] payloads) -> ::Array[Hash[String, untyped]]
109
+ def self.load_event: (Hash[untyped, untyped] | ::Array[Hash[untyped, untyped]]) -> untyped
110
+ def self.with_enqueue_metadata: (Hash[untyped, untyped] | ::Array[Hash[untyped, untyped]], Hash[untyped, untyped]) -> untyped
111
+ def self.enqueue_metadata: (Hash[untyped, untyped] | ::Array[Hash[untyped, untyped]]) -> untyped
112
+ def self.ordering_value: (Hash[untyped, untyped] | ::Array[Hash[untyped, untyped]], Symbol) -> untyped
108
113
 
109
114
  private
110
115
 
@@ -112,6 +117,8 @@ module CDC
112
117
  def self.build_change_event: (Hash[String, untyped] normalized) -> untyped
113
118
  def self.strip_internal_metadata: (Hash[String, untyped] payload) -> Hash[String, untyped]
114
119
  def self.change_event_payload?: (Hash[String, untyped] payload) -> bool
120
+ def self.enqueue_metadata_for_batch: (::Array[Hash[String, untyped]] payloads) -> ::Array[Hash[String, untyped]]
121
+ def self.metadata_for_batch_item: (Hash[untyped, untyped] metadata, Integer index) -> Hash[String, untyped]
115
122
  end
116
123
 
117
124
  class Checkpoint
@@ -135,17 +142,22 @@ module CDC
135
142
  def async_job: (untyped job) -> untyped
136
143
  def enqueue_metadata: (Hash[untyped, untyped] payload) -> Hash[String, untyped]
137
144
  def ordering_value: (Hash[untyped, untyped] payload) -> untyped
145
+ def payload_for: (untyped event) -> untyped
138
146
  end
139
147
 
140
148
  class DownstreamProcessor
141
149
  attr_reader configuration: Configuration
142
150
  def initialize: (Configuration configuration) -> void
143
151
  def process: (untyped item) -> untyped
152
+ def process_many: (::Array[untyped] items) -> untyped
144
153
 
145
154
  private
146
155
 
147
156
  def processor: () -> untyped
148
- def process_with_runtime: (untyped runtime, untyped item) -> untyped
157
+ def process_one: (untyped item) -> untyped
158
+ def process_many_direct: (::Array[untyped] items) -> untyped
159
+ def process_with_runtime: (untyped runtime, ::Array[untyped] items) -> untyped
160
+ def unwrap_single_result: (untyped result) -> untyped
149
161
  def concurrent_runtime: () -> untyped
150
162
  def parallel_runtime: () -> untyped
151
163
  def require_runtime: (String feature, String gem_name) -> untyped
@@ -154,7 +166,9 @@ module CDC
154
166
  module ProcessorJob
155
167
  def self.included: (untyped base) -> void
156
168
  def perform: (Hash[untyped, untyped] payload) -> untyped
169
+ | (::Array[Hash[untyped, untyped]]) -> untyped
157
170
  def process: (Hash[String, untyped] event) -> untyped
171
+ def process_many: (::Array[untyped] events) -> untyped
158
172
  end
159
173
 
160
174
  class Runner
@@ -164,6 +178,7 @@ module CDC
164
178
  private
165
179
 
166
180
  def checkpoint: (untyped event, untyped result) -> untyped
181
+ def flush_batch: (::Array[untyped] batch) -> Integer
167
182
  end
168
183
 
169
184
  class PostgresqlStream
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cdc-solid-queue
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ken C. Demanawa