karafka 2.4.18 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. checksums.yaml +4 -4
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/workflows/ci.yml +59 -15
  4. data/.github/workflows/push.yml +35 -0
  5. data/.github/workflows/verify-action-pins.yml +16 -0
  6. data/.ruby-version +1 -1
  7. data/CHANGELOG.md +75 -0
  8. data/Gemfile +2 -2
  9. data/Gemfile.lock +72 -53
  10. data/LICENSE-COMM +2 -2
  11. data/README.md +1 -1
  12. data/Rakefile +4 -0
  13. data/bin/clean_kafka +43 -0
  14. data/bin/integrations +20 -6
  15. data/bin/rspecs +15 -3
  16. data/bin/verify_kafka_warnings +35 -0
  17. data/bin/verify_topics_naming +27 -0
  18. data/config/locales/errors.yml +5 -1
  19. data/config/locales/pro_errors.yml +13 -2
  20. data/docker-compose.yml +1 -1
  21. data/examples/payloads/avro/.gitkeep +0 -0
  22. data/examples/payloads/json/sample_set_01/enrollment_event.json +579 -0
  23. data/examples/payloads/json/sample_set_01/ingestion_event.json +30 -0
  24. data/examples/payloads/json/sample_set_01/transaction_event.json +17 -0
  25. data/examples/payloads/json/sample_set_01/user_event.json +11 -0
  26. data/karafka.gemspec +3 -8
  27. data/lib/karafka/active_job/current_attributes.rb +1 -1
  28. data/lib/karafka/active_job/job_extensions.rb +4 -1
  29. data/lib/karafka/admin/acl.rb +5 -1
  30. data/lib/karafka/admin/configs.rb +5 -1
  31. data/lib/karafka/admin.rb +89 -42
  32. data/lib/karafka/base_consumer.rb +17 -8
  33. data/lib/karafka/cli/base.rb +8 -2
  34. data/lib/karafka/cli/topics/align.rb +7 -4
  35. data/lib/karafka/cli/topics/base.rb +17 -0
  36. data/lib/karafka/cli/topics/create.rb +9 -7
  37. data/lib/karafka/cli/topics/delete.rb +4 -2
  38. data/lib/karafka/cli/topics/help.rb +39 -0
  39. data/lib/karafka/cli/topics/repartition.rb +4 -2
  40. data/lib/karafka/cli/topics.rb +10 -3
  41. data/lib/karafka/cli.rb +2 -0
  42. data/lib/karafka/connection/client.rb +39 -9
  43. data/lib/karafka/connection/listener.rb +24 -12
  44. data/lib/karafka/connection/messages_buffer.rb +1 -1
  45. data/lib/karafka/connection/proxy.rb +4 -1
  46. data/lib/karafka/constraints.rb +3 -3
  47. data/lib/karafka/contracts/base.rb +3 -2
  48. data/lib/karafka/contracts/config.rb +5 -1
  49. data/lib/karafka/contracts/topic.rb +1 -1
  50. data/lib/karafka/errors.rb +46 -2
  51. data/lib/karafka/helpers/async.rb +3 -1
  52. data/lib/karafka/helpers/interval_runner.rb +8 -0
  53. data/lib/karafka/instrumentation/callbacks/rebalance.rb +5 -1
  54. data/lib/karafka/instrumentation/logger_listener.rb +95 -32
  55. data/lib/karafka/instrumentation/proctitle_listener.rb +5 -1
  56. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +2 -2
  57. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +17 -2
  58. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +29 -6
  59. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +9 -0
  60. data/lib/karafka/messages/builders/batch_metadata.rb +1 -1
  61. data/lib/karafka/pro/cleaner.rb +8 -0
  62. data/lib/karafka/pro/cli/parallel_segments/base.rb +89 -0
  63. data/lib/karafka/pro/cli/parallel_segments/collapse.rb +164 -0
  64. data/lib/karafka/pro/cli/parallel_segments/distribute.rb +164 -0
  65. data/lib/karafka/pro/cli/parallel_segments.rb +60 -0
  66. data/lib/karafka/pro/connection/manager.rb +5 -8
  67. data/lib/karafka/pro/encryption.rb +12 -1
  68. data/lib/karafka/pro/instrumentation/performance_tracker.rb +1 -1
  69. data/lib/karafka/pro/iterator/expander.rb +5 -3
  70. data/lib/karafka/pro/iterator/tpl_builder.rb +23 -0
  71. data/lib/karafka/pro/loader.rb +10 -0
  72. data/lib/karafka/pro/processing/coordinator.rb +4 -1
  73. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +32 -3
  74. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +11 -0
  75. data/lib/karafka/pro/processing/filters/base.rb +10 -2
  76. data/lib/karafka/pro/processing/filters/expirer.rb +5 -0
  77. data/lib/karafka/pro/processing/filters/inline_insights_delayer.rb +2 -2
  78. data/lib/karafka/pro/processing/filters/virtual_limiter.rb +5 -0
  79. data/lib/karafka/pro/processing/parallel_segments/filters/base.rb +73 -0
  80. data/lib/karafka/pro/processing/parallel_segments/filters/default.rb +85 -0
  81. data/lib/karafka/pro/processing/parallel_segments/filters/mom.rb +66 -0
  82. data/lib/karafka/pro/processing/partitioner.rb +1 -13
  83. data/lib/karafka/pro/processing/piping/consumer.rb +13 -13
  84. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +1 -1
  85. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +1 -1
  86. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +1 -1
  87. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +1 -1
  88. data/lib/karafka/pro/processing/strategies/aj/ftr_lrj_mom_vp.rb +1 -1
  89. data/lib/karafka/pro/processing/strategies/aj/lrj_mom_vp.rb +1 -1
  90. data/lib/karafka/pro/processing/strategies/default.rb +36 -8
  91. data/lib/karafka/pro/processing/strategies/dlq/default.rb +15 -10
  92. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +1 -1
  93. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +1 -1
  94. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +3 -1
  95. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +1 -1
  96. data/lib/karafka/pro/processing/strategies/ftr/default.rb +1 -1
  97. data/lib/karafka/pro/processing/strategies/lrj/default.rb +4 -1
  98. data/lib/karafka/pro/processing/strategies/lrj/ftr.rb +1 -1
  99. data/lib/karafka/pro/processing/strategies/lrj/ftr_mom.rb +1 -1
  100. data/lib/karafka/pro/processing/strategies/lrj/mom.rb +1 -1
  101. data/lib/karafka/pro/processing/virtual_partitions/distributors/balanced.rb +50 -0
  102. data/lib/karafka/pro/processing/virtual_partitions/distributors/base.rb +29 -0
  103. data/lib/karafka/pro/processing/virtual_partitions/distributors/consistent.rb +27 -0
  104. data/lib/karafka/pro/recurring_tasks/contracts/config.rb +8 -4
  105. data/lib/karafka/pro/recurring_tasks/dispatcher.rb +3 -3
  106. data/lib/karafka/pro/recurring_tasks/setup/config.rb +7 -2
  107. data/lib/karafka/pro/recurring_tasks.rb +21 -2
  108. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +1 -1
  109. data/lib/karafka/pro/routing/features/multiplexing/config.rb +1 -0
  110. data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +17 -0
  111. data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +5 -2
  112. data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +8 -1
  113. data/lib/karafka/pro/routing/features/parallel_segments/builder.rb +47 -0
  114. data/lib/karafka/pro/routing/features/parallel_segments/config.rb +27 -0
  115. data/lib/karafka/pro/routing/features/parallel_segments/consumer_group.rb +83 -0
  116. data/lib/karafka/pro/routing/features/parallel_segments/contracts/consumer_group.rb +49 -0
  117. data/lib/karafka/pro/routing/features/parallel_segments/topic.rb +43 -0
  118. data/lib/karafka/pro/routing/features/parallel_segments.rb +24 -0
  119. data/lib/karafka/pro/routing/features/patterns/pattern.rb +1 -1
  120. data/lib/karafka/pro/routing/features/recurring_tasks/builder.rb +2 -2
  121. data/lib/karafka/pro/routing/features/scheduled_messages/builder.rb +10 -6
  122. data/lib/karafka/pro/routing/features/swarm/contracts/routing.rb +3 -2
  123. data/lib/karafka/pro/routing/features/swarm.rb +4 -1
  124. data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +20 -2
  125. data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
  126. data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +8 -2
  127. data/lib/karafka/pro/scheduled_messages/consumer.rb +61 -26
  128. data/lib/karafka/pro/scheduled_messages/daily_buffer.rb +9 -6
  129. data/lib/karafka/pro/scheduled_messages/deserializers/headers.rb +7 -1
  130. data/lib/karafka/pro/scheduled_messages/dispatcher.rb +2 -1
  131. data/lib/karafka/pro/scheduled_messages/max_epoch.rb +15 -6
  132. data/lib/karafka/pro/scheduled_messages/proxy.rb +15 -3
  133. data/lib/karafka/pro/scheduled_messages/serializer.rb +2 -4
  134. data/lib/karafka/pro/scheduled_messages/state.rb +20 -23
  135. data/lib/karafka/pro/scheduled_messages/tracker.rb +34 -8
  136. data/lib/karafka/pro/scheduled_messages.rb +17 -1
  137. data/lib/karafka/processing/coordinators_buffer.rb +1 -0
  138. data/lib/karafka/processing/strategies/default.rb +4 -4
  139. data/lib/karafka/routing/builder.rb +12 -3
  140. data/lib/karafka/routing/features/base/expander.rb +8 -2
  141. data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
  142. data/lib/karafka/routing/subscription_group.rb +1 -1
  143. data/lib/karafka/runner.rb +7 -1
  144. data/lib/karafka/server.rb +21 -18
  145. data/lib/karafka/setup/attributes_map.rb +2 -0
  146. data/lib/karafka/setup/config.rb +40 -7
  147. data/lib/karafka/setup/defaults_injector.rb +26 -1
  148. data/lib/karafka/status.rb +6 -1
  149. data/lib/karafka/swarm/node.rb +31 -0
  150. data/lib/karafka/swarm/supervisor.rb +9 -2
  151. data/lib/karafka/templates/karafka.rb.erb +14 -1
  152. data/lib/karafka/version.rb +1 -1
  153. data/lib/karafka.rb +17 -9
  154. data/renovate.json +14 -2
  155. metadata +41 -40
  156. checksums.yaml.gz.sig +0 -0
  157. data/certs/cert.pem +0 -26
  158. data.tar.gz.sig +0 -0
  159. metadata.gz.sig +0 -0
@@ -22,7 +22,10 @@ module Karafka
 
   # @param config [Karafka::Core::Configurable::Node] root node config
   def post_setup(config)
- Encryption::Contracts::Config.new.validate!(config.to_h)
+ Encryption::Contracts::Config.new.validate!(
+ config.to_h,
+ scope: %w[config]
+ )
 
   # Don't inject extra components if encryption is not active
   return unless config.encryption.active
@@ -33,6 +36,14 @@ module Karafka
   # Encryption for WaterDrop
   config.producer.middleware.append(Messages::Middleware.new)
   end
+
+ # This feature does not need any changes post-fork
+ #
+ # @param _config [Karafka::Core::Configurable::Node]
+ # @param _pre_fork_producer [WaterDrop::Producer]
+ def post_fork(_config, _pre_fork_producer)
+ true
+ end
   end
   end
   end
@@ -50,7 +50,7 @@ module Karafka
   partition = messages.metadata.partition
 
   samples = @processing_times[topic][partition]
- samples << event[:time] / messages.count
+ samples << event[:time] / messages.size
 
   return unless samples.size > SAMPLES_COUNT
 
@@ -21,8 +21,10 @@ module Karafka
   # - { 'topic1' => 100 } - means we run all partitions from the offset 100
   # - { 'topic1' => Time.now - 60 } - we run all partitions from the message from 60s ago
   # - { 'topic1' => { 1 => Time.now - 60 } } - partition1 from message 60s ago
- # - { 'topic1' => { 1 => true } } - will pick first offset not consumed on this CG for p 1
- # - { 'topic1' => true } - will pick first offset not consumed on this CG for all p
+ # - { 'topic1' => { 1 => true } } - will pick first offset on this CG for partition 1
+ # - { 'topic1' => true } - will pick first offset for all partitions
+ # - { 'topic1' => :earliest } - will pick earliest offset for all partitions
+ # - { 'topic1' => :latest } - will pick latest (high-watermark) for all partitions
   class Expander
   # Expands topics to which we want to subscribe with partitions information in case this
   # info is not provided.
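The two new named positions map to librdkafka's special offsets (earliest -2, latest -1), as resolved by the TplBuilder changes later in this diff. A minimal sketch of how they could be passed to the Pro Iterator, assuming it accepts the same topics-hash forms documented in the comments above (topic names are hypothetical):

iterator = Karafka::Pro::Iterator.new(
  {
    # all partitions, starting from the earliest available offsets
    'events' => :earliest,
    # partition 0 only, starting from the high-watermark (new messages)
    'audit_logs' => { 0 => :latest }
  }
)

iterator.each do |message|
  puts "#{message.topic}/#{message.partition}##{message.offset}"
end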
@@ -80,7 +82,7 @@ module Karafka
   .find { |topic| topic.fetch(:topic_name) == name }
   .tap { |topic| topic || raise(Errors::TopicNotFoundError, name) }
   .fetch(:partitions)
- .count
+ .size
   end
   end
   end
@@ -14,6 +14,11 @@ module Karafka
   # This builder resolves that and builds a tpl to which we can safely subscribe the way
   # we want it.
   class TplBuilder
+ # Supported named offset positions that we can reference via their name
+ SUPPORTED_NAMED_POSITIONS = %w[earliest latest].freeze
+
+ private_constant :SUPPORTED_NAMED_POSITIONS
+
   # @param consumer [::Rdkafka::Consumer] consumer instance needed to talk with Kafka
   # @param expanded_topics [Hash] hash with expanded and normalized topics data
   def initialize(consumer, expanded_topics)
@@ -28,6 +33,7 @@ module Karafka
   resolve_partitions_with_exact_offsets
   resolve_partitions_with_negative_offsets
   resolve_partitions_with_time_offsets
+ resolve_partitions_with_named_offsets
   resolve_partitions_with_cg_expectations
 
   # Final tpl with all the data
@@ -143,6 +149,23 @@ module Karafka
   end
   end
 
+ # If we get named offsets, we can just remap them to librdkafka special offset positions
+ def resolve_partitions_with_named_offsets
+ @expanded_topics.each do |name, partitions|
+ next unless partitions.is_a?(Hash)
+
+ partitions.each do |partition, offset|
+ # Skip offsets that do not match our named expectations
+ named_offset = offset.to_s
+
+ next unless SUPPORTED_NAMED_POSITIONS.include?(named_offset)
+
+ @mapped_topics[name][partition] = -1 if named_offset == 'latest'
+ @mapped_topics[name][partition] = -2 if named_offset == 'earliest'
+ end
+ end
+ end
+
   # Fetches last used offsets for those partitions for which we want to consume from last
   # moment where given consumer group has finished
   # This is indicated by given partition value being set to `true`.
@@ -60,6 +60,15 @@ module Karafka
   Processing::SubscriptionGroupsCoordinator.instance
   end
 
+ # Runs operations needed after fork in swarm for features that need it
+ #
+ # @param config [Karafka::Core::Configurable::Node]
+ # @param pre_fork_producer [WaterDrop::Producer] pre fork producer instance that may be
+ # needed to be replaced with newly changed one post-fork.
+ def post_fork(config, pre_fork_producer)
+ features.each { |feature| feature.post_fork(config, pre_fork_producer) }
+ end
+
   private
 
   # @return [Array<Module>] extra non-routing related pro features and routing components
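With the loader dispatch above, every Pro feature is expected to expose a post_fork hook next to post_setup; the encryption change earlier in this diff is one example of a no-op implementation. A hedged sketch of the shape such a feature module takes (the feature name is hypothetical):

module MyFeature
  class << self
    # Called once after the framework configuration is finalized
    def post_setup(_config)
      true
    end

    # Called in each forked swarm node; features holding non-fork-safe
    # resources (such as producers) can rebuild them here
    def post_fork(_config, _pre_fork_producer)
      true
    end
  end
end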
@@ -84,6 +93,7 @@ module Karafka
   icfg.connection.manager = Connection::Manager.new
 
   icfg.processing.coordinator_class = Processing::Coordinator
+ icfg.processing.errors_tracker_class = Processing::Coordinators::ErrorsTracker
   icfg.processing.partitioner_class = Processing::Partitioner
   icfg.processing.scheduler_class = Processing::Schedulers::Default
   icfg.processing.jobs_queue_class = Processing::JobsQueue
@@ -10,6 +10,9 @@ module Karafka
   # within the same partition
   class Coordinator < ::Karafka::Processing::Coordinator
   extend Forwardable
+ include Helpers::ConfigImporter.new(
+ errors_tracker_class: %i[internal processing errors_tracker_class]
+ )
 
   def_delegators :@collapser, :collapsed?, :collapse_until!
 
@@ -20,7 +23,7 @@ module Karafka
   super
 
   @executed = []
- @errors_tracker = Coordinators::ErrorsTracker.new
+ @errors_tracker = errors_tracker_class.new(topic, partition)
   @flow_mutex = Mutex.new
   # Lock for user code synchronization
   # We do not want to mix coordinator lock with the user lock not to create cases where
@@ -13,25 +13,52 @@ module Karafka
   class ErrorsTracker
   include Enumerable
 
+ # @return [Karafka::Routing::Topic] topic of this error tracker
+ attr_reader :topic
+
+ # @return [Integer] partition of this error tracker
+ attr_reader :partition
+
+ # @return [Hash]
+ attr_reader :counts
+
+ # @return [String]
+ attr_reader :trace_id
+
   # Max errors we keep in memory.
   # We do not want to keep more because for DLQ-less this would cause memory-leaks.
+ # We do however count per class for granular error counting
   STORAGE_LIMIT = 100
 
   private_constant :STORAGE_LIMIT
 
- def initialize
+ # @param topic [Karafka::Routing::Topic]
+ # @param partition [Integer]
+ # @param limit [Integer] max number of errors we want to keep for reference when
+ # implementing custom error handling.
+ # @note `limit` does not apply to the counts. They will work beyond the number of errors
+ # occurring
+ def initialize(topic, partition, limit: STORAGE_LIMIT)
   @errors = []
+ @counts = Hash.new { |hash, key| hash[key] = 0 }
+ @topic = topic
+ @partition = partition
+ @limit = limit
+ @trace_id = SecureRandom.uuid
   end
 
   # Clears all the errors
   def clear
   @errors.clear
+ @counts.clear
   end
 
   # @param error [StandardError] adds the error to the tracker
   def <<(error)
- @errors.shift if @errors.size >= STORAGE_LIMIT
+ @errors.shift if @errors.size >= @limit
   @errors << error
+ @counts[error.class] += 1
+ @trace_id = SecureRandom.uuid
   end
 
   # @return [Boolean] is the error tracker empty
@@ -41,7 +68,9 @@ module Karafka
 
   # @return [Integer] number of elements
   def size
- count
+ # We use counts reference of all errors and not the `@errors` array because it allows
+ # us to go beyond the whole errors storage limit
+ @counts.values.sum
   end
 
   # @return [StandardError, nil] last error that occurred or nil if no errors
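Taken together, the errors tracker now carries per-class counts and a regenerating trace id alongside the capped error list. An illustrative snippet based only on the methods visible in this hunk (the coordinator normally builds the tracker, so constructing it by hand is purely for demonstration; topic stands for a routing topic instance):

tracker = Karafka::Pro::Processing::Coordinators::ErrorsTracker.new(topic, 0, limit: 10)

tracker << ArgumentError.new('boom')
tracker << ArgumentError.new('boom again')
tracker << RuntimeError.new('other')

tracker.counts   #=> { ArgumentError => 2, RuntimeError => 1 }
tracker.size     #=> 3 (sum of counts, so it can exceed the storage limit)
tracker.trace_id #=> a fresh UUID, regenerated on every tracked error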
@@ -98,6 +98,17 @@ module Karafka
   :mark_as_consumed
   end
 
+ # The first (lowest) message we want to mark as consumed in marking. By default it uses
+ # same position as cursor in case user wants to mark same message as consumed as the
+ # one on which cursor action is applied.
+ # @return [Karafka::Messages::Message, nil] cursor marking message or nil if none
+ # @note It should not return position in time format, only numerical offset
+ def marking_cursor
+ return nil unless active?
+
+ applied.map(&:marking_cursor).compact.min_by(&:offset)
+ end
+
   private
 
   # @return [Boolean] is filtering active
@@ -42,9 +42,11 @@ module Karafka
   @applied
   end
 
- # @return [Integer] default timeout for pausing (if applicable)
+ # @return [Integer, nil] default timeout for pausing (if applicable) or nil if not
+ # @note Please do not return `0` when your filter is not pausing as it may interact
+ # with other filters that want to pause.
   def timeout
- 0
+ nil
   end
 
   # @return [Boolean] should we use the cursor value to mark as consumed. If any of the
@@ -58,6 +60,12 @@ module Karafka
   def marking_method
   :mark_as_consumed
   end
+
+ # @return [Karafka::Messages::Message, nil] cursor message for marking or nil if no
+ # marking
+ def marking_cursor
+ cursor
+ end
   end
   end
   end
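Under the revised contract, custom filters return nil (never 0) from #timeout when they do not pause, and may override #marking_cursor when the marking position should differ from the cursor. A sketch of a hypothetical user filter following these conventions (not part of this release):

class OldMessagesFilter < Karafka::Pro::Processing::Filters::Base
  # Drops messages older than one hour
  def apply!(messages)
    @applied = false
    @cursor = nil

    messages.delete_if do |message|
      next false if message.timestamp > Time.now - 3600

      @applied = true
      @cursor = message
      true
    end
  end

  # This filter never pauses, so it reports no timeout at all
  def timeout
    nil
  end
end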
@@ -36,6 +36,11 @@ module Karafka
   too_old
   end
   end
+
+ # @return [nil] this filter does not deal with timeouts
+ def timeout
+ nil
+ end
   end
   end
   end
@@ -54,9 +54,9 @@ module Karafka
   @applied = true
   end
 
- # @return [Integer] ms timeout in case of pause
+ # @return [Integer, nil] ms timeout in case of pause or nil if not delaying
   def timeout
- @cursor && applied? ? PAUSE_TIMEOUT : 0
+ @cursor && applied? ? PAUSE_TIMEOUT : nil
   end
 
   # Pause when we had to back-off or skip if delay is not needed
@@ -37,6 +37,11 @@ module Karafka
 
   messages.delete_if { |message| marked.include?(message.offset) }
   end
+
+ # @return [nil] This filter does not deal with pausing, so timeout is always nil
+ def timeout
+ nil
+ end
   end
   end
   end
@@ -0,0 +1,73 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+ module Pro
+ module Processing
+ module ParallelSegments
+ # Module for filters injected into the processing pipeline of each of the topics used
+ # within the parallel segmented consumer groups
+ module Filters
+ # Base class for filters for parallel segments that deal with different feature scenarios
+ class Base < Processing::Filters::Base
+ # @param segment_id [Integer] numeric id of the parallel segment group to use with the
+ # partitioner and reducer for segment matching comparison
+ # @param partitioner [Proc]
+ # @param reducer [Proc]
+ def initialize(segment_id:, partitioner:, reducer:)
+ super()
+
+ @segment_id = segment_id
+ @partitioner = partitioner
+ @reducer = reducer
+ end
+
+ private
+
+ # @param message [Karafka::Messages::Message] received message
+ # @return [String, Numeric] segment assignment key
+ def partition(message)
+ @partitioner.call(message)
+ rescue StandardError => e
+ # This should not happen. If you are seeing this it means your partitioner code
+ # failed and raised an error. We highly recommend mitigating partitioner level errors
+ # on the user side because this type of collapse should be considered a last resort
+ Karafka.monitor.instrument(
+ 'error.occurred',
+ caller: self,
+ error: e,
+ message: message,
+ type: 'parallel_segments.partitioner.error'
+ )
+
+ :failure
+ end
+
+ # @param message_segment_key [String, Numeric] segment key to pass to the reducer
+ # @return [Integer] segment assignment of a given message
+ def reduce(message_segment_key)
+ # Assign to segment 0 always in case of failures in partitioner
+ # This is a fail-safe
+ return 0 if message_segment_key == :failure
+
+ @reducer.call(message_segment_key)
+ rescue StandardError => e
+ # @see `#partition` method error handling doc
+ Karafka.monitor.instrument(
+ 'error.occurred',
+ caller: self,
+ error: e,
+ message_segment_key: message_segment_key,
+ type: 'parallel_segments.reducer.error'
+ )
+
+ 0
+ end
+ end
+ end
+ end
+ end
+ end
+ end
@@ -0,0 +1,85 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+ module Pro
+ module Processing
+ # Processing components namespace for parallel segments feature
+ module ParallelSegments
+ module Filters
+ # Filter used for handling parallel segments with automatic offset management. Handles
+ # message distribution and ensures proper offset management when messages are filtered
+ # out during the distribution process.
+ #
+ # When operating in automatic offset management mode, this filter takes care of marking
+ # offsets of messages that were filtered out during the distribution process to maintain
+ # proper offset progression.
+ #
+ # @note This is the default filter that should be used when manual offset management
+ # is not enabled. For manual offset management scenarios use the Mom filter instead.
+ class Default < Base
+ # Applies the filter to the batch of messages
+ # It removes messages that don't belong to the current parallel segment group
+ # based on the partitioner and reducer logic
+ #
+ # @param messages [Array<Karafka::Messages::Message>] messages batch that we want to
+ # filter
+ def apply!(messages)
+ @applied = false
+ @all_filtered = false
+ @cursor = messages.first
+
+ # Keep track of how many messages we had initially
+ initial_size = messages.size
+
+ # Filter out messages that don't match our segment group
+ messages.delete_if do |message|
+ message_segment_key = partition(message)
+
+ # Use the reducer to get the target group for this message
+ target_segment = reduce(message_segment_key)
+
+ # Remove the message if it doesn't belong to our group
+ remove = target_segment != @segment_id
+
+ if remove
+ @cursor = message
+ @applied = true
+ end
+
+ remove
+ end
+
+ # If all messages were filtered out, we want to mark them as consumed
+ @all_filtered = messages.empty? && initial_size.positive?
+ end
+
+ # @return [Boolean] true if any messages were filtered out
+ def applied?
+ @applied
+ end
+
+ # @return [Boolean] true if we should mark as consumed (when all were filtered)
+ def mark_as_consumed?
+ @all_filtered
+ end
+
+ # @return [nil] Since we do not timeout ever in this filter, we should not return
+ # any value for it.
+ def timeout
+ nil
+ end
+
+ # Only return cursor if we wanted to mark as consumed in case all was filtered.
+ # Otherwise it could interfere with other filters
+ def cursor
+ @all_filtered ? @cursor : nil
+ end
+ end
+ end
+ end
+ end
+ end
+ end
@@ -0,0 +1,66 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+ module Pro
+ module Processing
+ module ParallelSegments
+ module Filters
+ # Filter used for handling parallel segments when manual offset management (mom) is
+ # enabled. Provides message distribution without any post-filtering offset state
+ # management as it is fully user-based.
+ #
+ # Since with manual offset management we need to ensure that offsets are never marked
+ # even in cases where all data in a batch is filtered out.
+ #
+ # This separation allows for cleaner implementation and easier debugging of each flow.
+ #
+ # @note This filter should be used only when manual offset management is enabled.
+ # For automatic offset management scenarios use the regular filter instead.
+ class Mom < Base
+ # Applies the filter to the batch of messages
+ # It removes messages that don't belong to the current parallel segment group
+ # based on the partitioner and reducer logic without any offset marking
+ #
+ # @param messages [Array<Karafka::Messages::Message>] messages batch that we want to
+ # filter
+ def apply!(messages)
+ @applied = false
+
+ # Filter out messages that don't match our segment group
+ messages.delete_if do |message|
+ message_segment_key = partition(message)
+ # Use the reducer to get the target group for this message
+ target_segment = reduce(message_segment_key)
+ # Remove the message if it doesn't belong to our segment
+ remove = target_segment != @segment_id
+
+ @applied = true if remove
+
+ remove
+ end
+ end
+
+ # @return [Boolean] true if any messages were filtered out
+ def applied?
+ @applied
+ end
+
+ # @return [Boolean] false, as mom mode never marks as consumed automatically
+ def mark_as_consumed?
+ false
+ end
+
+ # @return [nil] Since we do not timeout ever in this filter, we should not return
+ # any value for it.
+ def timeout
+ nil
+ end
+ end
+ end
+ end
+ end
+ end
+ end
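The three new files above split distribution from offset handling: Base wraps the user partitioner and reducer with fail-safe error instrumentation, while Default and Mom only differ in whether fully filtered batches get marked as consumed. The keep/drop decision itself is just the partitioner output fed through the reducer and compared with the segment id, roughly as below (a standalone sketch with hypothetical lambdas and messages):

Message = Struct.new(:key, :payload)

messages = [Message.new('user-1'), Message.new('user-2'), Message.new('user-3')]

partitioner = ->(message) { message.key }  # derive a segment key from each message
reducer     = ->(key) { key.to_s.sum % 2 } # map the key onto one of 2 segments
segment_id  = 0                            # the segment handled by this consumer group copy

kept = messages.reject do |message|
  reducer.call(partitioner.call(message)) != segment_id
end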
@@ -38,19 +38,7 @@ module Karafka
   # reduce the whole set into one partition and emit error. This should still allow for
   # user flow but should mitigate damages by not virtualizing
   begin
- groupings = messages.group_by do |msg|
- # We need to reduce it to the max concurrency, so the group_id is not a direct
- # effect of the end user action. Otherwise the persistence layer for consumers
- # would cache it forever and it would cause memory leaks
- #
- # This also needs to be consistent because the aggregation here needs to warrant,
- # that the same partitioned message will always be assigned to the same virtual
- # partition. Otherwise in case of a window aggregation with VP spanning across
- # several polls, the data could not be complete.
- vps.reducer.call(
- vps.partitioner.call(msg)
- )
- end
+ groupings = vps.distributor.call(messages)
   rescue StandardError => e
   # This should not happen. If you are seeing this it means your partitioner code
   # failed and raised an error. We highly recommend mitigating partitioner level errors
@@ -20,16 +20,16 @@ module Karafka
 
   # Pipes given message to the provided topic with expected details. Useful for
   # pass-through operations where deserialization is not needed. Upon usage it will include
- # all the original headers + meta headers about the source of message.
+ # all the source headers + meta headers about the source of message.
   #
   # @param topic [String, Symbol] where we want to send the message
- # @param message [Karafka::Messages::Message] original message to pipe
+ # @param message [Karafka::Messages::Message] source message to pipe
   #
   # @note It will NOT deserialize the payload so it is fast
   #
   # @note We assume that there can be different number of partitions in the target topic,
- # this is why we use `key` based on the original topic key and not the partition id.
- # This will not utilize partitions beyond the number of partitions of original topic,
+ # this is why we use `key` based on the source topic key and not the partition id.
+ # This will not utilize partitions beyond the number of partitions of source topic,
   # but will accommodate for topics with less partitions.
   def pipe_async(topic:, message:)
   produce_async(
@@ -40,7 +40,7 @@ module Karafka
   # Sync version of pipe for one message
   #
   # @param topic [String, Symbol] where we want to send the message
- # @param message [Karafka::Messages::Message] original message to pipe
+ # @param message [Karafka::Messages::Message] source message to pipe
   # @see [#pipe_async]
   def pipe_sync(topic:, message:)
   produce_sync(
@@ -51,7 +51,7 @@ module Karafka
   # Async multi-message pipe
   #
   # @param topic [String, Symbol] where we want to send the message
- # @param messages [Array<Karafka::Messages::Message>] original messages to pipe
+ # @param messages [Array<Karafka::Messages::Message>] source messages to pipe
   #
   # @note If transactional producer in use and dispatch is not wrapped with a transaction,
   # it will automatically wrap the dispatch with a transaction
@@ -66,7 +66,7 @@ module Karafka
   # Sync multi-message pipe
   #
   # @param topic [String, Symbol] where we want to send the message
- # @param messages [Array<Karafka::Messages::Message>] original messages to pipe
+ # @param messages [Array<Karafka::Messages::Message>] source messages to pipe
   #
   # @note If transactional producer in use and dispatch is not wrapped with a transaction,
   # it will automatically wrap the dispatch with a transaction
@@ -81,7 +81,7 @@ module Karafka
   private
 
   # @param topic [String, Symbol] where we want to send the message
- # @param message [Karafka::Messages::Message] original message to pipe
+ # @param message [Karafka::Messages::Message] source message to pipe
   # @return [Hash] hash with message to pipe.
   #
   # @note If you need to alter this, please define the `#enhance_pipe_message` method
@@ -90,17 +90,17 @@ module Karafka
   topic: topic,
   payload: message.raw_payload,
   headers: message.raw_headers.merge(
- 'original_topic' => message.topic,
- 'original_partition' => message.partition.to_s,
- 'original_offset' => message.offset.to_s,
- 'original_consumer_group' => self.topic.consumer_group.id
+ 'source_topic' => message.topic,
+ 'source_partition' => message.partition.to_s,
+ 'source_offset' => message.offset.to_s,
+ 'source_consumer_group' => self.topic.consumer_group.id
   )
   }
 
   # Use a key only if key was provided
   if message.raw_key
   pipe_message[:key] = message.raw_key
- # Otherwise pipe creating a key that will assign it based on the original partition
+ # Otherwise pipe creating a key that will assign it based on the source partition
   # number
   else
   pipe_message[:key] = message.partition.to_s
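Downstream consumers of piped topics should now read the source_* headers instead of the former original_* ones. A hedged sketch of a consumer on the target topic (class and topic names are hypothetical):

class PipedEventsConsumer < ApplicationConsumer
  def consume
    messages.each do |message|
      headers = message.headers

      Karafka.logger.info(
        "Piped from #{headers['source_topic']}##{headers['source_partition']} " \
        "offset #{headers['source_offset']} via CG #{headers['source_consumer_group']}"
      )
    end
  end
end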
@@ -38,7 +38,7 @@ module Karafka
   elsif !revoked?
   # no need to check for manual seek because AJ consumer is internal and
   # fully controlled by us
- seek(seek_offset, false)
+ seek(seek_offset, false, reset_offset: false)
   resume
   else
   resume
@@ -44,7 +44,7 @@ module Karafka
   elsif !revoked?
   # no need to check for manual seek because AJ consumer is internal and
   # fully controlled by us
- seek(seek_offset, false)
+ seek(seek_offset, false, reset_offset: false)
   resume
   else
   resume
@@ -36,7 +36,7 @@ module Karafka
 
   # no need to check for manual seek because AJ consumer is internal and
   # fully controlled by us
- seek(seek_offset, false) unless revoked?
+ seek(seek_offset, false, reset_offset: false) unless revoked?
 
   resume
   else
@@ -40,7 +40,7 @@ module Karafka
   mark_as_consumed(last_group_message) unless revoked?
   # no need to check for manual seek because AJ consumer is internal and
   # fully controlled by us
- seek(seek_offset, false) unless revoked?
+ seek(seek_offset, false, reset_offset: false) unless revoked?
 
   resume
   else