karafka 2.4.18 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. checksums.yaml +4 -4
  2. data/.github/CODEOWNERS +3 -0
  3. data/.github/workflows/ci.yml +59 -15
  4. data/.github/workflows/push.yml +35 -0
  5. data/.github/workflows/verify-action-pins.yml +16 -0
  6. data/.ruby-version +1 -1
  7. data/CHANGELOG.md +75 -0
  8. data/Gemfile +2 -2
  9. data/Gemfile.lock +72 -53
  10. data/LICENSE-COMM +2 -2
  11. data/README.md +1 -1
  12. data/Rakefile +4 -0
  13. data/bin/clean_kafka +43 -0
  14. data/bin/integrations +20 -6
  15. data/bin/rspecs +15 -3
  16. data/bin/verify_kafka_warnings +35 -0
  17. data/bin/verify_topics_naming +27 -0
  18. data/config/locales/errors.yml +5 -1
  19. data/config/locales/pro_errors.yml +13 -2
  20. data/docker-compose.yml +1 -1
  21. data/examples/payloads/avro/.gitkeep +0 -0
  22. data/examples/payloads/json/sample_set_01/enrollment_event.json +579 -0
  23. data/examples/payloads/json/sample_set_01/ingestion_event.json +30 -0
  24. data/examples/payloads/json/sample_set_01/transaction_event.json +17 -0
  25. data/examples/payloads/json/sample_set_01/user_event.json +11 -0
  26. data/karafka.gemspec +3 -8
  27. data/lib/karafka/active_job/current_attributes.rb +1 -1
  28. data/lib/karafka/active_job/job_extensions.rb +4 -1
  29. data/lib/karafka/admin/acl.rb +5 -1
  30. data/lib/karafka/admin/configs.rb +5 -1
  31. data/lib/karafka/admin.rb +89 -42
  32. data/lib/karafka/base_consumer.rb +17 -8
  33. data/lib/karafka/cli/base.rb +8 -2
  34. data/lib/karafka/cli/topics/align.rb +7 -4
  35. data/lib/karafka/cli/topics/base.rb +17 -0
  36. data/lib/karafka/cli/topics/create.rb +9 -7
  37. data/lib/karafka/cli/topics/delete.rb +4 -2
  38. data/lib/karafka/cli/topics/help.rb +39 -0
  39. data/lib/karafka/cli/topics/repartition.rb +4 -2
  40. data/lib/karafka/cli/topics.rb +10 -3
  41. data/lib/karafka/cli.rb +2 -0
  42. data/lib/karafka/connection/client.rb +39 -9
  43. data/lib/karafka/connection/listener.rb +24 -12
  44. data/lib/karafka/connection/messages_buffer.rb +1 -1
  45. data/lib/karafka/connection/proxy.rb +4 -1
  46. data/lib/karafka/constraints.rb +3 -3
  47. data/lib/karafka/contracts/base.rb +3 -2
  48. data/lib/karafka/contracts/config.rb +5 -1
  49. data/lib/karafka/contracts/topic.rb +1 -1
  50. data/lib/karafka/errors.rb +46 -2
  51. data/lib/karafka/helpers/async.rb +3 -1
  52. data/lib/karafka/helpers/interval_runner.rb +8 -0
  53. data/lib/karafka/instrumentation/callbacks/rebalance.rb +5 -1
  54. data/lib/karafka/instrumentation/logger_listener.rb +95 -32
  55. data/lib/karafka/instrumentation/proctitle_listener.rb +5 -1
  56. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +2 -2
  57. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +17 -2
  58. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +29 -6
  59. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +9 -0
  60. data/lib/karafka/messages/builders/batch_metadata.rb +1 -1
  61. data/lib/karafka/pro/cleaner.rb +8 -0
  62. data/lib/karafka/pro/cli/parallel_segments/base.rb +89 -0
  63. data/lib/karafka/pro/cli/parallel_segments/collapse.rb +164 -0
  64. data/lib/karafka/pro/cli/parallel_segments/distribute.rb +164 -0
  65. data/lib/karafka/pro/cli/parallel_segments.rb +60 -0
  66. data/lib/karafka/pro/connection/manager.rb +5 -8
  67. data/lib/karafka/pro/encryption.rb +12 -1
  68. data/lib/karafka/pro/instrumentation/performance_tracker.rb +1 -1
  69. data/lib/karafka/pro/iterator/expander.rb +5 -3
  70. data/lib/karafka/pro/iterator/tpl_builder.rb +23 -0
  71. data/lib/karafka/pro/loader.rb +10 -0
  72. data/lib/karafka/pro/processing/coordinator.rb +4 -1
  73. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +32 -3
  74. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +11 -0
  75. data/lib/karafka/pro/processing/filters/base.rb +10 -2
  76. data/lib/karafka/pro/processing/filters/expirer.rb +5 -0
  77. data/lib/karafka/pro/processing/filters/inline_insights_delayer.rb +2 -2
  78. data/lib/karafka/pro/processing/filters/virtual_limiter.rb +5 -0
  79. data/lib/karafka/pro/processing/parallel_segments/filters/base.rb +73 -0
  80. data/lib/karafka/pro/processing/parallel_segments/filters/default.rb +85 -0
  81. data/lib/karafka/pro/processing/parallel_segments/filters/mom.rb +66 -0
  82. data/lib/karafka/pro/processing/partitioner.rb +1 -13
  83. data/lib/karafka/pro/processing/piping/consumer.rb +13 -13
  84. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +1 -1
  85. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +1 -1
  86. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +1 -1
  87. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +1 -1
  88. data/lib/karafka/pro/processing/strategies/aj/ftr_lrj_mom_vp.rb +1 -1
  89. data/lib/karafka/pro/processing/strategies/aj/lrj_mom_vp.rb +1 -1
  90. data/lib/karafka/pro/processing/strategies/default.rb +36 -8
  91. data/lib/karafka/pro/processing/strategies/dlq/default.rb +15 -10
  92. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +1 -1
  93. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +1 -1
  94. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +3 -1
  95. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +1 -1
  96. data/lib/karafka/pro/processing/strategies/ftr/default.rb +1 -1
  97. data/lib/karafka/pro/processing/strategies/lrj/default.rb +4 -1
  98. data/lib/karafka/pro/processing/strategies/lrj/ftr.rb +1 -1
  99. data/lib/karafka/pro/processing/strategies/lrj/ftr_mom.rb +1 -1
  100. data/lib/karafka/pro/processing/strategies/lrj/mom.rb +1 -1
  101. data/lib/karafka/pro/processing/virtual_partitions/distributors/balanced.rb +50 -0
  102. data/lib/karafka/pro/processing/virtual_partitions/distributors/base.rb +29 -0
  103. data/lib/karafka/pro/processing/virtual_partitions/distributors/consistent.rb +27 -0
  104. data/lib/karafka/pro/recurring_tasks/contracts/config.rb +8 -4
  105. data/lib/karafka/pro/recurring_tasks/dispatcher.rb +3 -3
  106. data/lib/karafka/pro/recurring_tasks/setup/config.rb +7 -2
  107. data/lib/karafka/pro/recurring_tasks.rb +21 -2
  108. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +1 -1
  109. data/lib/karafka/pro/routing/features/multiplexing/config.rb +1 -0
  110. data/lib/karafka/pro/routing/features/multiplexing/contracts/topic.rb +17 -0
  111. data/lib/karafka/pro/routing/features/multiplexing/proxy.rb +5 -2
  112. data/lib/karafka/pro/routing/features/multiplexing/subscription_group.rb +8 -1
  113. data/lib/karafka/pro/routing/features/parallel_segments/builder.rb +47 -0
  114. data/lib/karafka/pro/routing/features/parallel_segments/config.rb +27 -0
  115. data/lib/karafka/pro/routing/features/parallel_segments/consumer_group.rb +83 -0
  116. data/lib/karafka/pro/routing/features/parallel_segments/contracts/consumer_group.rb +49 -0
  117. data/lib/karafka/pro/routing/features/parallel_segments/topic.rb +43 -0
  118. data/lib/karafka/pro/routing/features/parallel_segments.rb +24 -0
  119. data/lib/karafka/pro/routing/features/patterns/pattern.rb +1 -1
  120. data/lib/karafka/pro/routing/features/recurring_tasks/builder.rb +2 -2
  121. data/lib/karafka/pro/routing/features/scheduled_messages/builder.rb +10 -6
  122. data/lib/karafka/pro/routing/features/swarm/contracts/routing.rb +3 -2
  123. data/lib/karafka/pro/routing/features/swarm.rb +4 -1
  124. data/lib/karafka/pro/routing/features/virtual_partitions/config.rb +20 -2
  125. data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb +1 -0
  126. data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb +8 -2
  127. data/lib/karafka/pro/scheduled_messages/consumer.rb +61 -26
  128. data/lib/karafka/pro/scheduled_messages/daily_buffer.rb +9 -6
  129. data/lib/karafka/pro/scheduled_messages/deserializers/headers.rb +7 -1
  130. data/lib/karafka/pro/scheduled_messages/dispatcher.rb +2 -1
  131. data/lib/karafka/pro/scheduled_messages/max_epoch.rb +15 -6
  132. data/lib/karafka/pro/scheduled_messages/proxy.rb +15 -3
  133. data/lib/karafka/pro/scheduled_messages/serializer.rb +2 -4
  134. data/lib/karafka/pro/scheduled_messages/state.rb +20 -23
  135. data/lib/karafka/pro/scheduled_messages/tracker.rb +34 -8
  136. data/lib/karafka/pro/scheduled_messages.rb +17 -1
  137. data/lib/karafka/processing/coordinators_buffer.rb +1 -0
  138. data/lib/karafka/processing/strategies/default.rb +4 -4
  139. data/lib/karafka/routing/builder.rb +12 -3
  140. data/lib/karafka/routing/features/base/expander.rb +8 -2
  141. data/lib/karafka/routing/features/dead_letter_queue/contracts/topic.rb +1 -0
  142. data/lib/karafka/routing/subscription_group.rb +1 -1
  143. data/lib/karafka/runner.rb +7 -1
  144. data/lib/karafka/server.rb +21 -18
  145. data/lib/karafka/setup/attributes_map.rb +2 -0
  146. data/lib/karafka/setup/config.rb +40 -7
  147. data/lib/karafka/setup/defaults_injector.rb +26 -1
  148. data/lib/karafka/status.rb +6 -1
  149. data/lib/karafka/swarm/node.rb +31 -0
  150. data/lib/karafka/swarm/supervisor.rb +9 -2
  151. data/lib/karafka/templates/karafka.rb.erb +14 -1
  152. data/lib/karafka/version.rb +1 -1
  153. data/lib/karafka.rb +17 -9
  154. data/renovate.json +14 -2
  155. metadata +41 -40
  156. checksums.yaml.gz.sig +0 -0
  157. data/certs/cert.pem +0 -26
  158. data.tar.gz.sig +0 -0
  159. metadata.gz.sig +0 -0
data/lib/karafka/pro/routing/features/parallel_segments/config.rb
@@ -0,0 +1,27 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+   module Pro
+     module Routing
+       module Features
+         class ParallelSegments < Base
+           # Config for parallel segments.
+           # @note Used on the consumer level, not per topic
+           Config = Struct.new(
+             :active,
+             :count,
+             :partitioner,
+             :reducer,
+             :merge_key,
+             keyword_init: true
+           ) do
+             alias_method :active?, :active
+           end
+         end
+       end
+     end
+   end
+ end
data/lib/karafka/pro/routing/features/parallel_segments/consumer_group.rb
@@ -0,0 +1,83 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+   module Pro
+     module Routing
+       module Features
+         class ParallelSegments < Base
+           # Parallel segments are defined on the consumer group (since it creates many), thus we
+           # define them on the consumer group.
+           # This module adds extra methods needed there to make it work
+           module ConsumerGroup
+             # @return [Config] parallel segments config
+             def parallel_segments
+               # We initialize it as disabled if not configured by the user
+               public_send(:parallel_segments=, count: 1)
+             end
+
+             # Allows setting parallel segments configuration
+             #
+             # @param count [Integer] number of parallel segments (number of parallel consumer
+             #   groups that will be created)
+             # @param partitioner [nil, #call] nil or callable partitioner
+             # @param reducer [nil, #call] reducer for parallel key. It allows for using a custom
+             #   reducer to achieve enhanced parallelization when the default reducer is not enough.
+             # @param merge_key [String] key used to build the parallel segment consumer groups
+             #
+             # @note This method is an assignor but the API is actually via the `#parallel_segments`
+             #   method. Our `Routing::Proxy` normalizes that the way we want to have it exposed
+             #   for the end users.
+             def parallel_segments=(
+               count: 1,
+               partitioner: nil,
+               reducer: nil,
+               merge_key: '-parallel-'
+             )
+               @parallel_segments ||= Config.new(
+                 active: count > 1,
+                 count: count,
+                 partitioner: partitioner,
+                 reducer: reducer || ->(parallel_key) { parallel_key.to_s.sum % count },
+                 merge_key: merge_key
+               )
+             end
+
+             # @return [Boolean] are parallel segments active
+             def parallel_segments?
+               parallel_segments.active?
+             end
+
+             # @return [Integer] id of the segment (0 or bigger) or -1 if parallel segments are not
+             #   active
+             def segment_id
+               return @segment_id if @segment_id
+
+               @segment_id = if parallel_segments?
+                 name.split(parallel_segments.merge_key).last.to_i
+               else
+                 -1
+               end
+             end
+
+             # @return [String] original segment consumer group name
+             def segment_origin
+               name.split(parallel_segments.merge_key).first
+             end
+
+             # @return [Hash] consumer group setup with the parallel segments definition in it
+             def to_h
+               super.merge(
+                 parallel_segments: parallel_segments.to_h.merge(
+                   segment_id: segment_id
+                 )
+               ).freeze
+             end
+           end
+         end
+       end
+     end
+   end
+ end
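To make the segment naming and reduction mechanics above concrete, here is a standalone sketch that replays the logic from this hunk with the default `merge_key` and reducer. The group name `analytics` and the key `user-42` are made up for illustration:

```ruby
# Defaults taken from the diff above; the group name and keys are hypothetical
count = 3
merge_key = '-parallel-'
reducer = ->(parallel_key) { parallel_key.to_s.sum % count }

# Segment consumer group names combine the origin group name with the merge key
names = Array.new(count) { |i| "analytics#{merge_key}#{i}" }
# => ["analytics-parallel-0", "analytics-parallel-1", "analytics-parallel-2"]

# #segment_id and #segment_origin recover both parts from a segment name
name = names.last
name.split(merge_key).last.to_i # => 2 (segment_id)
name.split(merge_key).first     # => "analytics" (segment_origin)

# The reducer maps a parallel key onto one of the segments
reducer.call('user-42') # => 0, since 'user-42'.sum == 594 and 594 % 3 == 0
```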
data/lib/karafka/pro/routing/features/parallel_segments/contracts/consumer_group.rb
@@ -0,0 +1,49 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+   module Pro
+     module Routing
+       module Features
+         class ParallelSegments < Base
+           # Namespace for parallel segments contracts
+           module Contracts
+             # Contract to validate configuration of the parallel segments feature
+             class ConsumerGroup < Karafka::Contracts::Base
+               configure do |config|
+                 config.error_messages = YAML.safe_load(
+                   File.read(
+                     File.join(Karafka.gem_root, 'config', 'locales', 'pro_errors.yml')
+                   )
+                 ).fetch('en').fetch('validations').fetch('consumer_group')
+               end
+
+               nested(:parallel_segments) do
+                 required(:active) { |val| [true, false].include?(val) }
+                 required(:partitioner) { |val| val.nil? || val.respond_to?(:call) }
+                 required(:reducer) { |val| val.respond_to?(:call) }
+                 required(:count) { |val| val.is_a?(Integer) && val >= 1 }
+                 required(:merge_key) { |val| val.is_a?(String) && val.size >= 1 }
+               end
+
+               # When parallel segments are defined, partitioner needs to respond to `#call` and
+               # it cannot be nil
+               virtual do |data, errors|
+                 next unless errors.empty?
+
+                 parallel_segments = data[:parallel_segments]
+
+                 next unless parallel_segments[:active]
+                 next if parallel_segments[:partitioner].respond_to?(:call)
+
+                 [[%i[parallel_segments partitioner], :respond_to_call]]
+               end
+             end
+           end
+         end
+       end
+     end
+   end
+ end
data/lib/karafka/pro/routing/features/parallel_segments/topic.rb
@@ -0,0 +1,43 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+   module Pro
+     module Routing
+       module Features
+         class ParallelSegments < Base
+           # Parallel segments related expansions to the topic building flow
+           module Topic
+             # Injects the parallel segments filter as the first filter during building of each of
+             # the topics in case parallel segments are enabled.
+             #
+             # @param args [Object] anything accepted by the topic initializer
+             def initialize(*args)
+               super
+
+               return unless consumer_group.parallel_segments?
+
+               builder = lambda do |topic, _partition|
+                 mom = topic.manual_offset_management?
+
+                 # We have two filters for mom and non-mom scenario not to mix this logic
+                 filter_scope = Karafka::Pro::Processing::ParallelSegments::Filters
+                 filter_class = mom ? filter_scope::Mom : filter_scope::Default
+
+                 filter_class.new(
+                   segment_id: consumer_group.segment_id,
+                   partitioner: consumer_group.parallel_segments.partitioner,
+                   reducer: consumer_group.parallel_segments.reducer
+                 )
+               end
+
+               filter(builder)
+             end
+           end
+         end
+       end
+     end
+   end
+ end
data/lib/karafka/pro/routing/features/parallel_segments.rb
@@ -0,0 +1,24 @@
+ # frozen_string_literal: true
+
+ # This code is part of Karafka Pro, a commercial component not licensed under LGPL.
+ # See LICENSE for details.
+
+ module Karafka
+   module Pro
+     module Routing
+       module Features
+         # Feature that allows parallelizing message processing within a single consumer group by
+         # creating multiple consumer group instances. It enables processing messages from each
+         # partition in parallel by distributing them to separate consumer group instances based on
+         # a partitioning key. Useful for both CPU and IO bound operations.
+         #
+         # Each parallel segment operates as an independent consumer group instance, processing
+         # messages that are assigned to it based on the configured partitioner and reducer.
+         # This allows for better resource utilization and increased processing throughput without
+         # requiring changes to the topic's partition count.
+         class ParallelSegments < Base
+         end
+       end
+     end
+   end
+ end
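Taken together, these new files wire parallel segments into the routing DSL at the consumer group level (exposed through `Routing::Proxy`, per the assignor note above). A minimal, hypothetical routing sketch; the group, topic, and consumer names are illustrative:

```ruby
class KarafkaApp < Karafka::App
  routes.draw do
    consumer_group :events do
      # Create 4 parallel consumer group instances for this group. The
      # partitioner extracts the parallel key from each message; the default
      # reducer then maps that key onto one of the segments
      parallel_segments(
        count: 4,
        partitioner: ->(message) { message.key }
      )

      topic :user_events do
        consumer UserEventsConsumer
      end
    end
  end
end
```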
data/lib/karafka/pro/routing/features/patterns/pattern.rb
@@ -45,7 +45,7 @@ module Karafka
        # topic but this minimizes simple mistakes
        #
        # This sub-part of sh1 should be unique enough and short-enough to use it here
-       digest = Digest::SHA1.hexdigest(safe_regexp.source)[8..16]
+       digest = Digest::SHA256.hexdigest(safe_regexp.source)[8..16]
        @name = name ? name.to_s : "karafka-pattern-#{digest}"
        @config = config
      end
data/lib/karafka/pro/routing/features/recurring_tasks/builder.rb
@@ -29,7 +29,7 @@ module Karafka
      consumer_group tasks_cfg.group_id do
        # Registers the primary topic that we use to control schedules execution. This is
        # the one that we use to trigger recurring tasks.
-       schedules_topic = topic(topics_cfg.schedules) do
+       schedules_topic = topic(topics_cfg.schedules.name) do
          consumer tasks_cfg.consumer_class
          deserializer tasks_cfg.deserializer
          # Because the topic method name as well as builder proxy method name is the same
@@ -83,7 +83,7 @@ module Karafka

        # This topic is to store logs that we can then inspect either from the admin or via
        # the Web UI
-       logs_topic = topic(topics_cfg.logs) do
+       logs_topic = topic(topics_cfg.logs.name) do
          active(false)
          deserializer tasks_cfg.deserializer
          target.recurring_tasks(true)
data/lib/karafka/pro/routing/features/scheduled_messages/builder.rb
@@ -12,14 +12,14 @@ module Karafka
      module Builder
        # Enabled scheduled messages operations and adds needed topics and other stuff.
        #
-       # @param group_name [String, false] name for scheduled messages topic that is also used
+       # @param topic_name [String, false] name for scheduled messages topic that is also used
        #   as a group identifier. Users can have multiple schedule topics flows to prevent key
        #   collisions, prioritize and do other stuff. `false` if not active.
        # @param block [Proc] optional reconfiguration of the topics definitions.
        # @note Namespace for topics should include the divider as it is not automatically
        #   added.
-       def scheduled_messages(group_name = false, &block)
-         return unless group_name
+       def scheduled_messages(topic_name = false, &block)
+         return unless topic_name

          # Load zlib only if user enables scheduled messages
          require 'zlib'
@@ -32,7 +32,7 @@ module Karafka
        consumer_group msg_cfg.group_id do
          # Registers the primary topic that we use to control schedules execution. This is
          # the one that we use to trigger scheduled messages.
-         messages_topic = topic(group_name) do
+         messages_topic = topic(topic_name) do
            instance_eval(&block) if block && block.arity.zero?

            consumer msg_cfg.consumer_class
@@ -54,7 +54,11 @@ module Karafka
            consumer_persistence(true)

            # This needs to be enabled for the eof to work correctly
-           kafka('enable.partition.eof': true, inherit: true)
+           kafka(
+             'enable.partition.eof': true,
+             'auto.offset.reset': 'earliest',
+             inherit: true
+           )
            eofed(true)

            # Since this is a topic that gets replayed because of schedule management, we do
@@ -96,7 +100,7 @@
          # Holds states of scheduler per each of the partitions since they tick
          # independently. We only hold future statistics not to have to deal with
          # any type of state restoration
-         states_topic = topic("#{group_name}#{msg_cfg.states_postfix}") do
+         states_topic = topic("#{topic_name}#{msg_cfg.states_postfix}") do
            active(false)
            target.scheduled_messages(true)
            config(
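For reference, a usage sketch of this builder after the `group_name` → `topic_name` rename. The topic name is illustrative and the sketch assumes the standard `routes.draw` entry point:

```ruby
KarafkaApp.routes.draw do
  # topic_name names the scheduled messages topic and doubles as the group
  # identifier; a states topic with the configured postfix is defined alongside
  scheduled_messages('orders_schedules')
end
```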
data/lib/karafka/pro/routing/features/swarm/contracts/routing.rb
@@ -28,7 +28,8 @@ module Karafka
      # Validates that each node has at least one assignment.
      #
      # @param builder [Karafka::Routing::Builder]
-     def validate!(builder)
+     # @param scope [Array<String>]
+     def validate!(builder, scope: [])
        nodes_setup = Hash.new do |h, node_id|
          h[node_id] = { active: false, node_id: node_id }
        end
@@ -49,7 +50,7 @@
        end

        nodes_setup.each_value do |details|
-         super(details)
+         super(details, scope: scope)
        end
      end

data/lib/karafka/pro/routing/features/swarm.rb
@@ -17,7 +17,10 @@
      # @param config [Karafka::Core::Configurable::Node] app config
      def post_setup(config)
        config.monitor.subscribe('app.before_warmup') do
-         Contracts::Routing.new.validate!(config.internal.routing.builder)
+         Contracts::Routing.new.validate!(
+           config.internal.routing.builder,
+           scope: %w[swarm]
+         )
        end
      end
    end
data/lib/karafka/pro/routing/features/virtual_partitions/config.rb
@@ -8,15 +8,33 @@
    module Routing
      module Features
        class VirtualPartitions < Base
-         # Config for virtual partitions
+         # Configuration for virtual partitions feature
          Config = Struct.new(
            :active,
            :partitioner,
            :max_partitions,
            :offset_metadata_strategy,
            :reducer,
+           :distribution,
            keyword_init: true
-         ) { alias_method :active?, :active }
+         ) do
+           # @return [Boolean] is this feature active
+           def active?
+             active
+           end
+
+           # @return [Object] distributor instance for the current distribution
+           def distributor
+             @distributor ||= case distribution
+                              when :balanced
+                                Processing::VirtualPartitions::Distributors::Balanced.new(self)
+                              when :consistent
+                                Processing::VirtualPartitions::Distributors::Consistent.new(self)
+                              else
+                                raise Karafka::Errors::UnsupportedCaseError, distribution
+                              end
+           end
+         end
        end
      end
    end
data/lib/karafka/pro/routing/features/virtual_partitions/contracts/topic.rb
@@ -26,6 +26,7 @@
          required(:reducer) { |val| val.respond_to?(:call) }
          required(:max_partitions) { |val| val.is_a?(Integer) && val >= 1 }
          required(:offset_metadata_strategy) { |val| %i[exact current].include?(val) }
+         required(:distribution) { |val| %i[consistent balanced].include?(val) }
        end

        # When virtual partitions are defined, partitioner needs to respond to `#call` and it
data/lib/karafka/pro/routing/features/virtual_partitions/topic.rb
@@ -20,13 +20,18 @@
        #   the most recently reported metadata
        # @param reducer [nil, #call] reducer for VPs key. It allows for using a custom
        #   reducer to achieve enhanced parallelization when the default reducer is not enough.
+       # @param distribution [Symbol] the strategy to use for virtual partitioning. Can be
+       #   either `:consistent` or `:balanced`. The `:balanced` strategy ensures balanced
+       #   distribution of work across available workers while maintaining message order
+       #   within groups.
        # @return [VirtualPartitions] method that allows to set the virtual partitions details
        #   during the routing configuration and then allows to retrieve it
        def virtual_partitions(
          max_partitions: Karafka::App.config.concurrency,
          partitioner: nil,
          offset_metadata_strategy: :current,
-         reducer: nil
+         reducer: nil,
+         distribution: :consistent
        )
          @virtual_partitions ||= Config.new(
            active: !partitioner.nil?,
@@ -35,7 +40,8 @@
            offset_metadata_strategy: offset_metadata_strategy,
            # If no reducer provided, we use this one. It just runs a modulo on the sum of
            # a stringified version, providing fairly good distribution.
-           reducer: reducer || ->(virtual_key) { virtual_key.to_s.sum % max_partitions }
+           reducer: reducer || ->(virtual_key) { virtual_key.to_s.sum % max_partitions },
+           distribution: distribution
          )
        end

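A short example of the new option, following the signature above; the topic, consumer, and partitioner are illustrative:

```ruby
class KarafkaApp < Karafka::App
  routes.draw do
    topic :orders_states do
      consumer OrdersStatesConsumer

      virtual_partitions(
        partitioner: ->(message) { message.headers['order_id'] },
        max_partitions: 5,
        # New in 2.5.0: :consistent keeps the previous behavior (and remains
        # the default); :balanced spreads work more evenly across workers
        # while preserving ordering within each key group
        distribution: :balanced
      )
    end
  end
end
```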
data/lib/karafka/pro/scheduled_messages/consumer.rb
@@ -8,13 +8,27 @@
    module ScheduledMessages
      # Consumer that coordinates scheduling of messages when the time comes
      class Consumer < ::Karafka::BaseConsumer
+       include Helpers::ConfigImporter.new(
+         dispatcher_class: %i[scheduled_messages dispatcher_class]
+       )
+
+       # In case there is an extremely high turnover of messages, EOF may never kick in,
+       # effectively not changing status from loading to loaded. We use the time consumer instance
+       # was created + a buffer time to detect such a case (loading + messages from the time it
+       # was already running) to switch the state despite no EOF
+       # This is in seconds
+       GRACE_PERIOD = 15
+
+       private_constant :GRACE_PERIOD
+
        # Prepares the initial state of all stateful components
        def initialized
          clear!
          # Max epoch is always moving forward with the time. Never backwards, hence we do not
          # reset it at all.
          @max_epoch = MaxEpoch.new
-         @state = State.new(nil)
+         @state = State.new
+         @reloads = 0
        end

        # Processes messages and runs dispatch (via tick) if needed
@@ -23,11 +37,25 @@

          messages.each do |message|
            SchemaValidator.call(message)
+
+           # We always track offsets of messages, even if they would be later on skipped or
+           # ignored for any reason. That way we have debug info that is useful once in a while.
+           @tracker.offsets(message)
+
            process_message(message)
          end

          @states_reporter.call

+         recent_timestamp = messages.last.timestamp.to_i
+         post_started_timestamp = @tracker.started_at + GRACE_PERIOD
+
+         # If we started getting messages that are beyond the current time, it means we have
+         # loaded enough to start scheduling. The upcoming messages are from the future looking
+         # from perspective of the current consumer start. We add a bit of grace period not to
+         # deal with edge cases
+         loaded! if @state.loading? && recent_timestamp > post_started_timestamp
+
          eofed if eofed?

          # Unless given day data is fully loaded we should not dispatch any notifications nor
@@ -51,8 +79,7 @@
          return if reload!

          # If end of the partition is reached, it always means all data is loaded
-         @state.loaded!
-         @states_reporter.call
+         loaded!
        end

        # Performs periodic operations when no new data is provided to the topic partition
@@ -64,7 +91,6 @@
          return unless @state.loaded?

          keys = []
-         epochs = []

          # We first collect all the data for dispatch and then dispatch and **only** after
          # dispatch that is sync is successful we remove those messages from the daily buffer
@@ -72,35 +98,30 @@
          # with timeouts, etc, we need to be sure it wen through prior to deleting those messages
          # from the daily buffer. That way we ensure the at least once delivery and in case of
          # a transactional producer, exactly once delivery.
-         @daily_buffer.for_dispatch do |epoch, message|
-           epochs << epoch
+         @daily_buffer.for_dispatch do |message|
            keys << message.key
            @dispatcher << message
          end

          @dispatcher.flush

-         @max_epoch.update(epochs.max)
-
          keys.each { |key| @daily_buffer.delete(key) }

          @states_reporter.call
        end

+       # Move the state to shutdown and publish immediately
+       def shutdown
+         @state.stopped!
+         @states_reporter.call!
+       end
+
        private

        # Takes each message and adds it to the daily accumulator if needed or performs other
        # accumulator and time related per-message operations.
        # @param message [Karafka::Messages::Message]
        def process_message(message)
-         # If we started to receive messages younger than the moment we created the consumer for
-         # the given day, it means we have loaded all the history and we are no longer in the
-         # loading phase.
-         if message.timestamp.to_i > @today.created_at
-           @state.loaded!
-           tags.add(:state, @state.to_s)
-         end
-
          # If this is a schedule message we need to check if this is for today. Tombstone events
          # are always considered immediate as they indicate, that a message with a given key
          # was already dispatched or that user decided not to dispatch and cancelled the dispatch
@@ -109,7 +130,7 @@
          time = message.headers['schedule_target_epoch']

          # Do not track historical below today as those will be reflected in the daily buffer
-         @tracker.track(message) if time >= @today.starts_at
+         @tracker.future(message) if time >= @today.starts_at

          if time > @today.ends_at || time < @max_epoch.to_i
            # Clean the message immediately when not needed (won't be scheduled) to preserve
@@ -120,6 +141,14 @@
            end
          end

+         # Tombstone events are only published after we have dispatched given message. This means
+         # that we've got that far in the dispatching time. This allows us (with a certain buffer)
+         # to quickly reject older messages (older in sense of being scheduled for previous times)
+         # instead of loading them into memory until they are expired
+         if message.headers['schedule_source_type'] == 'tombstone'
+           @max_epoch.update(message.headers['schedule_target_epoch'])
+         end
+
          # Add to buffer all tombstones and messages for the same day
          @daily_buffer << message
        end
@@ -129,7 +158,8 @@
          # If this is a new assignment we always need to seek from beginning to load the data
          if @state.fresh?
            clear!
-           seek(0)
+           @reloads += 1
+           seek(:earliest)

            return true
          end
@@ -140,7 +170,8 @@
          # If day has ended we reload and start new day with new schedules
          if @today.ended?
            clear!
-           seek(0)
+           @reloads += 1
+           seek(:earliest)

            return true
          end
@@ -148,6 +179,13 @@
          false
        end

+       # Moves the state to loaded and publishes the state update
+       def loaded!
+         @state.loaded!
+         tags.add(:state, @state.to_s)
+         @states_reporter.call!
+       end
+
        # Resets all buffers and states so we can start a new day with a clean slate
        # We can fully recreate the dispatcher because any undispatched messages will be dispatched
        # with the new day dispatcher after it is reloaded.
@@ -155,22 +193,19 @@
          @daily_buffer = DailyBuffer.new
          @today = Day.new
          @tracker = Tracker.new
-         @state = State.new(false)
-         @dispatcher = config.dispatcher_class.new(topic.name, partition)
+         @state = State.new
+         @state.loading!
+         @dispatcher = dispatcher_class.new(topic.name, partition)
          @states_reporter = Helpers::IntervalRunner.new do
            @tracker.today = @daily_buffer.size
            @tracker.state = @state.to_s
+           @tracker.reloads = @reloads

            @dispatcher.state(@tracker)
          end

          tags.add(:state, @state.to_s)
        end
-
-       # @return [Karafka::Core::Configurable::Node] Schedules config node
-       def config
-         @config ||= Karafka::App.config.scheduled_messages
-       end
      end
    end
data/lib/karafka/pro/scheduled_messages/daily_buffer.rb
@@ -45,19 +45,22 @@ module Karafka

      # Yields messages that should be dispatched (sent) to Kafka
      #
-     # @yieldparam [Integer, Karafka::Messages::Message] epoch of the message and the message
-     #   itself
-     #
-     # @note We yield epoch alongside of the message so we do not have to extract it several
-     #   times later on. This simplifies the API
+     # @yieldparam [Karafka::Messages::Message] messages to be dispatched sorted from the once
+     #   that are the oldest (lowest epoch)
      def for_dispatch
        dispatch = Time.now.to_i

+       selected = []
+
        @accu.each_value do |epoch, message|
          next unless epoch <= dispatch

-         yield(epoch, message)
+         selected << [epoch, message]
        end
+
+       selected
+         .sort_by!(&:first)
+         .each { |_, message| yield(message) }
      end

      # Removes given key from the accumulator
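A standalone toy illustration of the new behavior: due entries are first collected, then yielded oldest-first by epoch. The epochs and payloads are fabricated:

```ruby
# Hash of key => [epoch, message], mirroring the accumulator shape above
accu = {
  'a' => [1_700_000_300, 'msg-a'],
  'b' => [1_700_000_100, 'msg-b'],
  'c' => [1_700_000_200, 'msg-c']
}

dispatch = 1_700_000_250

selected = accu.each_value.select { |epoch, _| epoch <= dispatch }
selected.sort_by!(&:first)
selected.each { |_, message| puts message }
# => msg-b (epoch ...100), then msg-c (epoch ...200); msg-a is not yet due
```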
data/lib/karafka/pro/scheduled_messages/deserializers/headers.rb
@@ -10,6 +10,12 @@ module Karafka
      module Deserializers
        # Converts certain pieces of headers into their integer form for messages
        class Headers
+         # We only directly operate on epoch and other details for schedules and tombstones.
+         # cancel requests don't have to be deserialized that way since they don't have epoch
+         WORKABLE_TYPES = %w[schedule tombstone].freeze
+
+         private_constant :WORKABLE_TYPES
+
          # @param metadata [Karafka::Messages::Metadata]
          # @return [Hash] headers
          def call(metadata)
@@ -19,7 +25,7 @@

          # tombstone and cancellation events are not operable, thus we do not have to cast any
          # of the headers pieces
-         return raw_headers unless type == 'schedule'
+         return raw_headers unless WORKABLE_TYPES.include?(type)
          headers = raw_headers.dup
          headers['schedule_target_epoch'] = headers['schedule_target_epoch'].to_i