karafka 2.3.1 → 2.3.3

Files changed (77)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +20 -0
  5. data/Gemfile.lock +6 -6
  6. data/README.md +2 -2
  7. data/bin/integrations +2 -1
  8. data/bin/rspecs +6 -2
  9. data/config/locales/errors.yml +33 -8
  10. data/config/locales/pro_errors.yml +6 -0
  11. data/docker-compose.yml +1 -1
  12. data/lib/karafka/app.rb +14 -0
  13. data/lib/karafka/cli/base.rb +19 -0
  14. data/lib/karafka/cli/server.rb +62 -76
  15. data/lib/karafka/cli/swarm.rb +30 -0
  16. data/lib/karafka/connection/client.rb +7 -0
  17. data/lib/karafka/constraints.rb +3 -3
  18. data/lib/karafka/contracts/config.rb +41 -0
  19. data/lib/karafka/errors.rb +12 -0
  20. data/lib/karafka/helpers/config_importer.rb +30 -0
  21. data/lib/karafka/instrumentation/logger_listener.rb +31 -0
  22. data/lib/karafka/instrumentation/notifications.rb +9 -0
  23. data/lib/karafka/instrumentation/vendors/datadog/logger_listener.rb +2 -0
  24. data/lib/karafka/instrumentation/vendors/datadog/metrics_listener.rb +34 -4
  25. data/lib/karafka/instrumentation/vendors/kubernetes/base_listener.rb +72 -0
  26. data/lib/karafka/instrumentation/vendors/kubernetes/liveness_listener.rb +11 -40
  27. data/lib/karafka/instrumentation/vendors/kubernetes/swarm_liveness_listener.rb +54 -0
  28. data/lib/karafka/pro/active_job/job_options_contract.rb +1 -1
  29. data/lib/karafka/pro/base_consumer.rb +16 -0
  30. data/lib/karafka/pro/connection/manager.rb +6 -1
  31. data/lib/karafka/pro/processing/coordinator.rb +13 -3
  32. data/lib/karafka/pro/processing/coordinators/errors_tracker.rb +74 -0
  33. data/lib/karafka/pro/processing/coordinators/filters_applier.rb +107 -0
  34. data/lib/karafka/pro/processing/coordinators/virtual_offset_manager.rb +180 -0
  35. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom.rb +5 -7
  36. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_lrj_mom_vp.rb +5 -7
  37. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom.rb +8 -10
  38. data/lib/karafka/pro/processing/strategies/aj/dlq_ftr_mom_vp.rb +8 -16
  39. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom.rb +5 -7
  40. data/lib/karafka/pro/processing/strategies/aj/dlq_lrj_mom_vp.rb +5 -7
  41. data/lib/karafka/pro/processing/strategies/aj/dlq_mom.rb +8 -10
  42. data/lib/karafka/pro/processing/strategies/aj/dlq_mom_vp.rb +7 -9
  43. data/lib/karafka/pro/processing/strategies/dlq/default.rb +36 -10
  44. data/lib/karafka/pro/processing/strategies/dlq/ftr.rb +3 -7
  45. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj.rb +4 -8
  46. data/lib/karafka/pro/processing/strategies/dlq/ftr_lrj_mom.rb +6 -9
  47. data/lib/karafka/pro/processing/strategies/dlq/ftr_mom.rb +5 -15
  48. data/lib/karafka/pro/processing/strategies/dlq/lrj.rb +4 -8
  49. data/lib/karafka/pro/processing/strategies/dlq/lrj_mom.rb +6 -9
  50. data/lib/karafka/pro/processing/strategies/dlq/mom.rb +10 -20
  51. data/lib/karafka/pro/processing/strategies/vp/default.rb +7 -0
  52. data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb +6 -0
  53. data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb +39 -0
  54. data/lib/karafka/pro/routing/features/swarm/config.rb +31 -0
  55. data/lib/karafka/pro/routing/features/swarm/contracts/topic.rb +67 -0
  56. data/lib/karafka/pro/routing/features/swarm/topic.rb +54 -0
  57. data/lib/karafka/pro/routing/features/swarm.rb +25 -0
  58. data/lib/karafka/pro/swarm/liveness_listener.rb +171 -0
  59. data/lib/karafka/process.rb +27 -1
  60. data/lib/karafka/routing/features/dead_letter_queue/config.rb +2 -0
  61. data/lib/karafka/routing/subscription_group.rb +44 -9
  62. data/lib/karafka/server.rb +11 -13
  63. data/lib/karafka/setup/config.rb +41 -2
  64. data/lib/karafka/status.rb +4 -2
  65. data/lib/karafka/swarm/liveness_listener.rb +55 -0
  66. data/lib/karafka/swarm/manager.rb +229 -0
  67. data/lib/karafka/swarm/node.rb +179 -0
  68. data/lib/karafka/swarm/pidfd.rb +147 -0
  69. data/lib/karafka/swarm/supervisor.rb +187 -0
  70. data/lib/karafka/swarm.rb +27 -0
  71. data/lib/karafka/version.rb +1 -1
  72. data/lib/karafka.rb +1 -1
  73. data.tar.gz.sig +0 -0
  74. metadata +21 -4
  75. metadata.gz.sig +0 -0
  76. data/lib/karafka/pro/processing/filters_applier.rb +0 -105
  77. data/lib/karafka/pro/processing/virtual_offset_manager.rb +0 -177
data/lib/karafka/pro/routing/features/dead_letter_queue/contracts/topic.rb
@@ -28,6 +28,12 @@ module Karafka
               ).fetch('en').fetch('validations').fetch('topic')
             end
 
+            nested(:dead_letter_queue) do
+              # We use a strategy based DLQ for every case in Pro
+              # By default (when no strategy is given) a `max_retries` based strategy is used
+              required(:strategy) { |val| val.respond_to?(:call) }
+            end
+
             # Make sure that when we use virtual partitions with DLQ, at least one retry is set.
             # We cannot use VP with DLQ without retries because, in order to provide ordering
             # warranties on errors with VP, we need to collapse the VPs concurrency and retry
data/lib/karafka/pro/routing/features/dead_letter_queue/topic.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Routing
+      module Features
+        class DeadLetterQueue < Base
+          # Expansions to the topic API in DLQ
+          module Topic
+            # @param strategy [#call, nil] strategy we want to use or nil if a default strategy
+            #   (same as in OSS) should be applied
+            # @param args [Hash] OSS DLQ arguments
+            def dead_letter_queue(strategy: nil, **args)
+              return @dead_letter_queue if @dead_letter_queue
+
+              super(**args).tap do |config|
+                # If an explicit strategy is not provided, use the default approach from OSS
+                config.strategy = strategy || lambda do |_errors_tracker, attempt|
+                  attempt > config.max_retries ? :dispatch : :retry
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
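
A minimal sketch of how this strategy hook could be used from the routing DSL. The topic, consumer and error class below are hypothetical, and it assumes the new errors tracker exposes the most recent error via `#last`; the contract above only guarantees that the strategy must respond to `#call`, receives the errors tracker and the attempt count, and returns `:retry` or `:dispatch`:

class KarafkaApp < Karafka::App
  routes.draw do
    topic :orders_states do
      consumer OrdersStatesConsumer

      dead_letter_queue(
        topic: 'dead_messages',
        max_retries: 2,
        # Retry timeouts a few times; dispatch anything else to the DLQ at once
        strategy: lambda do |errors_tracker, attempt|
          if errors_tracker.last.is_a?(Timeout::Error) && attempt <= 4
            :retry
          else
            :dispatch
          end
        end
      )
    end
  end
end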
data/lib/karafka/pro/routing/features/swarm/config.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Routing
+      module Features
+        class Swarm < Base
+          # Swarm feature configuration
+          Config = Struct.new(
+            :active,
+            :nodes,
+            keyword_init: true
+          ) do
+            alias_method :active?, :active
+          end
+        end
+      end
+    end
+  end
+end
data/lib/karafka/pro/routing/features/swarm/contracts/topic.rb
@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Routing
+      module Features
+        class Swarm < Base
+          # Namespace for swarm contracts
+          module Contracts
+            # Contract to validate configuration of the swarm feature
+            class Topic < Karafka::Contracts::Base
+              configure do |config|
+                config.error_messages = YAML.safe_load(
+                  File.read(
+                    File.join(Karafka.gem_root, 'config', 'locales', 'pro_errors.yml')
+                  )
+                ).fetch('en').fetch('validations').fetch('topic')
+              end
+
+              nested(:swarm) do
+                required(:active) { |val| val == true }
+
+                required(:nodes) do |val|
+                  val.is_a?(Range) || (
+                    val.is_a?(Array) &&
+                    val.all? { |id| id.is_a?(Integer) }
+                  )
+                end
+              end
+
+              # Make sure that if a range is defined, it fits the number of nodes (except
+              # infinity), as it may be a common error to accidentally define a node that will
+              # never be reached
+              virtual do |data, errors|
+                next unless errors.empty?
+
+                nodes = data[:swarm][:nodes]
+
+                # Defaults
+                next if nodes.first.zero? && nodes.last == Float::INFINITY
+
+                # If our expectation towards which node should run things matches at least one
+                # node, then it's ok
+                next if Karafka::App.config.swarm.nodes.times.any? do |node_id|
+                  nodes.include?(node_id)
+                end
+
+                [[%i[swarm_nodes], :with_non_existent_nodes]]
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
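
For instance, with the default `config.swarm.nodes = 3` the valid node ids are 0, 1 and 2, so a hypothetical routing definition of `swarm(nodes: [3])` would fail this contract with `:with_non_existent_nodes` (node 3 never exists), while `swarm(nodes: 0..1)` would pass.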
data/lib/karafka/pro/routing/features/swarm/topic.rb
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Routing
+      module Features
+        class Swarm < Base
+          # Topic swarm API extensions
+          module Topic
+            # Allows defining swarm routing topic settings
+            # @param nodes [Range, Array] range of node ids or array with node ids for which we
+            #   should run the given topic
+            def swarm(nodes: (0...Karafka::App.config.swarm.nodes))
+              @swarm ||= Config.new(active: true, nodes: nodes)
+            end
+
+            # @return [true] swarm setup is always active. It may not limit anything, but it is
+            #   always defined
+            def swarm?
+              swarm.active?
+            end
+
+            # @return [Boolean] should this topic be active. In the context of swarm it is only
+            #   active when the swarm routing setup includes the current node
+            def active?
+              node = Karafka::App.config.swarm.node
+
+              return super unless node
+
+              super && swarm.nodes.include?(node.id)
+            end
+
+            # @return [Hash] topic with all its native configuration options plus swarm
+            def to_h
+              super.merge(
+                swarm: swarm.to_h
+              ).freeze
+            end
+          end
+        end
+      end
+    end
+  end
+end
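
A short sketch of how these extensions could look in an application's routing; the topic and consumer names are hypothetical:

class KarafkaApp < Karafka::App
  routes.draw do
    topic :system_events do
      consumer SystemEventsConsumer
      # Consume this topic only on swarm nodes 0 and 1; the remaining nodes
      # treat it as inactive via the #active? override above
      swarm(nodes: [0, 1])
    end
  end
end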
data/lib/karafka/pro/routing/features/swarm.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    module Routing
+      module Features
+        # Karafka Pro Swarm extensions to the routing
+        # They allow for more granular work assignment in the swarm
+        class Swarm < Base
+        end
+      end
+    end
+  end
+end
data/lib/karafka/pro/swarm/liveness_listener.rb
@@ -0,0 +1,171 @@
+# frozen_string_literal: true
+
+# This Karafka component is a Pro component under a commercial license.
+# This Karafka component is NOT licensed under LGPL.
+#
+# All of the commercial components are present in the lib/karafka/pro directory of this
+# repository and their usage requires commercial license agreement.
+#
+# Karafka has also commercial-friendly license, commercial support and commercial components.
+#
+# By sending a pull request to the pro components, you are agreeing to transfer the copyright of
+# your code to Maciej Mensfeld.
+
+module Karafka
+  module Pro
+    # Pro Swarm components namespace
+    module Swarm
+      # Pro listener that monitors RSS usage and other heartbeat metrics (if configured) to
+      # ensure that everything operates as expected.
+      #
+      # It can:
+      # - monitor poll frequency to make sure polling happens often enough
+      # - monitor consumption to make sure we do not process data for too long
+      # - monitor RSS to make sure that we do not use too much memory
+      #
+      # By default it does **not** monitor memory, and consuming and polling are configured in
+      # such a way as to align with `max.poll.interval.ms` and other defaults.
+      #
+      # Failure statuses reported are as follows:
+      # - 1 - polling ttl exceeded
+      # - 2 - consuming ttl exceeded
+      # - 3 - memory limit exceeded
+      #
+      # @note This listener should not break anything if subscribed in the supervisor prior to
+      #   forking as it relies on server events for operations.
+      class LivenessListener < Karafka::Swarm::LivenessListener
+        # @param memory_limit [Integer] max memory in MB for this process to be considered
+        #   healthy
+        # @param consuming_ttl [Integer] time in ms after which we consider consumption hanging.
+        #   It allows us to define the max consumption time after which the supervisor should
+        #   consider the given process as hanging
+        # @param polling_ttl [Integer] max time in ms between polls. If polling (any) does not
+        #   happen that often, the process should be considered dead.
+        # @note The default TTL matches the default `max.poll.interval.ms`
+        def initialize(
+          memory_limit: Float::INFINITY,
+          consuming_ttl: 5 * 60 * 1_000,
+          polling_ttl: 5 * 60 * 1_000
+        )
+          @polling_ttl = polling_ttl
+          @consuming_ttl = consuming_ttl
+          # We cast it just in case someone would provide '10MB' or something similar
+          @memory_limit = memory_limit.is_a?(String) ? memory_limit.to_i : memory_limit
+          @pollings = {}
+          @consumptions = {}
+
+          super()
+        end
+
+        # Tick on each fetch
+        #
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_connection_listener_fetch_loop(_event)
+          mark_polling_tick
+        end
+
+        {
+          consume: :consumed,
+          revoke: :revoked,
+          shutting_down: :shutdown,
+          tick: :ticked
+        }.each do |before, after|
+          class_eval <<~RUBY, __FILE__, __LINE__ + 1
+            # Tick on starting work
+            # @param _event [Karafka::Core::Monitoring::Event]
+            def on_consumer_#{before}(_event)
+              mark_consumption_tick
+            end
+
+            # Tick on finished work
+            # @param _event [Karafka::Core::Monitoring::Event]
+            def on_consumer_#{after}(_event)
+              clear_consumption_tick
+            end
+          RUBY
+        end
+
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_error_occurred(_event)
+          clear_consumption_tick
+          clear_polling_tick
+        end
+
+        # Reports the current status once in a while
+        #
+        # @param _event [Karafka::Core::Monitoring::Event]
+        def on_statistics_emitted(_event)
+          periodically do
+            return unless node
+
+            current_status = status
+
+            current_status.positive? ? node.unhealthy(current_status) : node.healthy
+          end
+        end
+
+        private
+
+        # @return [Integer] object id of the current thread
+        def thread_id
+          Thread.current.object_id
+        end
+
+        # Update the polling tick time for the current thread
+        def mark_polling_tick
+          synchronize do
+            @pollings[thread_id] = monotonic_now
+          end
+        end
+
+        # Clear the current thread polling time tracker
+        def clear_polling_tick
+          synchronize do
+            @pollings.delete(thread_id)
+          end
+        end
+
+        # Update the processing tick time
+        def mark_consumption_tick
+          synchronize do
+            @consumptions[thread_id] = monotonic_now
+          end
+        end
+
+        # Clear the current thread consumption time tracker
+        def clear_consumption_tick
+          synchronize do
+            @consumptions.delete(thread_id)
+          end
+        end
+
+        # Did we exceed any of the TTLs
+        # @return [Integer] 0 if all checks passed, otherwise the failure status code (1, 2
+        #   or 3) as documented above
+        def status
+          time = monotonic_now
+
+          return 1 if @pollings.values.any? { |tick| (time - tick) > @polling_ttl }
+          return 2 if @consumptions.values.any? { |tick| (time - tick) > @consuming_ttl }
+          return 3 if rss_mb > @memory_limit
+
+          0
+        end
+
+        # @return [Integer] RSS in MB for the current process
+        # @note Since swarm is Linux only, we do not have to worry about getting RSS on other
+        #   OSes
+        def rss_mb
+          kb_rss = 0
+
+          IO.readlines("/proc/#{node.pid}/status").each do |line|
+            next unless line.start_with?('VmRSS:')
+
+            kb_rss = line.split[1].to_i
+
+            break
+          end
+
+          (kb_rss / 1_024.to_i).round
+        end
+      end
+    end
+  end
+end
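
A sketch of subscribing this listener in `karafka.rb`; the limits are illustrative, the keyword arguments match the initializer above:

Karafka.monitor.subscribe(
  Karafka::Pro::Swarm::LivenessListener.new(
    # Report unhealthy above ~2GB of RSS (status 3)
    memory_limit: 2048,
    # Consider consumption hanging after 10 minutes (status 2)
    consuming_ttl: 10 * 60 * 1_000,
    # Consider polling dead after 10 minutes (status 1)
    polling_ttl: 10 * 60 * 1_000
  )
)

Per the note above, it is safe to subscribe it in the supervisor prior to forking, since it only acts on server events emitted in the forked nodes.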
data/lib/karafka/process.rb
@@ -14,6 +14,8 @@ module Karafka
      SIGTERM
      SIGTTIN
      SIGTSTP
+     SIGCHLD
+     SIGUSR1
    ].freeze
 
    HANDLED_SIGNALS.each do |signal|
@@ -32,16 +34,40 @@ module Karafka
      RUBY
    end
 
+   # Assigns a callback that will run on any supported signal that has at least one callback
+   # registered already.
+   # @param block [Proc] code we want to run
+   # @note This will only bind to signals that already have at least one callback defined
+   def on_any_active(&block)
+     HANDLED_SIGNALS.each do |signal|
+       next unless @callbacks.key?(signal)
+
+       public_send(:"on_#{signal.to_s.downcase}", &block)
+     end
+   end
+
    # Creates an instance of process and creates empty hash for callbacks
    def initialize
      @callbacks = Hash.new { |hsh, key| hsh[key] = [] }
      @supervised = false
    end
 
+   # Clears all the defined callbacks. Useful for post-fork cleanup when the parent has
+   # already defined some signal handlers
+   def clear
+     @callbacks.clear
+   end
+
    # Method catches all HANDLED_SIGNALS and performs appropriate callbacks (if defined)
    # @note If there are no callbacks, this method will just ignore a given signal that was sent
    def supervise
-     HANDLED_SIGNALS.each { |signal| trap_signal(signal) }
+     HANDLED_SIGNALS.each do |signal|
+       # Supervise only signals for which we have defined callbacks
+       next unless @callbacks.key?(signal)
+
+       trap_signal(signal)
+     end
+
      @supervised = true
    end
 
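A small sketch of how these APIs compose (the logging callback is hypothetical): `supervise` now traps only the signals that already have callbacks, and `on_any_active` piggybacks on exactly that set:

process = Karafka::App.config.internal.process
process.on_sigint { Karafka::Server.stop }
process.on_sigtstp { Karafka::Server.quiet }
# Binds to SIGINT and SIGTSTP only, as only they have callbacks at this point
process.on_any_active { Karafka.logger.info('Handled signal received') }
# Traps only SIGINT and SIGTSTP; the remaining HANDLED_SIGNALS stay untouched
process.supervise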
data/lib/karafka/routing/features/dead_letter_queue/config.rb
@@ -15,6 +15,8 @@ module Karafka
          :independent,
          # Move to DLQ and mark as consumed in transactional mode (if applicable)
          :transactional,
+         # Strategy to apply (if strategies are supported)
+         :strategy,
          keyword_init: true
        ) do
          alias_method :active?, :active
data/lib/karafka/routing/subscription_group.rb
@@ -8,6 +8,12 @@ module Karafka
    # @note One subscription group will always belong to one consumer group, but one consumer
    #   group can have multiple subscription groups.
    class SubscriptionGroup
+     include Helpers::ConfigImporter.new(
+       activity_manager: %i[internal routing activity_manager],
+       client_id: %i[client_id],
+       node: %i[swarm node]
+     )
+
      attr_reader :id, :name, :topics, :kafka, :consumer_group
 
      # Lock for generating new ids safely
@@ -67,7 +73,7 @@ module Karafka
 
      # @return [Boolean] is this subscription group one of the active ones
      def active?
-       Karafka::App.config.internal.routing.activity_manager.active?(:subscription_groups, name)
+       activity_manager.active?(:subscription_groups, name)
      end
 
      # @return [Array<String>] names of topics to which we should subscribe.
@@ -85,6 +91,19 @@ module Karafka
        id
      end
 
+     # Refreshes the configuration of this subscription group if needed based on the execution
+     # context.
+     #
+     # Since the initial routing setup happens in the supervisor, it is inherited by the
+     # children. This causes incomplete assignment of `group.instance.id`, which is not
+     # expanded with the proper node identifier. This refreshes it if needed when in swarm.
+     def refresh
+       return unless node
+       return unless kafka.key?(:'group.instance.id')
+
+       @kafka = build_kafka
+     end
+
      private
 
      # @return [Hash] kafka settings are a bit special. They are exactly the same for all of the
@@ -93,15 +112,9 @@ module Karafka
      def build_kafka
        kafka = Setup::AttributesMap.consumer(@topics.first.kafka.dup)
 
-       # If we use static group memberships, there can be a case, where same instance id would
-       # be set on many subscription groups as the group instance id from Karafka perspective is
-       # set per config. Each instance even if they are subscribed to different topics needs to
-       # have it fully unique. To make sure of that, we just add extra postfix at the end that
-       # increments.
-       group_instance_id = kafka.fetch(:'group.instance.id', false)
+       inject_group_instance_id(kafka)
 
-       kafka[:'group.instance.id'] = "#{group_instance_id}_#{@position}" if group_instance_id
-       kafka[:'client.id'] ||= Karafka::App.config.client_id
+       kafka[:'client.id'] ||= client_id
        kafka[:'group.id'] ||= @consumer_group.id
        kafka[:'auto.offset.reset'] ||= @topics.first.initial_offset
        # Karafka manages the offsets based on the processing state, thus we do not rely on the
110
123
  kafka.freeze
111
124
  kafka
112
125
  end
126
+
127
+ # If we use static group memberships, there can be a case, where same instance id would
128
+ # be set on many subscription groups as the group instance id from Karafka perspective is
129
+ # set per config. Each instance even if they are subscribed to different topics needs to
130
+ # have it fully unique. To make sure of that, we just add extra postfix at the end that
131
+ # increments.
132
+ #
133
+ # We also handle a swarm case, where the same setup would run from many forked nodes, hence
134
+ # affecting the instance id and causing conflicts
135
+ # @param kafka [Hash] kafka level config
136
+ def inject_group_instance_id(kafka)
137
+ group_instance_prefix = kafka.fetch(:'group.instance.id', false)
138
+
139
+ # If group instance id was not even configured, do nothing
140
+ return unless group_instance_prefix
141
+
142
+ # If there is a node, we need to take its id and inject it as well so multiple forks can
143
+ # have different instances ids but they are reproducible
144
+ components = [group_instance_prefix, node ? node.id : nil, @position]
145
+
146
+ kafka[:'group.instance.id'] = components.compact.join('_')
147
+ end
113
148
  end
114
149
  end
115
150
  end
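
A worked example of the id expansion implemented above, assuming `group.instance.id` was configured as `orders-app` for the subscription group at position 1:

# Standalone process (no swarm node):
#   components = ['orders-app', nil, 1] -> "orders-app_1"
# Swarm fork running as node 2 (after #refresh rebuilds the config):
#   components = ['orders-app', 2, 1]   -> "orders-app_2_1"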
data/lib/karafka/server.rb
@@ -3,16 +3,6 @@
 module Karafka
   # Karafka consuming server class
   class Server
-    # How long should we sleep between checks on shutting down consumers
-    SUPERVISION_SLEEP = 0.1
-    # What system exit code should we use when we terminated forcefully
-    FORCEFUL_EXIT_CODE = 2
-    # This factor allows us to calculate how many times we have to sleep before
-    # a forceful shutdown
-    SUPERVISION_CHECK_FACTOR = (1 / SUPERVISION_SLEEP)
-
-    private_constant :SUPERVISION_SLEEP, :FORCEFUL_EXIT_CODE, :SUPERVISION_CHECK_FACTOR
-
     class << self
       # Set of consuming threads. Each consumer thread contains a single consumer
       attr_accessor :listeners
@@ -36,12 +26,20 @@ module Karafka
          config.internal.routing.activity_manager.to_h
        )
 
+       # We clear as we do not want the parent handlers when working from a fork
+       process.clear
        process.on_sigint { stop }
        process.on_sigquit { stop }
        process.on_sigterm { stop }
        process.on_sigtstp { quiet }
+       # Needed for instrumentation
+       process.on_sigttin {}
        process.supervise
 
+       # This will only run when not in swarm mode. In swarm mode the server runs post-fork,
+       # so the warmup will do nothing
+       Karafka::App.warmup
+
        # Start is blocking until stop is called and when we stop, it will wait until
        # all of the things are ready to stop
        start
@@ -86,13 +84,13 @@ module Karafka
        # We check from time to time (for the timeout period) if all the threads finished
        # their work and if so, we can just return and the normal shutdown process will take
        # place. We divide it by 1000 because we use time in ms.
-       ((timeout / 1_000) * SUPERVISION_CHECK_FACTOR).to_i.times do
+       ((timeout / 1_000) * (1 / config.internal.supervision_sleep)).to_i.times do
          all_listeners_stopped = listeners.all?(&:stopped?)
          all_workers_stopped = workers.none?(&:alive?)
 
          return if all_listeners_stopped && all_workers_stopped
 
-         sleep SUPERVISION_SLEEP
+         sleep(config.internal.supervision_sleep)
        end
 
        raise Errors::ForcefulShutdownError
@@ -116,7 +114,7 @@ module Karafka
        return unless process.supervised?
 
        # exit! is not within the instrumentation as it would not trigger due to exit
-       Kernel.exit!(FORCEFUL_EXIT_CODE)
+       Kernel.exit!(config.internal.forceful_exit_code)
      ensure
        # We need to check if it wasn't an early exit to make sure that only the stop invocation
        # can change the status after everything is closed
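
A worked example of the supervision loop above, assuming the default `shutdown_timeout` of 60_000 ms and the new `supervision_sleep` default of 0.1s: the server performs (60_000 / 1_000) * (1 / 0.1) = 60 * 10 = 600 checks, sleeping 0.1s between each, so `ForcefulShutdownError` (and the exit with `forceful_exit_code`, default 2) is raised only after the full 60 seconds have elapsed without all listeners and workers stopping.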
data/lib/karafka/setup/config.rb
@@ -105,6 +105,17 @@ module Karafka
      # @see https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
      setting :kafka, default: {}
 
+     # Public configuration for swarm operations
+     setting :swarm do
+       # option [Integer] how many processes do we want to run in swarm mode
+       # Keep in mind this is only applicable when running in swarm mode
+       setting :nodes, default: 3
+       # This is set automatically when we fork. Used to hold a reference that may be needed
+       # for static group membership, supervision and more. If set to `false`, it means this
+       # process is not a fork
+       setting :node, default: false
+     end
+
      # Admin specific settings.
      #
      # Since admin operations are often specific, they may require specific librdkafka settings
@@ -151,7 +162,6 @@
        # @note In the future, we need to have a single process representation for all the karafka
        #   instances
        setting :process, default: Process.new
-
        # Interval of "ticking". This is used to define the maximum time between consecutive
        # polling of the main rdkafka queue. It should match also the `statistics.interval.ms`
        # smallest value defined in any of the per-kafka settings, so metrics are published with
@@ -162,6 +172,36 @@
        # not to have enough time to run. This (not directly) defines also a single poll
        # max timeout as to allow for frequent enough events polling
        setting :tick_interval, default: 5_000
+       # How long should we sleep between checks on shutting down consumers
+       setting :supervision_sleep, default: 0.1
+       # What system exit code should we use when we terminate forcefully
+       setting :forceful_exit_code, default: 2
+
+       setting :swarm do
+         # Manager for swarm nodes control
+         setting :manager, default: Swarm::Manager.new
+         # Exit code we exit an orphaned child with to indicate something went wrong
+         setting :orphaned_exit_code, default: 3
+         # Syscall number for https://man7.org/linux/man-pages/man2/pidfd_open.2.html
+         setting :pidfd_open_syscall, default: 434
+         # Syscall number for https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
+         setting :pidfd_signal_syscall, default: 424
+         # How often (in ms) should we control our nodes
+         # This is the maximum time after which we will check. Checks can happen more often in
+         # case of system events.
+         setting :supervision_interval, default: 30_000
+         # How often should each node report its status
+         setting :liveness_interval, default: 10_000
+         # Listener used to report node states to the supervisor
+         setting :liveness_listener, default: Swarm::LivenessListener.new
+         # How long should we wait for any info from a node before we consider it hanging and
+         # stop it
+         setting :node_report_timeout, default: 60_000
+         # How long should we wait before restarting a node. This can prevent us from having a
+         # case where for some external reason our spawned process would die immediately and
+         # we would immediately try to start it back in an endless loop
+         setting :node_restart_timeout, default: 5_000
+       end
 
        # Namespace for CLI related settings
        setting :cli do
@@ -176,7 +216,6 @@
        # option subscription_groups_builder [Routing::SubscriptionGroupsBuilder] subscription
        #   group builder
        setting :subscription_groups_builder, default: Routing::SubscriptionGroupsBuilder.new
-
        # Internally assigned list of limits on routings active for the current process
        # This can be altered by the CLI command
        setting :activity_manager, default: Routing::ActivityManager.new
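
A sketch of configuring the new public swarm settings; the node count is illustrative and the rest of the setup is omitted:

class KarafkaApp < Karafka::App
  setup do |config|
    # Run five consumer processes when started in swarm mode
    config.swarm.nodes = 5
  end
end

Running via the swarm CLI command added in this release (see data/lib/karafka/cli/swarm.rb in the file list) then forks that many nodes under the supervisor. The `internal.swarm.*` settings (manager, pidfd syscall numbers, supervision and liveness intervals) are tuned defaults and normally do not need changes.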