karafka 2.0.0.beta2 → 2.0.0.beta5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/.github/workflows/ci.yml +18 -15
- data/CHANGELOG.md +49 -0
- data/Gemfile.lock +8 -8
- data/bin/benchmarks +2 -2
- data/bin/integrations +44 -15
- data/bin/scenario +29 -0
- data/bin/{stress → stress_many} +0 -0
- data/bin/stress_one +13 -0
- data/bin/wait_for_kafka +20 -0
- data/docker-compose.yml +28 -11
- data/karafka.gemspec +2 -2
- data/lib/karafka/active_job/routing/extensions.rb +12 -2
- data/lib/karafka/app.rb +2 -1
- data/lib/karafka/base_consumer.rb +75 -45
- data/lib/karafka/connection/client.rb +88 -22
- data/lib/karafka/connection/listener.rb +60 -18
- data/lib/karafka/connection/pauses_manager.rb +8 -0
- data/lib/karafka/connection/rebalance_manager.rb +20 -19
- data/lib/karafka/contracts/config.rb +17 -3
- data/lib/karafka/contracts/server_cli_options.rb +1 -1
- data/lib/karafka/errors.rb +3 -0
- data/lib/karafka/instrumentation/logger_listener.rb +34 -10
- data/lib/karafka/instrumentation/monitor.rb +3 -1
- data/lib/karafka/licenser.rb +26 -7
- data/lib/karafka/pro/active_job/consumer.rb +30 -9
- data/lib/karafka/pro/active_job/dispatcher.rb +9 -9
- data/lib/karafka/pro/active_job/job_options_contract.rb +9 -9
- data/lib/karafka/pro/base_consumer.rb +73 -0
- data/lib/karafka/pro/loader.rb +38 -20
- data/lib/karafka/pro/performance_tracker.rb +9 -9
- data/lib/karafka/pro/processing/coordinator.rb +12 -0
- data/lib/karafka/pro/processing/jobs/consume_non_blocking.rb +10 -11
- data/lib/karafka/pro/processing/jobs_builder.rb +32 -0
- data/lib/karafka/pro/processing/scheduler.rb +56 -0
- data/lib/karafka/pro/routing/extensions.rb +32 -0
- data/lib/karafka/processing/coordinator.rb +84 -0
- data/lib/karafka/processing/coordinators_buffer.rb +58 -0
- data/lib/karafka/processing/executor.rb +23 -9
- data/lib/karafka/processing/executors_buffer.rb +46 -15
- data/lib/karafka/processing/jobs/base.rb +8 -3
- data/lib/karafka/processing/jobs/consume.rb +11 -4
- data/lib/karafka/processing/jobs_builder.rb +29 -0
- data/lib/karafka/processing/result.rb +29 -0
- data/lib/karafka/processing/scheduler.rb +22 -0
- data/lib/karafka/processing/worker.rb +17 -9
- data/lib/karafka/routing/consumer_group.rb +1 -1
- data/lib/karafka/routing/subscription_group.rb +1 -1
- data/lib/karafka/routing/topic.rb +14 -0
- data/lib/karafka/setup/config.rb +19 -9
- data/lib/karafka/status.rb +1 -3
- data/lib/karafka/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +19 -7
- metadata.gz.sig +0 -0
- data/lib/karafka/pro/scheduler.rb +0 -54
- data/lib/karafka/scheduler.rb +0 -20
| @@ -10,44 +10,74 @@ module Karafka | |
| 10 10 | 
             
                attr_accessor :messages
         | 
| 11 11 | 
             
                # @return [Karafka::Connection::Client] kafka connection client
         | 
| 12 12 | 
             
                attr_accessor :client
         | 
| 13 | 
            -
                # @return [Karafka:: | 
| 14 | 
            -
                attr_accessor : | 
| 13 | 
            +
                # @return [Karafka::Processing::Coordinator] coordinator
         | 
| 14 | 
            +
                attr_accessor :coordinator
         | 
| 15 15 | 
             
                # @return [Waterdrop::Producer] producer instance
         | 
| 16 16 | 
             
                attr_accessor :producer
         | 
| 17 17 |  | 
| 18 | 
            +
                # Can be used to run preparation code
         | 
| 19 | 
            +
                #
         | 
| 20 | 
            +
                # @private
         | 
| 21 | 
            +
                # @note This should not be used by the end users as it is part of the lifecycle of things but
         | 
| 22 | 
            +
                #   not as part of the public api. This can act as a hook when creating non-blocking
         | 
| 23 | 
            +
                #   consumers and doing other advanced stuff
         | 
| 24 | 
            +
                def on_before_consume; end
         | 
| 25 | 
            +
             | 
| 18 26 | 
             
                # Executes the default consumer flow.
         | 
| 19 27 | 
             
                #
         | 
| 28 | 
            +
                # @return [Boolean] true if there was no exception, otherwise false.
         | 
| 29 | 
            +
                #
         | 
| 20 30 | 
             
                # @note We keep the seek offset tracking, and use it to compensate for async offset flushing
         | 
| 21 31 | 
             
                #   that may not yet kick in when error occurs. That way we pause always on the last processed
         | 
| 22 32 | 
             
                #   message.
         | 
| 23 33 | 
             
                def on_consume
         | 
| 24 34 | 
             
                  Karafka.monitor.instrument('consumer.consumed', caller: self) do
         | 
| 25 35 | 
             
                    consume
         | 
| 26 | 
            -
             | 
| 27 | 
            -
                    pause_tracker.reset
         | 
| 28 | 
            -
             | 
| 29 | 
            -
                    # Mark as consumed only if manual offset management is not on
         | 
| 30 | 
            -
                    next if topic.manual_offset_management
         | 
| 31 | 
            -
             | 
| 32 | 
            -
                    # We use the non-blocking one here. If someone needs the blocking one, can implement it
         | 
| 33 | 
            -
                    # with manual offset management
         | 
| 34 | 
            -
                    mark_as_consumed(messages.last)
         | 
| 35 36 | 
             
                  end
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                  @coordinator.consumption(self).success!
         | 
| 36 39 | 
             
                rescue StandardError => e
         | 
| 40 | 
            +
                  @coordinator.consumption(self).failure!
         | 
| 41 | 
            +
             | 
| 37 42 | 
             
                  Karafka.monitor.instrument(
         | 
| 38 43 | 
             
                    'error.occurred',
         | 
| 39 44 | 
             
                    error: e,
         | 
| 40 45 | 
             
                    caller: self,
         | 
| 41 46 | 
             
                    type: 'consumer.consume.error'
         | 
| 42 47 | 
             
                  )
         | 
| 48 | 
            +
                ensure
         | 
| 49 | 
            +
                  # We need to decrease number of jobs that this coordinator coordinates as it has finished
         | 
| 50 | 
            +
                  @coordinator.decrement
         | 
| 51 | 
            +
                end
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                # @private
         | 
| 54 | 
            +
                # @note This should not be used by the end users as it is part of the lifecycle of things but
         | 
| 55 | 
            +
                #   not as part of the public api.
         | 
| 56 | 
            +
                def on_after_consume
         | 
| 57 | 
            +
                  return if revoked?
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                  if @coordinator.success?
         | 
| 60 | 
            +
                    coordinator.pause_tracker.reset
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    # Mark as consumed only if manual offset management is not on
         | 
| 63 | 
            +
                    return if topic.manual_offset_management?
         | 
| 43 64 |  | 
| 44 | 
            -
             | 
| 65 | 
            +
                    # We use the non-blocking one here. If someone needs the blocking one, can implement it
         | 
| 66 | 
            +
                    # with manual offset management
         | 
| 67 | 
            +
                    mark_as_consumed(messages.last)
         | 
| 68 | 
            +
                  else
         | 
| 69 | 
            +
                    pause(@seek_offset || messages.first.offset)
         | 
| 70 | 
            +
                  end
         | 
| 45 71 | 
             
                end
         | 
| 46 72 |  | 
| 47 73 | 
             
                # Trigger method for running on partition revocation.
         | 
| 48 74 | 
             
                #
         | 
| 49 75 | 
             
                # @private
         | 
| 50 76 | 
             
                def on_revoked
         | 
| 77 | 
            +
                  coordinator.revoke
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                  resume
         | 
| 80 | 
            +
             | 
| 51 81 | 
             
                  Karafka.monitor.instrument('consumer.revoked', caller: self) do
         | 
| 52 82 | 
             
                    revoked
         | 
| 53 83 | 
             
                  end
         | 
| @@ -76,31 +106,8 @@ module Karafka | |
| 76 106 | 
             
                  )
         | 
| 77 107 | 
             
                end
         | 
| 78 108 |  | 
| 79 | 
            -
                # Can be used to run preparation code
         | 
| 80 | 
            -
                #
         | 
| 81 | 
            -
                # @private
         | 
| 82 | 
            -
                # @note This should not be used by the end users as it is part of the lifecycle of things but
         | 
| 83 | 
            -
                #   not as part of the public api. This can act as a hook when creating non-blocking
         | 
| 84 | 
            -
                #   consumers and doing other advanced stuff
         | 
| 85 | 
            -
                def on_prepared
         | 
| 86 | 
            -
                  Karafka.monitor.instrument('consumer.prepared', caller: self) do
         | 
| 87 | 
            -
                    prepared
         | 
| 88 | 
            -
                  end
         | 
| 89 | 
            -
                rescue StandardError => e
         | 
| 90 | 
            -
                  Karafka.monitor.instrument(
         | 
| 91 | 
            -
                    'error.occurred',
         | 
| 92 | 
            -
                    error: e,
         | 
| 93 | 
            -
                    caller: self,
         | 
| 94 | 
            -
                    type: 'consumer.prepared.error'
         | 
| 95 | 
            -
                  )
         | 
| 96 | 
            -
                end
         | 
| 97 | 
            -
             | 
| 98 109 | 
             
                private
         | 
| 99 110 |  | 
| 100 | 
            -
                # Method that gets called in the blocking flow allowing to setup any type of resources or to
         | 
| 101 | 
            -
                # send additional commands to Kafka before the proper execution starts.
         | 
| 102 | 
            -
                def prepared; end
         | 
| 103 | 
            -
             | 
| 104 111 | 
             
                # Method that will perform business logic and on data received from Kafka (it will consume
         | 
| 105 112 | 
             
                #   the data)
         | 
| 106 113 | 
             
                # @note This method needs to be implemented in a subclass. We stub it here as a failover if
         | 
| @@ -120,21 +127,40 @@ module Karafka | |
| 120 127 | 
             
                # Marks message as consumed in an async way.
         | 
| 121 128 | 
             
                #
         | 
| 122 129 | 
             
                # @param message [Messages::Message] last successfully processed message.
         | 
| 130 | 
            +
                # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
         | 
| 131 | 
            +
                #   that we were not able and that we have lost the partition.
         | 
| 132 | 
            +
                #
         | 
| 123 133 | 
             
                # @note We keep track of this offset in case we would mark as consumed and got error when
         | 
| 124 134 | 
             
                #   processing another message. In case like this we do not pause on the message we've already
         | 
| 125 135 | 
             
                #   processed but rather at the next one. This applies to both sync and async versions of this
         | 
| 126 136 | 
             
                #   method.
         | 
| 127 137 | 
             
                def mark_as_consumed(message)
         | 
| 128 | 
            -
                  client.mark_as_consumed(message)
         | 
| 138 | 
            +
                  unless client.mark_as_consumed(message)
         | 
| 139 | 
            +
                    coordinator.revoke
         | 
| 140 | 
            +
             | 
| 141 | 
            +
                    return false
         | 
| 142 | 
            +
                  end
         | 
| 143 | 
            +
             | 
| 129 144 | 
             
                  @seek_offset = message.offset + 1
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                  true
         | 
| 130 147 | 
             
                end
         | 
| 131 148 |  | 
| 132 149 | 
             
                # Marks message as consumed in a sync way.
         | 
| 133 150 | 
             
                #
         | 
| 134 151 | 
             
                # @param message [Messages::Message] last successfully processed message.
         | 
| 152 | 
            +
                # @return [Boolean] true if we were able to mark the offset, false otherwise. False indicates
         | 
| 153 | 
            +
                #   that we were not able and that we have lost the partition.
         | 
| 135 154 | 
             
                def mark_as_consumed!(message)
         | 
| 136 | 
            -
                  client.mark_as_consumed!(message)
         | 
| 155 | 
            +
                  unless client.mark_as_consumed!(message)
         | 
| 156 | 
            +
                    coordinator.revoke
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    return false
         | 
| 159 | 
            +
                  end
         | 
| 160 | 
            +
             | 
| 137 161 | 
             
                  @seek_offset = message.offset + 1
         | 
| 162 | 
            +
             | 
| 163 | 
            +
                  true
         | 
| 138 164 | 
             
                end
         | 
| 139 165 |  | 
| 140 166 | 
             
                # Pauses processing on a given offset for the current topic partition
         | 
| @@ -144,23 +170,20 @@ module Karafka | |
| 144 170 | 
             
                # @param timeout [Integer, nil] how long in milliseconds do we want to pause or nil to use the
         | 
| 145 171 | 
             
                #   default exponential pausing strategy defined for retries
         | 
| 146 172 | 
             
                def pause(offset, timeout = nil)
         | 
| 173 | 
            +
                  timeout ? coordinator.pause_tracker.pause(timeout) : coordinator.pause_tracker.pause
         | 
| 174 | 
            +
             | 
| 147 175 | 
             
                  client.pause(
         | 
| 148 176 | 
             
                    messages.metadata.topic,
         | 
| 149 177 | 
             
                    messages.metadata.partition,
         | 
| 150 178 | 
             
                    offset
         | 
| 151 179 | 
             
                  )
         | 
| 152 | 
            -
             | 
| 153 | 
            -
                  timeout ? pause_tracker.pause(timeout) : pause_tracker.pause
         | 
| 154 180 | 
             
                end
         | 
| 155 181 |  | 
| 156 182 | 
             
                # Resumes processing of the current topic partition
         | 
| 157 183 | 
             
                def resume
         | 
| 158 | 
            -
                   | 
| 159 | 
            -
             | 
| 160 | 
            -
             | 
| 161 | 
            -
                  )
         | 
| 162 | 
            -
             | 
| 163 | 
            -
                  pause_tracker.expire
         | 
| 184 | 
            +
                  # This is sufficient to expire a partition pause, as it will then be resumed by the listener
         | 
| 185 | 
            +
                  # thread before the next poll.
         | 
| 186 | 
            +
                  coordinator.pause_tracker.expire
         | 
| 164 187 | 
             
                end
         | 
| 165 188 |  | 
| 166 189 | 
             
                # Seeks in the context of current topic and partition
         | 
| @@ -175,5 +198,12 @@ module Karafka | |
| 175 198 | 
             
                    )
         | 
| 176 199 | 
             
                  )
         | 
| 177 200 | 
             
                end
         | 
| 201 | 
            +
             | 
| 202 | 
            +
                # @return [Boolean] true if partition was revoked from the current consumer
         | 
| 203 | 
            +
                # @note We know that partition got revoked because when we try to mark message as consumed,
         | 
| 204 | 
            +
                #   unless it is successful, it will return false
         | 
| 205 | 
            +
                def revoked?
         | 
| 206 | 
            +
                  coordinator.revoked?
         | 
| 207 | 
            +
                end
         | 
| 178 208 | 
             
              end
         | 
| 179 209 | 
             
            end
         | 
| @@ -36,6 +36,12 @@ module Karafka | |
| 36 36 | 
             
                    # Marks if we need to offset. If we did not store offsets, we should not commit the offset
         | 
| 37 37 | 
             
                    # position as it will crash rdkafka
         | 
| 38 38 | 
             
                    @offsetting = false
         | 
| 39 | 
            +
                    # We need to keep track of what we have paused for resuming
         | 
| 40 | 
            +
                    # In case we lose the partition, we still need to resume it, otherwise it won't be fetched
         | 
| 41 | 
            +
                    # again if we get reassigned to it later on. We need to keep them as after revocation we
         | 
| 42 | 
            +
                    # no longer may be able to fetch them from Kafka. We could build them but it is easier
         | 
| 43 | 
            +
                    # to just keep them here and use if needed when cannot be obtained
         | 
| 44 | 
            +
                    @paused_tpls = Hash.new { |h, k| h[k] = {} }
         | 
| 39 45 | 
             
                  end
         | 
| 40 46 |  | 
| 41 47 | 
             
                  # Fetches messages within boundaries defined by the settings (time, size, topics, etc).
         | 
| @@ -45,12 +51,13 @@ module Karafka | |
| 45 51 | 
             
                  # @note This method should not be executed from many threads at the same time
         | 
| 46 52 | 
             
                  def batch_poll
         | 
| 47 53 | 
             
                    time_poll = TimeTrackers::Poll.new(@subscription_group.max_wait_time)
         | 
| 48 | 
            -
                    time_poll.start
         | 
| 49 54 |  | 
| 50 55 | 
             
                    @buffer.clear
         | 
| 51 56 | 
             
                    @rebalance_manager.clear
         | 
| 52 57 |  | 
| 53 58 | 
             
                    loop do
         | 
| 59 | 
            +
                      time_poll.start
         | 
| 60 | 
            +
             | 
| 54 61 | 
             
                      # Don't fetch more messages if we do not have any time left
         | 
| 55 62 | 
             
                      break if time_poll.exceeded?
         | 
| 56 63 | 
             
                      # Don't fetch more messages if we've fetched max as we've wanted
         | 
| @@ -69,7 +76,11 @@ module Karafka | |
| 69 76 | 
             
                      # If partition revocation happens, we need to remove messages from revoked partitions
         | 
| 70 77 | 
             
                      # as well as ensure we do not have duplicated due to the offset reset for partitions
         | 
| 71 78 | 
             
                      # that we got assigned
         | 
| 72 | 
            -
                       | 
| 79 | 
            +
                      # We also do early break, so the information about rebalance is used as soon as possible
         | 
| 80 | 
            +
                      if @rebalance_manager.changed?
         | 
| 81 | 
            +
                        remove_revoked_and_duplicated_messages
         | 
| 82 | 
            +
                        break
         | 
| 83 | 
            +
                      end
         | 
| 73 84 |  | 
| 74 85 | 
             
                      # Finally once we've (potentially) removed revoked, etc, if no messages were returned
         | 
| 75 86 | 
             
                      # we can break.
         | 
| @@ -86,8 +97,7 @@ module Karafka | |
| 86 97 | 
             
                  # @param message [Karafka::Messages::Message]
         | 
| 87 98 | 
             
                  def store_offset(message)
         | 
| 88 99 | 
             
                    @mutex.synchronize do
         | 
| 89 | 
            -
                       | 
| 90 | 
            -
                      @kafka.store_offset(message)
         | 
| 100 | 
            +
                      internal_store_offset(message)
         | 
| 91 101 | 
             
                    end
         | 
| 92 102 | 
             
                  end
         | 
| 93 103 |  | 
| @@ -104,14 +114,7 @@ module Karafka | |
| 104 114 | 
             
                  def commit_offsets(async: true)
         | 
| 105 115 | 
             
                    @mutex.lock
         | 
| 106 116 |  | 
| 107 | 
            -
                     | 
| 108 | 
            -
             | 
| 109 | 
            -
                    @kafka.commit(nil, async)
         | 
| 110 | 
            -
                    @offsetting = false
         | 
| 111 | 
            -
                  rescue Rdkafka::RdkafkaError => e
         | 
| 112 | 
            -
                    return if e.code == :no_offset
         | 
| 113 | 
            -
             | 
| 114 | 
            -
                    raise e
         | 
| 117 | 
            +
                    internal_commit_offsets(async: async)
         | 
| 115 118 | 
             
                  ensure
         | 
| 116 119 | 
             
                    @mutex.unlock
         | 
| 117 120 | 
             
                  end
         | 
| @@ -128,7 +131,11 @@ module Karafka | |
| 128 131 | 
             
                  #
         | 
| 129 132 | 
             
                  # @param message [Messages::Message, Messages::Seek] message to which we want to seek to
         | 
| 130 133 | 
             
                  def seek(message)
         | 
| 134 | 
            +
                    @mutex.lock
         | 
| 135 | 
            +
             | 
| 131 136 | 
             
                    @kafka.seek(message)
         | 
| 137 | 
            +
                  ensure
         | 
| 138 | 
            +
                    @mutex.unlock
         | 
| 132 139 | 
             
                  end
         | 
| 133 140 |  | 
| 134 141 | 
             
                  # Pauses given partition and moves back to last successful offset processed.
         | 
| @@ -144,15 +151,21 @@ module Karafka | |
| 144 151 | 
             
                    # Do not pause if the client got closed, would not change anything
         | 
| 145 152 | 
             
                    return if @closed
         | 
| 146 153 |  | 
| 154 | 
            +
                    pause_msg = Messages::Seek.new(topic, partition, offset)
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                    internal_commit_offsets(async: false)
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    # Here we do not use our cached tpls because we should not try to pause something we do
         | 
| 159 | 
            +
                    # not own anymore.
         | 
| 147 160 | 
             
                    tpl = topic_partition_list(topic, partition)
         | 
| 148 161 |  | 
| 149 162 | 
             
                    return unless tpl
         | 
| 150 163 |  | 
| 151 | 
            -
                    @ | 
| 164 | 
            +
                    @paused_tpls[topic][partition] = tpl
         | 
| 152 165 |  | 
| 153 | 
            -
                     | 
| 166 | 
            +
                    @kafka.pause(tpl)
         | 
| 154 167 |  | 
| 155 | 
            -
                    seek(pause_msg)
         | 
| 168 | 
            +
                    @kafka.seek(pause_msg)
         | 
| 156 169 | 
             
                  ensure
         | 
| 157 170 | 
             
                    @mutex.unlock
         | 
| 158 171 | 
             
                  end
         | 
| @@ -166,9 +179,18 @@ module Karafka | |
| 166 179 |  | 
| 167 180 | 
             
                    return if @closed
         | 
| 168 181 |  | 
| 169 | 
            -
                     | 
| 182 | 
            +
                    # Always commit synchronously offsets if any when we resume
         | 
| 183 | 
            +
                    # This prevents resuming without offset in case it would not be committed prior
         | 
| 184 | 
            +
                    # We can skip performance penalty since resuming should not happen too often
         | 
| 185 | 
            +
                    internal_commit_offsets(async: false)
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                    # If we were not able, let's try to reuse the one we have (if we have)
         | 
| 188 | 
            +
                    tpl = topic_partition_list(topic, partition) || @paused_tpls[topic][partition]
         | 
| 170 189 |  | 
| 171 190 | 
             
                    return unless tpl
         | 
| 191 | 
            +
                    # If we did not have it, it means we never paused this partition, thus no resume should
         | 
| 192 | 
            +
                    # happen in the first place
         | 
| 193 | 
            +
                    return unless @paused_tpls[topic].delete(partition)
         | 
| 172 194 |  | 
| 173 195 | 
             
                    @kafka.resume(tpl)
         | 
| 174 196 | 
             
                  ensure
         | 
| @@ -187,6 +209,7 @@ module Karafka | |
| 187 209 | 
             
                  # Marks given message as consumed.
         | 
| 188 210 | 
             
                  #
         | 
| 189 211 | 
             
                  # @param [Karafka::Messages::Message] message that we want to mark as processed
         | 
| 212 | 
            +
                  # @return [Boolean] true if successful. False if we no longer own given partition
         | 
| 190 213 | 
             
                  # @note This method won't trigger automatic offsets commits, rather relying on the offset
         | 
| 191 214 | 
             
                  #   check-pointing trigger that happens with each batch processed
         | 
| 192 215 | 
             
                  def mark_as_consumed(message)
         | 
| @@ -196,8 +219,10 @@ module Karafka | |
| 196 219 | 
             
                  # Marks a given message as consumed and commits the offsets in a blocking way.
         | 
| 197 220 | 
             
                  #
         | 
| 198 221 | 
             
                  # @param [Karafka::Messages::Message] message that we want to mark as processed
         | 
| 222 | 
            +
                  # @return [Boolean] true if successful. False if we no longer own given partition
         | 
| 199 223 | 
             
                  def mark_as_consumed!(message)
         | 
| 200 | 
            -
                    mark_as_consumed(message)
         | 
| 224 | 
            +
                    return false unless mark_as_consumed(message)
         | 
| 225 | 
            +
             | 
| 201 226 | 
             
                    commit_offsets!
         | 
| 202 227 | 
             
                  end
         | 
| 203 228 |  | 
| @@ -208,17 +233,51 @@ module Karafka | |
| 208 233 | 
             
                    @mutex.synchronize do
         | 
| 209 234 | 
             
                      @closed = false
         | 
| 210 235 | 
             
                      @offsetting = false
         | 
| 236 | 
            +
                      @paused_tpls.clear
         | 
| 211 237 | 
             
                      @kafka = build_consumer
         | 
| 212 238 | 
             
                    end
         | 
| 213 239 | 
             
                  end
         | 
| 214 240 |  | 
| 215 241 | 
             
                  private
         | 
| 216 242 |  | 
| 243 | 
            +
                  # When we cannot store an offset, it means we no longer own the partition
         | 
| 244 | 
            +
                  #
         | 
| 245 | 
            +
                  # Non thread-safe offset storing method
         | 
| 246 | 
            +
                  # @param message [Karafka::Messages::Message]
         | 
| 247 | 
            +
                  # @return [Boolean] true if we could store the offset (if we still own the partition)
         | 
| 248 | 
            +
                  def internal_store_offset(message)
         | 
| 249 | 
            +
                    @offsetting = true
         | 
| 250 | 
            +
                    @kafka.store_offset(message)
         | 
| 251 | 
            +
                    true
         | 
| 252 | 
            +
                  rescue Rdkafka::RdkafkaError => e
         | 
| 253 | 
            +
                    return false if e.code == :assignment_lost
         | 
| 254 | 
            +
                    return false if e.code == :state
         | 
| 255 | 
            +
             | 
| 256 | 
            +
                    raise e
         | 
| 257 | 
            +
                  end
         | 
| 258 | 
            +
             | 
| 259 | 
            +
                  # Non thread-safe message committing method
         | 
| 260 | 
            +
                  # @param async [Boolean] should the commit happen async or sync (async by default)
         | 
| 261 | 
            +
                  # @return [Boolean] true if offset commit worked, false if we've lost the assignment
         | 
| 262 | 
            +
                  def internal_commit_offsets(async: true)
         | 
| 263 | 
            +
                    return true unless @offsetting
         | 
| 264 | 
            +
             | 
| 265 | 
            +
                    @kafka.commit(nil, async)
         | 
| 266 | 
            +
                    @offsetting = false
         | 
| 267 | 
            +
             | 
| 268 | 
            +
                    true
         | 
| 269 | 
            +
                  rescue Rdkafka::RdkafkaError => e
         | 
| 270 | 
            +
                    return false if e.code == :assignment_lost
         | 
| 271 | 
            +
                    return false if e.code == :no_offset
         | 
| 272 | 
            +
             | 
| 273 | 
            +
                    raise e
         | 
| 274 | 
            +
                  end
         | 
| 275 | 
            +
             | 
| 217 276 | 
             
                  # Commits the stored offsets in a sync way and closes the consumer.
         | 
| 218 277 | 
             
                  def close
         | 
| 219 | 
            -
                    commit_offsets!
         | 
| 220 | 
            -
             | 
| 221 278 | 
             
                    @mutex.synchronize do
         | 
| 279 | 
            +
                      internal_commit_offsets(async: false)
         | 
| 280 | 
            +
             | 
| 222 281 | 
             
                      @closed = true
         | 
| 223 282 |  | 
| 224 283 | 
             
                      # Remove callbacks runners that were registered
         | 
| @@ -227,7 +286,8 @@ module Karafka | |
| 227 286 |  | 
| 228 287 | 
             
                      @kafka.close
         | 
| 229 288 | 
             
                      @buffer.clear
         | 
| 230 | 
            -
                      @ | 
| 289 | 
            +
                      # @note We do not clear rebalance manager here as we may still have revocation info here
         | 
| 290 | 
            +
                      # that we want to consider valid prior to running another reconnection
         | 
| 231 291 | 
             
                    end
         | 
| 232 292 | 
             
                  end
         | 
| 233 293 |  | 
| @@ -280,7 +340,13 @@ module Karafka | |
| 280 340 |  | 
| 281 341 | 
             
                    time_poll.backoff
         | 
| 282 342 |  | 
| 283 | 
            -
                     | 
| 343 | 
            +
                    # We return nil, so we do not restart until running the whole loop
         | 
| 344 | 
            +
                    # This allows us to run revocation jobs and other things and we will pick up new work
         | 
| 345 | 
            +
                    # next time after dispatching all the things that are needed
         | 
| 346 | 
            +
                    #
         | 
| 347 | 
            +
                    # If we retried here, the client reset would become transparent and we would not have
         | 
| 348 | 
            +
                    # a chance to take any actions
         | 
| 349 | 
            +
                    nil
         | 
| 284 350 | 
             
                  end
         | 
| 285 351 |  | 
| 286 352 | 
             
                  # Builds a new rdkafka consumer instance based on the subscription group configuration
         | 
| @@ -323,7 +389,7 @@ module Karafka | |
| 323 389 | 
             
                  # we are no longer responsible in a given process for processing those messages and they
         | 
| 324 390 | 
             
                  # should have been picked up by a different process.
         | 
| 325 391 | 
             
                  def remove_revoked_and_duplicated_messages
         | 
| 326 | 
            -
                    @rebalance_manager. | 
| 392 | 
            +
                    @rebalance_manager.lost_partitions.each do |topic, partitions|
         | 
| 327 393 | 
             
                      partitions.each do |partition|
         | 
| 328 394 | 
             
                        @buffer.delete(topic, partition)
         | 
| 329 395 | 
             
                      end
         | 
| @@ -10,17 +10,23 @@ module Karafka | |
| 10 10 | 
             
                class Listener
         | 
| 11 11 | 
             
                  include Helpers::Async
         | 
| 12 12 |  | 
| 13 | 
            +
                  # Can be useful for logging
         | 
| 14 | 
            +
                  # @return [String] id of this listener
         | 
| 15 | 
            +
                  attr_reader :id
         | 
| 16 | 
            +
             | 
| 13 17 | 
             
                  # @param subscription_group [Karafka::Routing::SubscriptionGroup]
         | 
| 14 18 | 
             
                  # @param jobs_queue [Karafka::Processing::JobsQueue] queue where we should push work
         | 
| 15 19 | 
             
                  # @return [Karafka::Connection::Listener] listener instance
         | 
| 16 20 | 
             
                  def initialize(subscription_group, jobs_queue)
         | 
| 21 | 
            +
                    @id = SecureRandom.uuid
         | 
| 17 22 | 
             
                    @subscription_group = subscription_group
         | 
| 18 23 | 
             
                    @jobs_queue = jobs_queue
         | 
| 19 | 
            -
                    @ | 
| 24 | 
            +
                    @jobs_builder = ::Karafka::App.config.internal.processing.jobs_builder
         | 
| 25 | 
            +
                    @coordinators = Processing::CoordinatorsBuffer.new
         | 
| 20 26 | 
             
                    @client = Client.new(@subscription_group)
         | 
| 21 27 | 
             
                    @executors = Processing::ExecutorsBuffer.new(@client, subscription_group)
         | 
| 22 28 | 
             
                    # We reference scheduler here as it is much faster than fetching this each time
         | 
| 23 | 
            -
                    @scheduler = ::Karafka::App.config.internal.scheduler
         | 
| 29 | 
            +
                    @scheduler = ::Karafka::App.config.internal.processing.scheduler
         | 
| 24 30 | 
             
                    # We keep one buffer for messages to preserve memory and not allocate extra objects
         | 
| 25 31 | 
             
                    # We can do this that way because we always first schedule jobs using messages before we
         | 
| 26 32 | 
             
                    # fetch another batch.
         | 
| @@ -62,16 +68,20 @@ module Karafka | |
| 62 68 |  | 
| 63 69 | 
             
                      resume_paused_partitions
         | 
| 64 70 |  | 
| 65 | 
            -
                      # We need to fetch data before we revoke lost partitions details as during the polling
         | 
| 66 | 
            -
                      # the callbacks for tracking lost partitions are triggered. Otherwise we would be always
         | 
| 67 | 
            -
                      # one batch behind.
         | 
| 68 | 
            -
                      poll_and_remap_messages
         | 
| 69 | 
            -
             | 
| 70 71 | 
             
                      Karafka.monitor.instrument(
         | 
| 71 72 | 
             
                        'connection.listener.fetch_loop.received',
         | 
| 72 73 | 
             
                        caller: self,
         | 
| 73 74 | 
             
                        messages_buffer: @messages_buffer
         | 
| 74 | 
            -
                      )
         | 
| 75 | 
            +
                      ) do
         | 
| 76 | 
            +
                        # We need to fetch data before we revoke lost partitions details as during the polling
         | 
| 77 | 
            +
                        # the callbacks for tracking lost partitions are triggered. Otherwise we would be
         | 
| 78 | 
            +
                        # always one batch behind.
         | 
| 79 | 
            +
                        poll_and_remap_messages
         | 
| 80 | 
            +
                      end
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                      # This will ensure that in the next poll we continue processing (if we get them back)
         | 
| 83 | 
            +
                      # partitions that we have paused
         | 
| 84 | 
            +
                      resume_assigned_partitions
         | 
| 75 85 |  | 
| 76 86 | 
             
                      # If there were revoked partitions, we need to wait on their jobs to finish before
         | 
| 77 87 | 
             
                      # distributing consuming jobs as upon revoking, we might get assigned to the same
         | 
| @@ -80,6 +90,9 @@ module Karafka | |
| 80 90 | 
             
                      build_and_schedule_revoke_lost_partitions_jobs
         | 
| 81 91 |  | 
| 82 92 | 
             
                      # We wait only on jobs from our subscription group. Other groups are independent.
         | 
| 93 | 
            +
                      # This will block on revoked jobs until they are finished. Those are not meant to last
         | 
| 94 | 
            +
                      # long and should not have any bigger impact on the system. Doing this in a blocking way
         | 
| 95 | 
            +
                      # simplifies the overall design and prevents race conditions
         | 
| 83 96 | 
             
                      wait
         | 
| 84 97 |  | 
| 85 98 | 
             
                      build_and_schedule_consumption_jobs
         | 
| @@ -130,7 +143,7 @@ module Karafka | |
| 130 143 |  | 
| 131 144 | 
             
                  # Resumes processing of partitions that were paused due to an error.
         | 
| 132 145 | 
             
                  def resume_paused_partitions
         | 
| 133 | 
            -
                    @ | 
| 146 | 
            +
                    @coordinators.resume do |topic, partition|
         | 
| 134 147 | 
             
                      @client.resume(topic, partition)
         | 
| 135 148 | 
             
                    end
         | 
| 136 149 | 
             
                  end
         | 
| @@ -146,9 +159,23 @@ module Karafka | |
| 146 159 |  | 
| 147 160 | 
             
                    revoked_partitions.each do |topic, partitions|
         | 
| 148 161 | 
             
                      partitions.each do |partition|
         | 
| 149 | 
            -
                         | 
| 150 | 
            -
                         | 
| 151 | 
            -
                         | 
| 162 | 
            +
                        # We revoke the coordinator here, so we do not have to revoke it in the revoke job
         | 
| 163 | 
            +
                        # itself (this happens prior to scheduling those jobs)
         | 
| 164 | 
            +
                        @coordinators.revoke(topic, partition)
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                        # There may be a case where we have lost a partition whose data we have never
         | 
| 167 | 
            +
                        # processed (if it was assigned and revoked really fast), thus we may not have it
         | 
| 168 | 
            +
                        # here. In cases like this, we do not run a revocation job
         | 
| 169 | 
            +
                        @executors.find_all(topic, partition).each do |executor|
         | 
| 170 | 
            +
                          jobs << @jobs_builder.revoked(executor)
         | 
| 171 | 
            +
                        end
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                        # We need to remove all the executors of a given topic partition that we have lost, so
         | 
| 174 | 
            +
                        # next time we pick up its work, new executors kick in. This may be needed especially
         | 
| 175 | 
            +
                        # for LRJ where we could end up with a race condition
         | 
| 176 | 
            +
                        # This revocation needs to happen after the jobs are scheduled, otherwise they would
         | 
| 177 | 
            +
                        # be scheduled with new executors instead of old
         | 
| 178 | 
            +
                        @executors.revoke(topic, partition)
         | 
| 152 179 | 
             
                      end
         | 
| 153 180 | 
             
                    end
         | 
| 154 181 |  | 
| @@ -160,7 +187,7 @@ module Karafka | |
| 160 187 | 
             
                    jobs = []
         | 
| 161 188 |  | 
| 162 189 | 
             
                    @executors.each do |_, _, executor|
         | 
| 163 | 
            -
                      jobs <<  | 
| 190 | 
            +
                      jobs << @jobs_builder.shutdown(executor)
         | 
| 164 191 | 
             
                    end
         | 
| 165 192 |  | 
| 166 193 | 
             
                    @scheduler.schedule_shutdown(@jobs_queue, jobs)
         | 
| @@ -177,6 +204,17 @@ module Karafka | |
| 177 204 | 
             
                    )
         | 
| 178 205 | 
             
                  end
         | 
| 179 206 |  | 
| 207 | 
            +
                  # Revoked partitions need to be resumed if we were processing them earlier. This will do
         | 
| 208 | 
            +
                  # nothing to things that we are planning to process. Without this, things we get
         | 
| 209 | 
            +
                  # re-assigned would not be polled.
         | 
| 210 | 
            +
                  def resume_assigned_partitions
         | 
| 211 | 
            +
                    @client.rebalance_manager.assigned_partitions.each do |topic, partitions|
         | 
| 212 | 
            +
                      partitions.each do |partition|
         | 
| 213 | 
            +
                        @client.resume(topic, partition)
         | 
| 214 | 
            +
                      end
         | 
| 215 | 
            +
                    end
         | 
| 216 | 
            +
                  end
         | 
| 217 | 
            +
             | 
| 180 218 | 
             
                  # Takes the messages per topic partition and enqueues processing jobs in threads using
         | 
| 181 219 | 
             
                  # given scheduler.
         | 
| 182 220 | 
             
                  def build_and_schedule_consumption_jobs
         | 
| @@ -185,13 +223,17 @@ module Karafka | |
| 185 223 | 
             
                    jobs = []
         | 
| 186 224 |  | 
| 187 225 | 
             
                    @messages_buffer.each do |topic, partition, messages|
         | 
| 188 | 
            -
                       | 
| 226 | 
            +
                      coordinator = @coordinators.find_or_create(topic, partition)
         | 
| 227 | 
            +
             | 
| 228 | 
            +
                      # Start work coordination for this topic partition
         | 
| 229 | 
            +
                      coordinator.start
         | 
| 189 230 |  | 
| 190 | 
            -
                       | 
| 231 | 
            +
                      # Count the job we're going to create here
         | 
| 232 | 
            +
                      coordinator.increment
         | 
| 191 233 |  | 
| 192 | 
            -
                      executor = @executors. | 
| 234 | 
            +
                      executor = @executors.find_or_create(topic, partition, 0)
         | 
| 193 235 |  | 
| 194 | 
            -
                      jobs <<  | 
| 236 | 
            +
                      jobs << @jobs_builder.consume(executor, messages, coordinator)
         | 
| 195 237 | 
             
                    end
         | 
| 196 238 |  | 
| 197 239 | 
             
                    @scheduler.schedule_consumption(@jobs_queue, jobs)
         | 
| @@ -227,7 +269,7 @@ module Karafka | |
| 227 269 | 
             
                    @jobs_queue.wait(@subscription_group.id)
         | 
| 228 270 | 
             
                    @jobs_queue.clear(@subscription_group.id)
         | 
| 229 271 | 
             
                    @client.reset
         | 
| 230 | 
            -
                    @ | 
| 272 | 
            +
                    @coordinators.reset
         | 
| 231 273 | 
             
                    @executors = Processing::ExecutorsBuffer.new(@client, @subscription_group)
         | 
| 232 274 | 
             
                  end
         | 
| 233 275 | 
             
                end
         | 
| @@ -25,6 +25,14 @@ module Karafka | |
| 25 25 | 
             
                    )
         | 
| 26 26 | 
             
                  end
         | 
| 27 27 |  | 
| 28 | 
            +
                  # Revokes pause tracker for a given topic partition
         | 
| 29 | 
            +
                  #
         | 
| 30 | 
            +
                  # @param topic [String] topic name
         | 
| 31 | 
            +
                  # @param partition [Integer] partition number
         | 
| 32 | 
            +
                  def revoke(topic, partition)
         | 
| 33 | 
            +
                    @pauses[topic].delete(partition)
         | 
| 34 | 
            +
                  end
         | 
| 35 | 
            +
             | 
| 28 36 | 
             
                  # Resumes processing of partitions for which pause time has ended.
         | 
| 29 37 | 
             
                  #
         | 
| 30 38 | 
             
                  # @yieldparam [String] topic name
         | 
| @@ -18,13 +18,15 @@ module Karafka | |
| 18 18 | 
             
                  # Empty array for internal usage not to create new objects
         | 
| 19 19 | 
             
                  EMPTY_ARRAY = [].freeze
         | 
| 20 20 |  | 
| 21 | 
            +
                  attr_reader :assigned_partitions, :revoked_partitions
         | 
| 22 | 
            +
             | 
| 21 23 | 
             
                  private_constant :EMPTY_ARRAY
         | 
| 22 24 |  | 
| 23 25 | 
             
                  # @return [RebalanceManager]
         | 
| 24 26 | 
             
                  def initialize
         | 
| 25 27 | 
             
                    @assigned_partitions = {}
         | 
| 26 28 | 
             
                    @revoked_partitions = {}
         | 
| 27 | 
            -
                    @ | 
| 29 | 
            +
                    @changed = false
         | 
| 28 30 | 
             
                  end
         | 
| 29 31 |  | 
| 30 32 | 
             
                  # Resets the rebalance manager state
         | 
| @@ -33,26 +35,12 @@ module Karafka | |
| 33 35 | 
             
                  def clear
         | 
| 34 36 | 
             
                    @assigned_partitions.clear
         | 
| 35 37 | 
             
                    @revoked_partitions.clear
         | 
| 36 | 
            -
                    @ | 
| 37 | 
            -
                  end
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                  # @return [Hash<String, Array<Integer>>] hash where the keys are the names of topics for
         | 
| 40 | 
            -
                  #   which we've lost partitions and array with ids of the partitions as the value
         | 
| 41 | 
            -
                  # @note We do not consider as lost topics and partitions that got revoked and assigned
         | 
| 42 | 
            -
                  def revoked_partitions
         | 
| 43 | 
            -
                    return @revoked_partitions if @revoked_partitions.empty?
         | 
| 44 | 
            -
                    return @lost_partitions unless @lost_partitions.empty?
         | 
| 45 | 
            -
             | 
| 46 | 
            -
                    @revoked_partitions.each do |topic, partitions|
         | 
| 47 | 
            -
                      @lost_partitions[topic] = partitions - @assigned_partitions.fetch(topic, EMPTY_ARRAY)
         | 
| 48 | 
            -
                    end
         | 
| 49 | 
            -
             | 
| 50 | 
            -
                    @lost_partitions
         | 
| 38 | 
            +
                    @changed = false
         | 
| 51 39 | 
             
                  end
         | 
| 52 40 |  | 
| 53 | 
            -
                  # @return [Boolean]  | 
| 54 | 
            -
                  def  | 
| 55 | 
            -
                     | 
| 41 | 
            +
                  # @return [Boolean] indicates a state change in the partitions assignment
         | 
| 42 | 
            +
                  def changed?
         | 
| 43 | 
            +
                    @changed
         | 
| 56 44 | 
             
                  end
         | 
| 57 45 |  | 
| 58 46 | 
             
                  # Callback that kicks in inside of rdkafka, when new partitions are assigned.
         | 
| @@ -62,6 +50,7 @@ module Karafka | |
| 62 50 | 
             
                  # @param partitions [Rdkafka::Consumer::TopicPartitionList]
         | 
| 63 51 | 
             
                  def on_partitions_assigned(_, partitions)
         | 
| 64 52 | 
             
                    @assigned_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
         | 
| 53 | 
            +
                    @changed = true
         | 
| 65 54 | 
             
                  end
         | 
| 66 55 |  | 
| 67 56 | 
             
                  # Callback that kicks in inside of rdkafka, when partitions are revoked.
         | 
| @@ -71,6 +60,18 @@ module Karafka | |
| 71 60 | 
             
                  # @param partitions [Rdkafka::Consumer::TopicPartitionList]
         | 
| 72 61 | 
             
                  def on_partitions_revoked(_, partitions)
         | 
| 73 62 | 
             
                    @revoked_partitions = partitions.to_h.transform_values { |part| part.map(&:partition) }
         | 
| 63 | 
            +
                    @changed = true
         | 
| 64 | 
            +
                  end
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                  # We consider as lost only partitions that were taken away and not re-assigned back to us
         | 
| 67 | 
            +
                  def lost_partitions
         | 
| 68 | 
            +
                    lost_partitions = {}
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                    revoked_partitions.each do |topic, partitions|
         | 
| 71 | 
            +
                      lost_partitions[topic] = partitions - assigned_partitions.fetch(topic, EMPTY_ARRAY)
         | 
| 72 | 
            +
                    end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                    lost_partitions
         | 
| 74 75 | 
             
                  end
         | 
| 75 76 | 
             
                end
         | 
| 76 77 | 
             
              end
         |