gitlab-sidekiq-fetcher 0.5.1 → 0.5.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: eb3707eefa697c806e40fc41b6406714341c5f8dd7115451391313a5f7f01725
- data.tar.gz: 51046486212181fd92a7cbc2f48f707391ab3e1438ad814b29f2a12c84c2b8cf
+ metadata.gz: 789f9e7424fe05ba56d1a5dc4aae2bd1cc750cb4c6ab3a0d87055876dec4ac63
+ data.tar.gz: c17bca12b0e63b47c3725000de1430f69cfdef6c4b5b791e40deacc0da0e2b33
  SHA512:
- metadata.gz: b7453b6ff9e45f2d9cc7b8ad3b120c17bb04093214c79967ed6a75698b26fe358bf10ab6424dff95f5cab2b3839ac20e5a2af23a01985f4fe715d2f00c3d086b
- data.tar.gz: 03afebd011c716cdc96b59b2b18e80514701f6bef3674ab3bdeb568b35c1730c5c8906a3eb40ad9c8aff93b971dd88b538898b326ece66642ecf64f879524088
+ metadata.gz: 9684d40185a5cd89a8ada02ae8563a0ae8a278866cfe59d11e5e2cc8cbcb610f29998dc4f9df0fc07720a723f6823c39e41f375542c0d9621fde87bbfccfb01c
+ data.tar.gz: 146e99c8e8fd388d56dbbd099658dceb8210101df31ab3d362eeeefa3fd52e75e469df24e1b06be5adae2231c9c978a84f2e0681b3a0bc91b8b5f5798c941f96
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
  *.gem
  coverage
+ .DS_Store
data/.gitlab-ci.yml CHANGED
@@ -25,7 +25,7 @@ rspec:
  .integration:
  stage: test
  script:
- - cd tests/reliability_test
+ - cd tests/reliability
  - bundle exec ruby reliability_test.rb
  services:
  - redis:alpine
@@ -47,19 +47,19 @@ integration_basic:
  variables:
  JOB_FETCHER: basic

- retry_test:
+ kill_interruption:
  stage: test
  script:
- - cd tests/retry_test
- - bundle exec ruby retry_test.rb
+ - cd tests/interruption
+ - bundle exec ruby test_kill_signal.rb
  services:
  - redis:alpine

- no_retry_test:
+ term_interruption:
  stage: test
  script:
- - cd tests/retry_test
- - bundle exec ruby no_retry_test.rb
+ - cd tests/interruption
+ - bundle exec ruby test_term_signal.rb
  services:
  - redis:alpine

data/Gemfile CHANGED
@@ -9,4 +9,5 @@ group :test do
  gem "pry"
  gem "sidekiq", '~> 5.0'
  gem 'simplecov', require: false
+ gem 'stub_env', '~> 1.0'
  end
data/Gemfile.lock CHANGED
@@ -36,6 +36,8 @@ GEM
  json (>= 1.8, < 3)
  simplecov-html (~> 0.10.0)
  simplecov-html (0.10.2)
+ stub_env (1.0.4)
+ rspec (>= 2.0, < 4.0)

  PLATFORMS
  ruby
@@ -45,6 +47,7 @@ DEPENDENCIES
  rspec (~> 3)
  sidekiq (~> 5.0)
  simplecov
+ stub_env (~> 1.0)

  BUNDLED WITH
  1.17.1
data/README.md CHANGED
@@ -10,6 +10,17 @@ There are two strategies implemented: [Reliable fetch](http://redis.io/commands/
  semi-reliable fetch that uses regular `brpop` and `lpush` to pick the job and put it to working queue. The main benefit of "Reliable" strategy is that `rpoplpush` is atomic, eliminating a race condition in which jobs can be lost.
  However, it comes at a cost because `rpoplpush` can't watch multiple lists at the same time so we need to iterate over the entire queue list which significantly increases pressure on Redis when there are more than a few queues. The "semi-reliable" strategy is much more reliable than the default Sidekiq fetcher, though. Compared to the reliable fetch strategy, it does not increase pressure on Redis significantly.

+ ### Interruption handling
+
+ Sidekiq expects any job to report success or failure. In the latter case, Sidekiq stores a `retry_count` counter
+ in the job and keeps re-running the job until the counter reaches the maximum allowed value. When the job has
+ not been given a chance to finish its work (to report success or failure), for example, when it was killed forcibly or requeued after receiving a TERM signal, the standard retry mechanism does not come into play and the job would be retried indefinitely. This is why Reliable Fetcher maintains a special counter, `interrupted_count`,
+ which is used to limit the number of such retries. In both cases, Reliable Fetcher increments `interrupted_count` and prevents the job from running again once the counter exceeds `max_retries_after_interruption` (default: 3).
+ Such a job is put into the `interrupted` queue. This queue behaves much like the Sidekiq Dead queue: it stores only a limited number of jobs for a limited time. As with the Dead queue, the limits are configurable via the `interrupted_max_jobs` (default: 10_000) and `interrupted_timeout_in_seconds` (default: 3 months) Sidekiq option keys.
+
+ You can also disable the special handling of interrupted jobs by setting `max_retries_after_interruption` to `-1`.
+ In that case, interrupted jobs are retried without any limit from Reliable Fetcher and are never put into the interrupted queue.
+

  ## Installation

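The counter check described in the README section above can be seen in isolation. A minimal plain-Ruby sketch mirroring the diff's `interruption_exhausted?` logic (the standalone method and keyword argument here are illustrative, not the gem's public API):

```ruby
# Sketch of the interruption-exhaustion check described above.
# Standalone illustration mirroring the gem's logic; not its public API.
DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION = 3

def interruption_exhausted?(msg, max_retries: DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION)
  # max_retries_after_interruption = -1 disables the limit entirely
  return false if max_retries < 0

  msg['interrupted_count'].to_i >= max_retries
end

job = { 'class' => 'MyWorker', 'interrupted_count' => 3 }
interruption_exhausted?(job)                  # => true: the job is quarantined
interruption_exhausted?(job, max_retries: -1) # => false: handling disabled
```

A job that fails this check is put into the `interrupted` sorted set instead of being requeued.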
@@ -1,14 +1,14 @@
  Gem::Specification.new do |s|
- s.name = 'gitlab-sidekiq-fetcher'
- s.version = '0.5.1'
- s.authors = ['TEA', 'GitLab']
- s.email = 'valery@gitlab.com'
- s.license = 'LGPL-3.0'
- s.homepage = 'https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/'
- s.summary = 'Reliable fetch extension for Sidekiq'
- s.description = 'Redis reliable queue pattern implemented in Sidekiq'
+ s.name = 'gitlab-sidekiq-fetcher'
+ s.version = '0.5.6'
+ s.authors = ['TEA', 'GitLab']
+ s.email = 'valery@gitlab.com'
+ s.license = 'LGPL-3.0'
+ s.homepage = 'https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/'
+ s.summary = 'Reliable fetch extension for Sidekiq'
+ s.description = 'Redis reliable queue pattern implemented in Sidekiq'
  s.require_paths = ['lib']
- s.files = `git ls-files`.split($\)
- s.test_files = []
+ s.files = `git ls-files`.split($\)
+ s.test_files = []
  s.add_dependency 'sidekiq', '~> 5'
  end
@@ -1,4 +1,5 @@
  require 'sidekiq'
+ require 'sidekiq/api'

  require_relative 'sidekiq/base_reliable_fetch'
  require_relative 'sidekiq/reliable_fetch'
@@ -1,6 +1,6 @@
  # frozen_string_literal: true

- require 'sidekiq/job_retry'
+ require_relative 'interrupted_set'

  module Sidekiq
  class BaseReliableFetch
@@ -18,6 +18,13 @@ module Sidekiq
  # Defines the COUNT parameter that will be passed to Redis SCAN command
  SCAN_COUNT = 1000

+ # How many times a job may be interrupted before it is quarantined
+ DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION = 3
+
+ # Regexes for matching working queue keys
+ WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*:[0-9a-f]*)\z/.freeze
+ LEGACY_WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*)\z/.freeze
+
  UnitOfWork = Struct.new(:queue, :job) do
  def acknowledge
  Sidekiq.redis { |conn| conn.lrem(Sidekiq::BaseReliableFetch.working_queue_name(queue), 1, job) }
@@ -65,162 +72,178 @@ module Sidekiq
  end
  end

- def self.pid
- @pid ||= ::Process.pid
+ def self.hostname
+ Socket.gethostname
+ end
+
+ def self.process_nonce
+ @@process_nonce ||= SecureRandom.hex(6)
  end

- def self.hostname
- @hostname ||= Socket.gethostname
+ def self.identity
+ @@identity ||= "#{hostname}:#{$$}:#{process_nonce}"
  end

  def self.heartbeat
  Sidekiq.redis do |conn|
- conn.set(heartbeat_key(hostname, pid), 1, ex: HEARTBEAT_LIFESPAN)
+ conn.set(heartbeat_key(identity), 1, ex: HEARTBEAT_LIFESPAN)
  end

- Sidekiq.logger.debug("Heartbeat for hostname: #{hostname} and pid: #{pid}")
+ Sidekiq.logger.debug("Heartbeat for #{identity}")
  end

  def self.bulk_requeue(inprogress, _options)
  return if inprogress.empty?

- Sidekiq.logger.debug('Re-queueing terminated jobs')
-
  Sidekiq.redis do |conn|
  inprogress.each do |unit_of_work|
  conn.multi do |multi|
- multi.lpush(unit_of_work.queue, unit_of_work.job)
+ preprocess_interrupted_job(unit_of_work.job, unit_of_work.queue, multi)
+
  multi.lrem(working_queue_name(unit_of_work.queue), 1, unit_of_work.job)
  end
  end
  end
-
- Sidekiq.logger.info("Pushed #{inprogress.size} jobs back to Redis")
  rescue => e
  Sidekiq.logger.warn("Failed to requeue #{inprogress.size} jobs: #{e.message}")
  end

- def self.heartbeat_key(hostname, pid)
- "reliable-fetcher-heartbeat-#{hostname}-#{pid}"
- end
-
- def self.working_queue_name(queue)
- "#{WORKING_QUEUE_PREFIX}:#{queue}:#{hostname}:#{pid}"
+ def self.clean_working_queue!(original_queue, working_queue)
+ Sidekiq.redis do |conn|
+ while job = conn.rpop(working_queue)
+ preprocess_interrupted_job(job, original_queue)
+ end
+ end
  end

- attr_reader :cleanup_interval, :last_try_to_take_lease_at, :lease_interval,
- :queues, :use_semi_reliable_fetch,
- :strictly_ordered_queues
+ def self.preprocess_interrupted_job(job, queue, conn = nil)
+ msg = Sidekiq.load_json(job)
+ msg['interrupted_count'] = msg['interrupted_count'].to_i + 1

- def initialize(options)
- @cleanup_interval = options.fetch(:cleanup_interval, DEFAULT_CLEANUP_INTERVAL)
- @lease_interval = options.fetch(:lease_interval, DEFAULT_LEASE_INTERVAL)
- @last_try_to_take_lease_at = 0
- @strictly_ordered_queues = !!options[:strict]
- @queues = options[:queues].map { |q| "queue:#{q}" }
+ if interruption_exhausted?(msg)
+ send_to_quarantine(msg, conn)
+ else
+ requeue_job(queue, msg, conn)
+ end
  end

- def retrieve_work
- clean_working_queues! if take_lease
+ def self.extract_queue_and_identity(key)
+ # New identity format is "{hostname}:{pid}:{randomhex}"
+ # Old identity format is "{hostname}:{pid}"
+ # Queue names may also have colons (namespaced).
+ # Expressing this in a single regex is unreadable

- retrieve_unit_of_work
- end
+ # Test the newer expected format first, only checking the older if necessary
+ original_queue, identity = key.scan(WORKING_QUEUE_REGEX).flatten
+ return original_queue, identity unless original_queue.nil? || identity.nil?

- def retrieve_unit_of_work
- raise NotImplementedError,
- "#{self.class} does not implement #{__method__}"
+ key.scan(LEGACY_WORKING_QUEUE_REGEX).flatten
  end

- private
-
- def clean_working_queue!(working_queue)
- original_queue = working_queue.gsub(/#{WORKING_QUEUE_PREFIX}:|:[^:]*:[0-9]*\z/, '')
+ # Detect "old" jobs and requeue them because the worker they were assigned
+ # to probably failed miserably.
+ def self.clean_working_queues!
+ Sidekiq.logger.info('Cleaning working queues')

  Sidekiq.redis do |conn|
- count = 0
-
- while job = conn.rpop(working_queue)
- msg = begin
- Sidekiq.load_json(job)
- rescue => e
- Sidekiq.logger.info("Skipped job: #{job} as we couldn't parse it")
- next
- end
-
- msg['retry_count'] = msg['retry_count'].to_i + 1
-
- if retries_exhausted?(msg)
- send_to_morgue(msg)
- else
- job = Sidekiq.dump_json(msg)
+ conn.scan_each(match: "#{WORKING_QUEUE_PREFIX}:queue:*", count: SCAN_COUNT) do |key|
+ original_queue, identity = extract_queue_and_identity(key)

- conn.lpush(original_queue, job)
+ next if original_queue.nil? || identity.nil?

- count += 1
- end
+ clean_working_queue!(original_queue, key) if worker_dead?(identity, conn)
  end
-
- Sidekiq.logger.info("Requeued #{count} dead jobs to #{original_queue}")
  end
  end

- def retries_exhausted?(msg)
- # `retry` parameter can be empty when job is running the first time and when
- # it's not specified in worker class explicitly.
- # In that case, the default parameter gets injected into the job when
- # it fails the first time in JobRetry#local.
- # We should handle the case when `retry` is explicitly set to false
- return true if msg['retry'] === false
+ def self.worker_dead?(identity, conn)
+ !conn.get(heartbeat_key(identity))
+ end
+
+ def self.heartbeat_key(identity)
+ "reliable-fetcher-heartbeat-#{identity.gsub(':', '-')}"
+ end

- max_retries_default = Sidekiq.options.fetch(:max_retries, Sidekiq::JobRetry::DEFAULT_MAX_RETRY_ATTEMPTS)
+ def self.working_queue_name(queue)
+ "#{WORKING_QUEUE_PREFIX}:#{queue}:#{identity}"
+ end

- max_retry_attempts = retry_attempts_from(msg['retry'], max_retries_default)
+ def self.interruption_exhausted?(msg)
+ return false if max_retries_after_interruption(msg['class']) < 0

- msg['retry_count'] >= max_retry_attempts
+ msg['interrupted_count'].to_i >= max_retries_after_interruption(msg['class'])
  end

- def retry_attempts_from(msg_retry, default)
- if msg_retry.is_a?(Integer)
- msg_retry
- else
- default
+ def self.max_retries_after_interruption(worker_class)
+ max_retries_after_interruption = nil
+
+ max_retries_after_interruption ||= begin
+ Object.const_get(worker_class).sidekiq_options[:max_retries_after_interruption]
+ rescue NameError
  end
+
+ max_retries_after_interruption ||= Sidekiq.options[:max_retries_after_interruption]
+ max_retries_after_interruption ||= DEFAULT_MAX_RETRIES_AFTER_INTERRUPTION
+ max_retries_after_interruption
  end

- def send_to_morgue(msg)
+ def self.send_to_quarantine(msg, multi_connection = nil)
  Sidekiq.logger.warn(
  class: msg['class'],
  jid: msg['jid'],
- message: %(Reliable Fetcher: adding dead #{msg['class']} job #{msg['jid']})
+ message: %(Reliable Fetcher: adding dead #{msg['class']} job #{msg['jid']} to interrupted queue)
  )

- payload = Sidekiq.dump_json(msg)
- Sidekiq::DeadSet.new.kill(payload, notify_failure: false)
+ job = Sidekiq.dump_json(msg)
+ Sidekiq::InterruptedSet.new.put(job, connection: multi_connection)
  end

- # Detect "old" jobs and requeue them because the worker they were assigned
- # to probably failed miserably.
- def clean_working_queues!
- Sidekiq.logger.info("Cleaning working queues")
+ # If you want this method to run in the scope of a multi connection,
+ # pass the connection in
+ def self.requeue_job(queue, msg, conn)
+ with_connection(conn) do |conn|
+ conn.lpush(queue, Sidekiq.dump_json(msg))
+ end

- Sidekiq.redis do |conn|
- conn.scan_each(match: "#{WORKING_QUEUE_PREFIX}:queue:*", count: SCAN_COUNT) do |key|
- # Example: "working:name_of_the_job:queue:{hostname}:{PID}"
- hostname, pid = key.scan(/:([^:]*):([0-9]*)\z/).flatten
+ Sidekiq.logger.info(
+ message: "Pushed job #{msg['jid']} back to queue #{queue}",
+ jid: msg['jid'],
+ queue: queue
+ )
+ end

- continue if hostname.nil? || pid.nil?
+ # Yield block with an existing connection or create another one
+ def self.with_connection(conn, &block)
+ return yield(conn) if conn

- clean_working_queue!(key) if worker_dead?(hostname, pid)
- end
- end
+ Sidekiq.redis { |conn| yield(conn) }
  end

- def worker_dead?(hostname, pid)
- Sidekiq.redis do |conn|
- !conn.get(self.class.heartbeat_key(hostname, pid))
- end
+ attr_reader :cleanup_interval, :last_try_to_take_lease_at, :lease_interval,
+ :queues, :use_semi_reliable_fetch,
+ :strictly_ordered_queues
+
+ def initialize(options)
+ @cleanup_interval = options.fetch(:cleanup_interval, DEFAULT_CLEANUP_INTERVAL)
+ @lease_interval = options.fetch(:lease_interval, DEFAULT_LEASE_INTERVAL)
+ @last_try_to_take_lease_at = 0
+ @strictly_ordered_queues = !!options[:strict]
+ @queues = options[:queues].map { |q| "queue:#{q}" }
  end

+ def retrieve_work
+ self.class.clean_working_queues! if take_lease
+
+ retrieve_unit_of_work
+ end
+
+ def retrieve_unit_of_work
+ raise NotImplementedError,
+ "#{self.class} does not implement #{__method__}"
+ end
+
+ private
+
  def take_lease
  return unless allowed_to_take_a_lease?

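The working-queue key parsing introduced in this file (new `{hostname}:{pid}:{randomhex}` identities, legacy `{hostname}:{pid}` identities, and namespaced queue names containing colons) can be exercised standalone. A sketch copying the regexes and fallback order from the diff so the snippet runs without the gem:

```ruby
# Working-queue key parsing from the diff, extracted to run standalone.
# WORKING_QUEUE_PREFIX value is an assumption for illustration.
WORKING_QUEUE_PREFIX = 'working'
WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*:[0-9a-f]*)\z/.freeze
LEGACY_WORKING_QUEUE_REGEX = /#{WORKING_QUEUE_PREFIX}:(queue:.*):([^:]*:[0-9]*)\z/.freeze

def extract_queue_and_identity(key)
  # Try the newer "{hostname}:{pid}:{randomhex}" identity first
  original_queue, identity = key.scan(WORKING_QUEUE_REGEX).flatten
  return original_queue, identity unless original_queue.nil? || identity.nil?

  # Fall back to the legacy "{hostname}:{pid}" identity
  key.scan(LEGACY_WORKING_QUEUE_REGEX).flatten
end

extract_queue_and_identity('working:queue:mailers:host1:123:abc123')
# => ["queue:mailers", "host1:123:abc123"]
extract_queue_and_identity('working:queue:ns:mailers:host1:123')
# => ["queue:ns:mailers", "host1:123"]
```

The greedy first capture group is what lets namespaced queue names keep their internal colons while the identity is peeled off the tail.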
@@ -0,0 +1,47 @@
+ require 'sidekiq/api'
+
+ module Sidekiq
+ class InterruptedSet < ::Sidekiq::JobSet
+ DEFAULT_MAX_CAPACITY = 10_000
+ DEFAULT_MAX_TIMEOUT = 90 * 24 * 60 * 60 # 3 months
+
+ def initialize
+ super "interrupted"
+ end
+
+ def put(message, opts = {})
+ now = Time.now.to_f
+
+ with_multi_connection(opts[:connection]) do |conn|
+ conn.zadd(name, now.to_s, message)
+ conn.zremrangebyscore(name, '-inf', now - self.class.timeout)
+ conn.zremrangebyrank(name, 0, - self.class.max_jobs)
+ end
+
+ true
+ end
+
+ # Yield block inside an existing multi connection or create a new one
+ def with_multi_connection(conn, &block)
+ return yield(conn) if conn
+
+ Sidekiq.redis do |c|
+ c.multi do |multi|
+ yield(multi)
+ end
+ end
+ end
+
+ def retry_all
+ each(&:retry) while size > 0
+ end
+
+ def self.max_jobs
+ Sidekiq.options[:interrupted_max_jobs] || DEFAULT_MAX_CAPACITY
+ end
+
+ def self.timeout
+ Sidekiq.options[:interrupted_timeout_in_seconds] || DEFAULT_MAX_TIMEOUT
+ end
+ end
+ end
@@ -5,14 +5,14 @@ module Sidekiq
  # We want the fetch operation to timeout every few seconds so the thread
  # can check if the process is shutting down. This constant is only used
  # for semi-reliable fetch.
- SEMI_RELIABLE_FETCH_TIMEOUT = 2 # seconds
+ DEFAULT_SEMI_RELIABLE_FETCH_TIMEOUT = 2 # seconds

  def initialize(options)
  super

  if strictly_ordered_queues
  @queues = @queues.uniq
- @queues << SEMI_RELIABLE_FETCH_TIMEOUT
+ @queues << semi_reliable_fetch_timeout
  end
  end

@@ -36,9 +36,13 @@ module Sidekiq
  @queues
  else
  queues = @queues.shuffle.uniq
- queues << SEMI_RELIABLE_FETCH_TIMEOUT
+ queues << semi_reliable_fetch_timeout
  queues
  end
  end
+
+ def semi_reliable_fetch_timeout
+ @semi_reliable_fetch_timeout ||= ENV['SIDEKIQ_SEMI_RELIABLE_FETCH_TIMEOUT']&.to_i || DEFAULT_SEMI_RELIABLE_FETCH_TIMEOUT
+ end
  end
  end
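The env-var override added here is a simple fallback chain: environment variable first, hard-coded default second. A standalone sketch (env var name as in the diff; no Sidekiq required):

```ruby
# Timeout lookup from the diff: an env-var override falling back to a default.
DEFAULT_SEMI_RELIABLE_FETCH_TIMEOUT = 2 # seconds

def semi_reliable_fetch_timeout
  ENV['SIDEKIQ_SEMI_RELIABLE_FETCH_TIMEOUT']&.to_i || DEFAULT_SEMI_RELIABLE_FETCH_TIMEOUT
end

semi_reliable_fetch_timeout # => 2 when the variable is unset
```

The gem memoizes the result into `@semi_reliable_fetch_timeout`; the memoization is dropped here since the sketch has no instance state.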
@@ -5,10 +5,11 @@ require 'sidekiq/reliable_fetch'
  require 'sidekiq/semi_reliable_fetch'

  describe Sidekiq::BaseReliableFetch do
+ let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
+
  before { Sidekiq.redis(&:flushdb) }

  describe 'UnitOfWork' do
- let(:job) { Sidekiq.dump_json({ class: 'Bob', args: [1, 2, 'foo'] }) }
  let(:fetcher) { Sidekiq::ReliableFetch.new(queues: ['foo']) }

  describe '#requeue' do
@@ -39,19 +40,42 @@ describe Sidekiq::BaseReliableFetch do
  end

  describe '.bulk_requeue' do
+ let!(:queue1) { Sidekiq::Queue.new('foo') }
+ let!(:queue2) { Sidekiq::Queue.new('bar') }
+
  it 'requeues the bulk' do
- queue1 = Sidekiq::Queue.new('foo')
- queue2 = Sidekiq::Queue.new('bar')
+ uow = described_class::UnitOfWork
+ jobs = [ uow.new('queue:foo', job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
+ described_class.bulk_requeue(jobs, queues: [])

- expect(queue1.size).to eq 0
- expect(queue2.size).to eq 0
+ expect(queue1.size).to eq 2
+ expect(queue2.size).to eq 1
+ end

+ it 'puts jobs into interrupted queue' do
  uow = described_class::UnitOfWork
- jobs = [ uow.new('queue:foo', 'bob'), uow.new('queue:foo', 'bar'), uow.new('queue:bar', 'widget') ]
+ interrupted_job = Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo'], interrupted_count: 3)
+ jobs = [ uow.new('queue:foo', interrupted_job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
+ described_class.bulk_requeue(jobs, queues: [])
+
+ expect(queue1.size).to eq 1
+ expect(queue2.size).to eq 1
+ expect(Sidekiq::InterruptedSet.new.size).to eq 1
+ end
+
+ it 'does not put jobs into interrupted queue if it is disabled' do
+ Sidekiq.options[:max_retries_after_interruption] = -1
+
+ uow = described_class::UnitOfWork
+ interrupted_job = Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo'], interrupted_count: 3)
+ jobs = [ uow.new('queue:foo', interrupted_job), uow.new('queue:foo', job), uow.new('queue:bar', job) ]
  described_class.bulk_requeue(jobs, queues: [])

  expect(queue1.size).to eq 2
  expect(queue2.size).to eq 1
+ expect(Sidekiq::InterruptedSet.new.size).to eq 0
+
+ Sidekiq.options[:max_retries_after_interruption] = 3
  end
  end

@@ -63,7 +87,7 @@ describe Sidekiq::BaseReliableFetch do
  Sidekiq.redis do |conn|
  sleep 0.2 # Give the heartbeat thread time to make a loop
- heartbeat_key = described_class.heartbeat_key(Socket.gethostname, ::Process.pid)
+ heartbeat_key = described_class.heartbeat_key(described_class.identity)
  heartbeat = conn.get(heartbeat_key)

  expect(heartbeat).not_to be_nil
@@ -4,7 +4,7 @@ shared_examples 'a Sidekiq fetcher' do
  before { Sidekiq.redis(&:flushdb) }

  describe '#retrieve_work' do
- let(:job) { Sidekiq.dump_json({ class: 'Bob', args: [1, 2, 'foo'] }) }
+ let(:job) { Sidekiq.dump_json(class: 'Bob', args: [1, 2, 'foo']) }
  let(:fetcher) { described_class.new(queues: ['assigned']) }

  it 'retrieves the job and puts it to working queue' do
@@ -24,17 +24,18 @@ shared_examples 'a Sidekiq fetcher' do
  expect(fetcher.retrieve_work).to be_nil
  end

- it 'requeues jobs from dead working queue with incremented retry_count' do
+ it 'requeues jobs from dead working queue with incremented interrupted_count' do
  Sidekiq.redis do |conn|
  conn.rpush(other_process_working_queue_name('assigned'), job)
  end

  expected_job = Sidekiq.load_json(job)
- expected_job['retry_count'] = 1
+ expected_job['interrupted_count'] = 1
  expected_job = Sidekiq.dump_json(expected_job)

  uow = fetcher.retrieve_work

+ expect(uow).to_not be_nil
  expect(uow.job).to eq expected_job

  Sidekiq.redis do |conn|
@@ -42,6 +43,40 @@ shared_examples 'a Sidekiq fetcher' do
  end
  end

+ it 'ignores working queue keys in unknown formats' do
+ # Add a spurious non-numeric char segment at the end; this simulates any other
+ # incorrect form in general
+ malformed_key = "#{other_process_working_queue_name('assigned')}:X"
+ Sidekiq.redis do |conn|
+ conn.rpush(malformed_key, job)
+ end
+
+ uow = fetcher.retrieve_work
+
+ Sidekiq.redis do |conn|
+ expect(conn.llen(malformed_key)).to eq 1
+ end
+ end
+
+ it 'requeues jobs from legacy dead working queue with incremented interrupted_count' do
+ Sidekiq.redis do |conn|
+ conn.rpush(legacy_other_process_working_queue_name('assigned'), job)
+ end
+
+ expected_job = Sidekiq.load_json(job)
+ expected_job['interrupted_count'] = 1
+ expected_job = Sidekiq.dump_json(expected_job)
+
+ uow = fetcher.retrieve_work
+
+ expect(uow).to_not be_nil
+ expect(uow.job).to eq expected_job
+
+ Sidekiq.redis do |conn|
+ expect(conn.llen(legacy_other_process_working_queue_name('assigned'))).to eq 0
+ end
+ end
+
  it 'does not requeue jobs from live working queue' do
  working_queue = live_other_process_working_queue_name('assigned')

@@ -61,8 +96,7 @@ shared_examples 'a Sidekiq fetcher' do
  it 'does not clean up orphaned jobs more than once per cleanup interval' do
  Sidekiq.redis = Sidekiq::RedisConnection.create(url: REDIS_URL, size: 10)

- expect_any_instance_of(described_class)
- .to receive(:clean_working_queues!).once
+ expect(described_class).to receive(:clean_working_queues!).once

  threads = 10.times.map do
  Thread.new do
@@ -98,6 +132,104 @@ shared_examples 'a Sidekiq fetcher' do

  expect(jobs).to include 'this_job_should_not_stuck'
  end
+
+ context 'with namespaced queues' do
+ let (:queue) { 'namespace:assigned' }
+ let (:fetcher) { described_class.new(queues: [queue]) }
+
+ it 'requeues jobs from dead namespaced working queue with incremented interrupted_count' do
+ Sidekiq.redis do |conn|
+ conn.rpush(other_process_working_queue_name(queue), job)
+ end
+
+ expected_job = Sidekiq.load_json(job)
+ expected_job['interrupted_count'] = 1
+ expected_job = Sidekiq.dump_json(expected_job)
+
+ uow = fetcher.retrieve_work
+
+ expect(uow).to_not be_nil
+ expect(uow.job).to eq expected_job
+
+ Sidekiq.redis do |conn|
+ expect(conn.llen(other_process_working_queue_name(queue))).to eq 0
+ end
+ end
+
+ it 'does not requeue jobs in a namespaced queue from live working queue' do
+ working_queue = live_other_process_working_queue_name(queue)
+
+ Sidekiq.redis do |conn|
+ conn.rpush(working_queue, job)
+ end
+
+ uow = fetcher.retrieve_work
+
+ expect(uow).to be_nil
+
+ Sidekiq.redis do |conn|
+ expect(conn.llen(working_queue)).to eq 1
+ end
+ end
+ end
+
+ context 'with deeper namespaced queues' do
+ let (:queue) { 'deep:namespace:assigned' }
+ let (:fetcher) { described_class.new(queues: [queue]) }
+
+ it 'requeues jobs from dead namespaced working queue with incremented interrupted_count' do
+ Sidekiq.redis do |conn|
+ conn.rpush(other_process_working_queue_name(queue), job)
+ end
+
+ expected_job = Sidekiq.load_json(job)
+ expected_job['interrupted_count'] = 1
+ expected_job = Sidekiq.dump_json(expected_job)
+
+ uow = fetcher.retrieve_work
+
+ expect(uow).to_not be_nil
+ expect(uow.job).to eq expected_job
+
+ Sidekiq.redis do |conn|
+ expect(conn.llen(other_process_working_queue_name(queue))).to eq 0
+ end
+ end
+
+ it 'does not requeue jobs in a deeper namespaced queue from live working queue' do
+ working_queue = live_other_process_working_queue_name(queue)
+
+ Sidekiq.redis do |conn|
+ conn.rpush(working_queue, job)
+ end
+
+ uow = fetcher.retrieve_work
+
+ expect(uow).to be_nil
+
+ Sidekiq.redis do |conn|
+ expect(conn.llen(working_queue)).to eq 1
+ end
+ end
+ end
+
+ context 'with short cleanup interval' do
+ let(:short_interval) { 1 }
+ let(:fetcher) { described_class.new(queues: queues, lease_interval: short_interval, cleanup_interval: short_interval) }
+
+ it 'requeues when there is no heartbeat' do
+ Sidekiq.redis { |conn| conn.rpush('queue:assigned', job) }
+ # Use of retrieve_work twice with a sleep ensures we have exercised the
+ # `identity` method to create the working queue key name and that it
+ # matches the patterns used in the cleanup
+ uow = fetcher.retrieve_work
+ sleep(short_interval + 1)
+ uow = fetcher.retrieve_work
+
+ # Will only receive a UnitOfWork if the job was detected as failed and requeued
+ expect(uow).to_not be_nil
+ end
+ end
  end
  end

@@ -107,17 +239,23 @@ def working_queue_size(queue_name)
  end
  end

- def other_process_working_queue_name(queue)
+ def legacy_other_process_working_queue_name(queue)
  "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}"
  end

+
+ def other_process_working_queue_name(queue)
+ "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{Socket.gethostname}:#{::Process.pid + 1}:#{::SecureRandom.hex(6)}"
+ end
+
  def live_other_process_working_queue_name(queue)
  pid = ::Process.pid + 1
  hostname = Socket.gethostname
+ nonce = SecureRandom.hex(6)

  Sidekiq.redis do |conn|
- conn.set(Sidekiq::BaseReliableFetch.heartbeat_key(hostname, pid), 1)
+ conn.set(Sidekiq::BaseReliableFetch.heartbeat_key("#{hostname}-#{pid}-#{nonce}"), 1)
  end

- "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}"
+ "#{Sidekiq::BaseReliableFetch::WORKING_QUEUE_PREFIX}:queue:#{queue}:#{hostname}:#{pid}:#{nonce}"
  end
@@ -5,4 +5,39 @@ require 'sidekiq/semi_reliable_fetch'

  describe Sidekiq::SemiReliableFetch do
  include_examples 'a Sidekiq fetcher'
+
+ describe '#retrieve_work' do
+ context 'timeout config' do
+ let(:queues) { ['stuff_to_do'] }
+ let(:fetcher) { described_class.new(queues: queues) }
+
+ before do
+ stub_env('SIDEKIQ_SEMI_RELIABLE_FETCH_TIMEOUT', timeout)
+ end
+
+ context 'when the timeout is not configured' do
+ let(:timeout) { nil }
+
+ it 'brpops with the default timeout' do
+ Sidekiq.redis do |connection|
+ expect(connection).to receive(:brpop).with("queue:stuff_to_do", 2).once.and_call_original
+
+ fetcher.retrieve_work
+ end
+ end
+ end
+
+ context 'when the timeout is set in the env' do
+ let(:timeout) { '5' }
+
+ it 'brpops with the configured timeout' do
+ Sidekiq.redis do |connection|
+ expect(connection).to receive(:brpop).with("queue:stuff_to_do", 5).once.and_call_original
+
+ fetcher.retrieve_work
+ end
+ end
+ end
+ end
+ end
  end
data/spec/spec_helper.rb CHANGED
@@ -3,6 +3,7 @@ require 'sidekiq/util'
  require 'sidekiq/api'
  require 'pry'
  require 'simplecov'
+ require 'stub_env'

  SimpleCov.start

@@ -29,6 +30,7 @@ Sidekiq.logger.level = Logger::ERROR
  #
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
  RSpec.configure do |config|
+ config.include StubEnv::Helpers
  # rspec-expectations config goes here. You can use an alternate
  # assertion/expectation library such as wrong or the stdlib/minitest
  # assertions if you prefer.
data/tests/README.md CHANGED
@@ -18,18 +18,20 @@ You need to have redis server running on default HTTP port `6379`. To use other
  This tool spawns a configured number of Sidekiq workers, and when the number of processed jobs is about half of the original
  number it kills all the workers with `kill -9` and then spawns new workers again until all the jobs are processed. To track the progress and counters we use Redis keys/counters.

- # How to run retry tests
+ # How to run interruption tests

  ```
- cd retry_test
- bundle exec ruby retry_test.rb
+ cd tests/interruption

- # To verify that workers with "retry: false" are not retried
- bundle exec ruby no_retry_test.rb
+ # Verify "KILL" signal
+ bundle exec ruby test_kill_signal.rb
+
+ # Verify "TERM" signal
+ bundle exec ruby test_term_signal.rb
  ```

  It requires Redis to be running on port 6379.

  ## How it works

- It spawns Sidekiq workers then creates a job that will kill itself after a moment. The reliable fetcher will bring it back. The purpose is to verify that the job is run no more than the `retry` parameter says, even when the job was killed.
+ It spawns Sidekiq workers then creates a job that will kill itself after a moment. The reliable fetcher will bring it back. The purpose is to verify that the job is run no more than the allowed number of times.
@@ -2,7 +2,6 @@

 require_relative '../../lib/sidekiq-reliable-fetch'
 require_relative 'worker'
-require_relative 'no_retry_worker'

 TEST_CLEANUP_INTERVAL = 20
 TEST_LEASE_INTERVAL = 5
@@ -4,21 +4,22 @@ require 'sidekiq'
 require_relative 'config'
 require_relative '../support/utils'

-NUM_WORKERS = 2 # one worker will be killed and one spare worker t verify that job is not picked up
+EXPECTED_NUM_TIMES_BEEN_RUN = 3
+NUM_WORKERS = EXPECTED_NUM_TIMES_BEEN_RUN + 1

 Sidekiq.redis(&:flushdb)

 pids = spawn_workers(NUM_WORKERS)

-jid = NoRetryTestWorker.perform_async
+RetryTestWorker.perform_async

 sleep 300

 Sidekiq.redis do |redis|
   times_has_been_run = redis.get('times_has_been_run').to_i
-  assert 'The job has been run', times_has_been_run, 1
+  assert 'The job has been run', times_has_been_run, EXPECTED_NUM_TIMES_BEEN_RUN
 end

-assert 'Found dead jobs', Sidekiq::DeadSet.new.size, 1
+assert 'Found interruption exhausted jobs', Sidekiq::InterruptedSet.new.size, 1

 stop_workers(pids)
@@ -4,21 +4,22 @@ require 'sidekiq'
 require_relative 'config'
 require_relative '../support/utils'

-NUM_WORKERS = RetryTestWorker::EXPECTED_NUM_TIMES_BEEN_RUN + 1
+EXPECTED_NUM_TIMES_BEEN_RUN = 3
+NUM_WORKERS = EXPECTED_NUM_TIMES_BEEN_RUN + 1

 Sidekiq.redis(&:flushdb)

 pids = spawn_workers(NUM_WORKERS)

-jid = RetryTestWorker.perform_async
+RetryTestWorker.perform_async('TERM', 60)

 sleep 300

 Sidekiq.redis do |redis|
   times_has_been_run = redis.get('times_has_been_run').to_i
-  assert 'The job has been run', times_has_been_run, RetryTestWorker::EXPECTED_NUM_TIMES_BEEN_RUN
+  assert 'The job has been run', times_has_been_run, EXPECTED_NUM_TIMES_BEEN_RUN
 end

-assert 'Found dead jobs', Sidekiq::DeadSet.new.size, 1
+assert 'Found interruption exhausted jobs', Sidekiq::InterruptedSet.new.size, 1

 stop_workers(pids)
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+class RetryTestWorker
+  include Sidekiq::Worker
+
+  def perform(signal = 'KILL', wait_seconds = 1)
+    Sidekiq.redis do |redis|
+      redis.incr('times_has_been_run')
+    end
+
+    Process.kill(signal, Process.pid)
+
+    sleep wait_seconds
+  end
+end
metadata CHANGED
@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: gitlab-sidekiq-fetcher
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.5.6
 platform: ruby
 authors:
 - TEA
 - GitLab
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-08-06 00:00:00.000000000 Z
+date: 2021-03-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sidekiq
@@ -42,6 +42,7 @@ files:
 - gitlab-sidekiq-fetcher.gemspec
 - lib/sidekiq-reliable-fetch.rb
 - lib/sidekiq/base_reliable_fetch.rb
+- lib/sidekiq/interrupted_set.rb
 - lib/sidekiq/reliable_fetch.rb
 - lib/sidekiq/semi_reliable_fetch.rb
 - spec/base_reliable_fetch_spec.rb
@@ -50,20 +51,19 @@ files:
 - spec/semi_reliable_fetch_spec.rb
 - spec/spec_helper.rb
 - tests/README.md
-- tests/reliability_test/config.rb
-- tests/reliability_test/reliability_test.rb
-- tests/reliability_test/worker.rb
-- tests/retry_test/config.rb
-- tests/retry_test/no_retry_test.rb
-- tests/retry_test/no_retry_worker.rb
-- tests/retry_test/retry_test.rb
-- tests/retry_test/worker.rb
+- tests/interruption/config.rb
+- tests/interruption/test_kill_signal.rb
+- tests/interruption/test_term_signal.rb
+- tests/interruption/worker.rb
+- tests/reliability/config.rb
+- tests/reliability/reliability_test.rb
+- tests/reliability/worker.rb
 - tests/support/utils.rb
 homepage: https://gitlab.com/gitlab-org/sidekiq-reliable-fetch/
 licenses:
 - LGPL-3.0
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -79,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   version: '0'
 requirements: []
 rubygems_version: 3.0.3
-signing_key:
+signing_key:
 specification_version: 4
 summary: Reliable fetch extension for Sidekiq
 test_files: []
@@ -1,21 +0,0 @@
-# frozen_string_literal: true
-
-class NoRetryTestWorker
-  include Sidekiq::Worker
-
-  sidekiq_options retry: false
-
-  sidekiq_retry_in do |count, exception|
-    1 # retry in one second
-  end
-
-  def perform
-    sleep 1
-
-    Sidekiq.redis do |redis|
-      redis.incr('times_has_been_run')
-    end
-
-    Process.kill('KILL', Process.pid) # Job suicide, OOM killer imitation
-  end
-end
@@ -1,23 +0,0 @@
-# frozen_string_literal: true
-
-class RetryTestWorker
-  include Sidekiq::Worker
-
-  EXPECTED_NUM_TIMES_BEEN_RUN = 2
-
-  sidekiq_options retry: EXPECTED_NUM_TIMES_BEEN_RUN
-
-  sidekiq_retry_in do |count, exception|
-    1 # retry in one second
-  end
-
-  def perform
-    sleep 1
-
-    Sidekiq.redis do |redis|
-      redis.incr('times_has_been_run')
-    end
-
-    Process.kill('KILL', Process.pid) # Job suicide, OOM killer imitation
-  end
-end