pgbus 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +326 -11
- data/app/controllers/pgbus/api/insights_controller.rb +16 -0
- data/app/controllers/pgbus/insights_controller.rb +10 -0
- data/app/controllers/pgbus/locks_controller.rb +9 -0
- data/app/helpers/pgbus/application_helper.rb +28 -0
- data/app/models/pgbus/job_lock.rb +82 -0
- data/app/models/pgbus/job_stat.rb +94 -0
- data/app/views/layouts/pgbus/application.html.erb +32 -8
- data/app/views/pgbus/dashboard/_stats_cards.html.erb +20 -20
- data/app/views/pgbus/insights/show.html.erb +161 -0
- data/app/views/pgbus/locks/index.html.erb +53 -0
- data/config/routes.rb +3 -0
- data/lib/generators/pgbus/add_job_locks_generator.rb +52 -0
- data/lib/generators/pgbus/add_job_stats_generator.rb +52 -0
- data/lib/generators/pgbus/add_outbox_generator.rb +1 -1
- data/lib/generators/pgbus/add_queue_states_generator.rb +1 -1
- data/lib/generators/pgbus/add_recurring_generator.rb +1 -1
- data/lib/generators/pgbus/install_generator.rb +1 -1
- data/lib/generators/pgbus/templates/add_job_locks.rb.erb +21 -0
- data/lib/generators/pgbus/templates/add_job_stats.rb.erb +18 -0
- data/lib/generators/pgbus/upgrade_pgmq_generator.rb +1 -1
- data/lib/pgbus/active_job/adapter.rb +58 -4
- data/lib/pgbus/active_job/executor.rb +45 -0
- data/lib/pgbus/client.rb +8 -22
- data/lib/pgbus/configuration.rb +6 -0
- data/lib/pgbus/engine.rb +1 -0
- data/lib/pgbus/process/consumer_priority.rb +64 -0
- data/lib/pgbus/process/dispatcher.rb +29 -0
- data/lib/pgbus/process/queue_lock.rb +87 -0
- data/lib/pgbus/process/supervisor.rb +6 -1
- data/lib/pgbus/process/wake_signal.rb +53 -0
- data/lib/pgbus/process/worker.rb +36 -6
- data/lib/pgbus/queue_factory.rb +62 -0
- data/lib/pgbus/uniqueness.rb +169 -0
- data/lib/pgbus/version.rb +1 -1
- data/lib/pgbus/web/data_source.rb +49 -0
- data/lib/pgbus.rb +1 -0
- metadata +17 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c99eb0bd5a09f7396e64468e30ece233f9a2274882c44b3c836139dadad414df
|
|
4
|
+
data.tar.gz: 4a1e1264977bbb87db1c771e8364bf91e86f906c3d2f7a7d2e892730ce4634a1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 875fb38641995f8d8af516fabd378e363c2addbc94790507a31d4bb7e9b4c24111b8c748185f30d9d56a9b50beb77d88b70d2bd977f2d6d05794c61be4ec1fca
|
|
7
|
+
data.tar.gz: abd4b1d81e543019ad8bbc5d3b16698c2845fe45abe0e7c96bc33b10298d6d165ca64b54a83354f958adb8e1b32abbad817de469986d5ccaf8cc02d5e9ccc56c
|
data/README.md
CHANGED
|
@@ -14,6 +14,14 @@ PostgreSQL-native job processing and event bus for Rails, built on [PGMQ](https:
|
|
|
14
14
|
- [Quick start](#quick-start)
|
|
15
15
|
- [Concurrency controls](#concurrency-controls)
|
|
16
16
|
- [Batches](#batches)
|
|
17
|
+
- [Job uniqueness](#job-uniqueness)
|
|
18
|
+
- [Priority queues](#priority-queues)
|
|
19
|
+
- [Single active consumer](#single-active-consumer)
|
|
20
|
+
- [Consumer priority](#consumer-priority)
|
|
21
|
+
- [Circuit breaker and queue pause/resume](#circuit-breaker-and-queue-pauseresume)
|
|
22
|
+
- [Prefetch flow control](#prefetch-flow-control)
|
|
23
|
+
- [Transactional outbox](#transactional-outbox)
|
|
24
|
+
- [Archive compaction](#archive-compaction)
|
|
17
25
|
- [Configuration reference](#configuration-reference)
|
|
18
26
|
- [Architecture](#architecture)
|
|
19
27
|
- [CLI](#cli)
|
|
@@ -30,9 +38,18 @@ PostgreSQL-native job processing and event bus for Rails, built on [PGMQ](https:
|
|
|
30
38
|
- **Dead letter queues** -- automatic DLQ routing after configurable retries
|
|
31
39
|
- **Worker recycling** -- memory, job count, and lifetime limits prevent runaway processes
|
|
32
40
|
- **LISTEN/NOTIFY** -- instant wake-up, polling as fallback only
|
|
33
|
-
- **Idempotent events** -- deduplication via `(event_id, handler_class)` unique index
|
|
34
|
-
- **Live dashboard** -- Turbo Frames auto-refresh, no ActionCable required
|
|
35
|
-
- **Supervisor/worker model** -- forked processes with heartbeat monitoring
|
|
41
|
+
- **Idempotent events** -- deduplication via `(event_id, handler_class)` unique index with in-memory cache
|
|
42
|
+
- **Live dashboard** -- Turbo Frames auto-refresh with throughput rate, no ActionCable required
|
|
43
|
+
- **Supervisor/worker model** -- forked processes with heartbeat monitoring and lifecycle state machine
|
|
44
|
+
- **Priority queues** -- route jobs to priority sub-queues, highest-priority-first processing
|
|
45
|
+
- **Circuit breaker** -- auto-pause queues after consecutive failures, exponential backoff
|
|
46
|
+
- **Queue pause/resume** -- manual or automatic via dashboard
|
|
47
|
+
- **Prefetch flow control** -- cap in-flight messages per worker to prevent overload
|
|
48
|
+
- **Archive compaction** -- automatic purge of old archived messages
|
|
49
|
+
- **Transactional outbox** -- publish events atomically inside database transactions
|
|
50
|
+
- **Single active consumer** -- advisory-lock-based exclusive queue processing for strict ordering
|
|
51
|
+
- **Consumer priority** -- higher-priority workers get first dibs, lower-priority workers back off
|
|
52
|
+
- **Job uniqueness** -- prevent duplicate jobs with reaper-based crash recovery, no TTL-driven expiry
|
|
36
53
|
|
|
37
54
|
## Requirements
|
|
38
55
|
|
|
@@ -66,11 +83,17 @@ production:
|
|
|
66
83
|
default_queue: default
|
|
67
84
|
pool_size: 10
|
|
68
85
|
max_retries: 5
|
|
86
|
+
prefetch_limit: 20
|
|
69
87
|
workers:
|
|
70
88
|
- queues: [default, mailers]
|
|
71
89
|
threads: 10
|
|
90
|
+
consumer_priority: 10
|
|
72
91
|
- queues: [critical]
|
|
73
92
|
threads: 5
|
|
93
|
+
single_active_consumer: true
|
|
94
|
+
- queues: [default, mailers]
|
|
95
|
+
threads: 5
|
|
96
|
+
consumer_priority: 0 # fallback worker
|
|
74
97
|
event_consumers:
|
|
75
98
|
- queues: [orders, payments]
|
|
76
99
|
threads: 5
|
|
@@ -319,6 +342,255 @@ end
|
|
|
319
342
|
4. When `completed_jobs + discarded_jobs == total_jobs`, the batch status flips to `"finished"` and callback jobs are enqueued
|
|
320
343
|
5. The dispatcher cleans up finished batches older than 7 days
|
|
321
344
|
|
|
345
|
+
## Job uniqueness
|
|
346
|
+
|
|
347
|
+
Prevent duplicate jobs from running. Unlike `limits_concurrency` (which controls *how many* jobs with the same key run), uniqueness guarantees *at most one* job with a given key exists in the system at any time.
|
|
348
|
+
|
|
349
|
+
```ruby
|
|
350
|
+
class ImportOrderJob < ApplicationJob
|
|
351
|
+
ensures_uniqueness strategy: :until_executed,
|
|
352
|
+
key: ->(order_id) { "import-order-#{order_id}" },
|
|
353
|
+
on_conflict: :reject
|
|
354
|
+
|
|
355
|
+
def perform(order_id)
|
|
356
|
+
# Only ONE instance per order_id can exist — from enqueue through completion.
|
|
357
|
+
# If another ImportOrderJob for this order_id is already enqueued or running,
|
|
358
|
+
# the duplicate is rejected immediately.
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
### Strategies
|
|
364
|
+
|
|
365
|
+
| Strategy | Lock acquired | Lock released | Prevents |
|
|
366
|
+
|----------|--------------|---------------|----------|
|
|
367
|
+
| `:until_executed` | At enqueue | On completion or DLQ | Duplicate enqueue AND execution |
|
|
368
|
+
| `:while_executing` | At execution start | On completion or DLQ | Duplicate execution only |
|
|
369
|
+
|
|
370
|
+
### Conflict policies
|
|
371
|
+
|
|
372
|
+
| Policy | Behavior |
|
|
373
|
+
|--------|----------|
|
|
374
|
+
| `:reject` | Raise `Pgbus::JobNotUnique` (default) |
|
|
375
|
+
| `:discard` | Silently drop the duplicate |
|
|
376
|
+
| `:log` | Log a warning and drop |
|
|
377
|
+
|
|
378
|
+
### Lock lifecycle
|
|
379
|
+
|
|
380
|
+
The lock is **never released by a timer**. It is held as long as the job exists in the system:
|
|
381
|
+
|
|
382
|
+
```text
|
|
383
|
+
Enqueue ──→ pgbus_job_locks (state: queued, owner_pid: nil)
|
|
384
|
+
│
|
|
385
|
+
Worker picks up job
|
|
386
|
+
│
|
|
387
|
+
▼
|
|
388
|
+
claim_for_execution! (state: executing, owner_pid: PID)
|
|
389
|
+
│
|
|
390
|
+
┌───────┴───────┐
|
|
391
|
+
▼ ▼
|
|
392
|
+
Success Crash
|
|
393
|
+
release! (lock orphaned)
|
|
394
|
+
(row deleted) │
|
|
395
|
+
▼
|
|
396
|
+
Reaper checks:
|
|
397
|
+
Is owner_pid in pgbus_processes
|
|
398
|
+
with fresh heartbeat?
|
|
399
|
+
│
|
|
400
|
+
┌─────┴─────┐
|
|
401
|
+
No Yes
|
|
402
|
+
▼ ▼
|
|
403
|
+
release! (keep lock,
|
|
404
|
+
(orphaned) job is running)
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
**Crash recovery** works through the reaper (runs every 5 minutes in the dispatcher). It cross-references `owner_pid` in `pgbus_job_locks` against `pgbus_processes` heartbeats. If the owning worker has no fresh heartbeat, the lock is orphaned and released — the PGMQ message's visibility timeout will expire and the job will be retried by another worker.
|
|
408
|
+
|
|
409
|
+
A last-resort TTL (default 24 hours) handles the case where the entire pgbus supervisor is dead and the reaper itself can't run.
|
|
410
|
+
|
|
411
|
+
### Uniqueness vs concurrency controls
|
|
412
|
+
|
|
413
|
+
| | `ensures_uniqueness` | `limits_concurrency` |
|
|
414
|
+
|---|---|---|
|
|
415
|
+
| **Purpose** | Prevent duplicate jobs | Limit concurrent execution slots |
|
|
416
|
+
| **Lock type** | Binary lock (one or none) | Counting semaphore (up to N) |
|
|
417
|
+
| **At enqueue** | `:until_executed` blocks duplicates | Checks semaphore, blocks/discards/raises |
|
|
418
|
+
| **At execution** | `:while_executing` blocks duplicate runs | Not checked (semaphore acquired at enqueue) |
|
|
419
|
+
| **Duplicate in queue** | `:until_executed`: impossible. `:while_executing`: allowed, only one runs | Allowed up to N, rest blocked |
|
|
420
|
+
| **Crash recovery** | Reaper checks heartbeats | Semaphore `expires_at` + dispatcher cleanup |
|
|
421
|
+
| **Use when** | "This exact job must not run twice" | "At most N of these can run at once" |
|
|
422
|
+
|
|
423
|
+
**When to use which:**
|
|
424
|
+
- Payment processing, order import, unique email sends → `ensures_uniqueness`
|
|
425
|
+
- Rate-limited API calls, resource-constrained tasks → `limits_concurrency`
|
|
426
|
+
- Both at once → combine them (they use separate tables, no conflicts)
|
|
427
|
+
|
|
428
|
+
### Setup
|
|
429
|
+
|
|
430
|
+
```bash
|
|
431
|
+
rails generate pgbus:add_job_locks # Add the migration
|
|
432
|
+
rails generate pgbus:add_job_locks --database=pgbus # For separate database
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
## Priority queues
|
|
436
|
+
|
|
437
|
+
Route jobs to priority sub-queues so high-priority work is processed first:
|
|
438
|
+
|
|
439
|
+
```ruby
|
|
440
|
+
Pgbus.configure do |config|
|
|
441
|
+
config.priority_levels = 3 # Creates _p0, _p1, _p2 sub-queues per logical queue
|
|
442
|
+
config.default_priority = 1 # Jobs without explicit priority go to _p1
|
|
443
|
+
end
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
Workers read from `_p0` (highest) first, then `_p1`, then `_p2`. Only when higher-priority sub-queues are empty does the worker read from lower ones.
|
|
447
|
+
|
|
448
|
+
Use ActiveJob's built-in `priority` attribute:
|
|
449
|
+
|
|
450
|
+
```ruby
|
|
451
|
+
class CriticalAlertJob < ApplicationJob
|
|
452
|
+
queue_as :default
|
|
453
|
+
queue_with_priority 0 # Highest priority
|
|
454
|
+
|
|
455
|
+
def perform(alert_id)
|
|
456
|
+
# ...
|
|
457
|
+
end
|
|
458
|
+
end
|
|
459
|
+
|
|
460
|
+
class ReportJob < ApplicationJob
|
|
461
|
+
queue_as :default
|
|
462
|
+
queue_with_priority 2 # Lowest priority
|
|
463
|
+
|
|
464
|
+
def perform(report_id)
|
|
465
|
+
# ...
|
|
466
|
+
end
|
|
467
|
+
end
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
When `priority_levels` is `nil` (default), priority queues are disabled and all jobs go to a single queue per logical name.
|
|
471
|
+
|
|
472
|
+
## Single active consumer
|
|
473
|
+
|
|
474
|
+
For queues that require strict ordering, enable single active consumer mode. Only one worker process can read from a queue at a time -- others skip it and process other queues.
|
|
475
|
+
|
|
476
|
+
```yaml
|
|
477
|
+
# config/pgbus.yml
|
|
478
|
+
production:
|
|
479
|
+
workers:
|
|
480
|
+
- queues: [ordered_events]
|
|
481
|
+
threads: 1
|
|
482
|
+
single_active_consumer: true
|
|
483
|
+
- queues: [ordered_events]
|
|
484
|
+
threads: 1
|
|
485
|
+
single_active_consumer: true # Standby — takes over if the first worker dies
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
Uses PostgreSQL session-level advisory locks (`pg_try_advisory_lock`). The lock is non-blocking -- workers that can't acquire it simply skip the queue. Locks auto-release on connection close (including crashes), so failover is automatic.
|
|
489
|
+
|
|
490
|
+
## Consumer priority
|
|
491
|
+
|
|
492
|
+
When multiple workers subscribe to the same queues, higher-priority workers process messages first. Lower-priority workers back off (3x polling interval) when a higher-priority worker is active.
|
|
493
|
+
|
|
494
|
+
```yaml
|
|
495
|
+
# config/pgbus.yml
|
|
496
|
+
production:
|
|
497
|
+
workers:
|
|
498
|
+
- queues: [default]
|
|
499
|
+
threads: 10
|
|
500
|
+
consumer_priority: 10 # Primary — polls at base interval
|
|
501
|
+
- queues: [default]
|
|
502
|
+
threads: 5
|
|
503
|
+
consumer_priority: 0 # Fallback — polls at 3x interval when primary is healthy
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
Priority is stored in heartbeat metadata. Workers check the `pgbus_processes` table to discover higher-priority peers. When a high-priority worker goes stale (no heartbeat for 5 minutes), lower-priority workers automatically resume normal polling.
|
|
507
|
+
|
|
508
|
+
## Circuit breaker and queue pause/resume
|
|
509
|
+
|
|
510
|
+
Pgbus automatically pauses queues that fail repeatedly, preventing cascading failures.
|
|
511
|
+
|
|
512
|
+
```ruby
|
|
513
|
+
Pgbus.configure do |config|
|
|
514
|
+
config.circuit_breaker_enabled = true # default
|
|
515
|
+
config.circuit_breaker_threshold = 5 # consecutive failures before tripping
|
|
516
|
+
config.circuit_breaker_base_backoff = 30 # seconds (doubles per trip)
|
|
517
|
+
config.circuit_breaker_max_backoff = 600 # 10 minute cap
|
|
518
|
+
end
|
|
519
|
+
```
|
|
520
|
+
|
|
521
|
+
When a queue hits the failure threshold:
|
|
522
|
+
1. The circuit breaker **auto-pauses** the queue with exponential backoff
|
|
523
|
+
2. After the backoff expires, the queue **auto-resumes** and the trip counter resets
|
|
524
|
+
3. If failures continue, each trip doubles the backoff (capped at `max_backoff`)
|
|
525
|
+
|
|
526
|
+
You can also **manually pause/resume** queues from the dashboard. The pause state is stored in the `pgbus_queue_states` table and survives restarts.
|
|
527
|
+
|
|
528
|
+
```bash
|
|
529
|
+
rails generate pgbus:add_queue_states # Add the queue_states migration
|
|
530
|
+
rails generate pgbus:add_queue_states --database=pgbus # For separate database
|
|
531
|
+
```
|
|
532
|
+
|
|
533
|
+
## Prefetch flow control
|
|
534
|
+
|
|
535
|
+
Cap the number of in-flight (claimed but unfinished) messages per worker:
|
|
536
|
+
|
|
537
|
+
```ruby
|
|
538
|
+
Pgbus.configure do |config|
|
|
539
|
+
config.prefetch_limit = 20 # nil = unlimited (default)
|
|
540
|
+
end
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
The worker tracks in-flight messages with an atomic counter and only fetches `min(idle_threads, prefetch_available)` messages per cycle. The counter is decremented in an `ensure` block so it never gets stuck.
|
|
544
|
+
|
|
545
|
+
## Transactional outbox
|
|
546
|
+
|
|
547
|
+
Publish events atomically inside your database transactions. A background poller moves outbox entries to PGMQ.
|
|
548
|
+
|
|
549
|
+
```bash
|
|
550
|
+
rails generate pgbus:add_outbox # Add the outbox migration
|
|
551
|
+
rails generate pgbus:add_outbox --database=pgbus # For separate database
|
|
552
|
+
```
|
|
553
|
+
|
|
554
|
+
```ruby
|
|
555
|
+
Pgbus.configure do |config|
|
|
556
|
+
config.outbox_enabled = true
|
|
557
|
+
config.outbox_poll_interval = 1.0 # seconds
|
|
558
|
+
config.outbox_batch_size = 100
|
|
559
|
+
config.outbox_retention = 24 * 3600 # keep published entries for 24h
|
|
560
|
+
end
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
Usage:
|
|
564
|
+
|
|
565
|
+
```ruby
|
|
566
|
+
ActiveRecord::Base.transaction do
|
|
567
|
+
order = Order.create!(params)
|
|
568
|
+
|
|
569
|
+
# Published atomically with the order — if the transaction rolls back,
|
|
570
|
+
# the outbox entry is also rolled back. No lost or phantom events.
|
|
571
|
+
Pgbus::Outbox.publish("default", { order_id: order.id })
|
|
572
|
+
|
|
573
|
+
# For topic-based event bus:
|
|
574
|
+
Pgbus::Outbox.publish_event("orders.created", { order_id: order.id })
|
|
575
|
+
end
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
The outbox poller uses `FOR UPDATE SKIP LOCKED` inside a transaction to claim entries, publishes them to PGMQ, and marks them as published. Failed entries are skipped and retried next cycle.
|
|
579
|
+
|
|
580
|
+
## Archive compaction
|
|
581
|
+
|
|
582
|
+
PGMQ archive tables grow unbounded. Pgbus automatically purges old entries:
|
|
583
|
+
|
|
584
|
+
```ruby
|
|
585
|
+
Pgbus.configure do |config|
|
|
586
|
+
config.archive_retention = 7 * 24 * 3600 # 7 days (default)
|
|
587
|
+
config.archive_compaction_interval = 3600 # run every hour (default)
|
|
588
|
+
config.archive_compaction_batch_size = 1000 # delete in batches (default)
|
|
589
|
+
end
|
|
590
|
+
```
|
|
591
|
+
|
|
592
|
+
The dispatcher runs archive compaction as part of its maintenance loop, deleting archived messages older than `archive_retention` in batches to avoid long-running transactions.
|
|
593
|
+
|
|
322
594
|
## Configuration reference
|
|
323
595
|
|
|
324
596
|
| Option | Default | Description |
|
|
@@ -336,27 +608,46 @@ end
|
|
|
336
608
|
| `max_memory_mb` | `nil` | Recycle worker when memory exceeds N MB |
|
|
337
609
|
| `max_worker_lifetime` | `nil` | Recycle worker after N seconds |
|
|
338
610
|
| `listen_notify` | `true` | Use PGMQ's LISTEN/NOTIFY for instant wake-up |
|
|
611
|
+
| `prefetch_limit` | `nil` | Max in-flight messages per worker (nil = unlimited) |
|
|
339
612
|
| `dispatch_interval` | `1.0` | Seconds between dispatcher maintenance ticks |
|
|
613
|
+
| `circuit_breaker_enabled` | `true` | Enable auto-pause on consecutive failures |
|
|
614
|
+
| `circuit_breaker_threshold` | `5` | Consecutive failures before tripping |
|
|
615
|
+
| `circuit_breaker_base_backoff` | `30` | Base backoff seconds (doubles per trip) |
|
|
616
|
+
| `circuit_breaker_max_backoff` | `600` | Max backoff cap in seconds |
|
|
617
|
+
| `priority_levels` | `nil` | Number of priority sub-queues (nil = disabled, 2-10) |
|
|
618
|
+
| `default_priority` | `1` | Default priority for jobs without explicit priority |
|
|
619
|
+
| `archive_retention` | `604800` | Seconds to keep archived messages (7 days) |
|
|
620
|
+
| `archive_compaction_interval` | `3600` | Seconds between archive cleanup runs |
|
|
621
|
+
| `archive_compaction_batch_size` | `1000` | Rows deleted per batch during compaction |
|
|
622
|
+
| `outbox_enabled` | `false` | Enable transactional outbox poller process |
|
|
623
|
+
| `outbox_poll_interval` | `1.0` | Seconds between outbox poll cycles |
|
|
624
|
+
| `outbox_batch_size` | `100` | Max entries per outbox poll cycle |
|
|
625
|
+
| `outbox_retention` | `86400` | Seconds to keep published outbox entries (1 day) |
|
|
340
626
|
| `idempotency_ttl` | `604800` | Seconds to keep processed event records (7 days, cleaned hourly) |
|
|
341
627
|
| `web_auth` | `nil` | Lambda for dashboard authentication |
|
|
342
628
|
| `web_refresh_interval` | `5000` | Dashboard auto-refresh interval in milliseconds |
|
|
343
629
|
| `web_live_updates` | `true` | Enable Turbo Frames auto-refresh on dashboard |
|
|
630
|
+
| `stats_enabled` | `true` | Record job execution stats for insights dashboard |
|
|
631
|
+
| `stats_retention` | `604800` | Seconds to keep job stats (7 days) |
|
|
344
632
|
|
|
345
633
|
## Architecture
|
|
346
634
|
|
|
347
635
|
```text
|
|
348
636
|
Supervisor (fork manager)
|
|
349
|
-
├── Worker 1 (queues: [default, mailers], threads: 10)
|
|
350
|
-
├── Worker 2 (queues: [critical], threads: 5)
|
|
351
|
-
├── Dispatcher (maintenance:
|
|
352
|
-
|
|
637
|
+
├── Worker 1 (queues: [default, mailers], threads: 10, priority: 10)
|
|
638
|
+
├── Worker 2 (queues: [critical], threads: 5, single_active_consumer: true)
|
|
639
|
+
├── Dispatcher (maintenance: cleanup, compaction, reaping, circuit breaker)
|
|
640
|
+
├── Scheduler (recurring tasks via cron)
|
|
641
|
+
├── Consumer (event bus topics)
|
|
642
|
+
└── Outbox Poller (transactional outbox → PGMQ, when enabled)
|
|
353
643
|
|
|
354
644
|
PostgreSQL + PGMQ
|
|
355
645
|
├── pgbus_default (job queue)
|
|
356
646
|
├── pgbus_default_dlq (dead letter queue)
|
|
357
647
|
├── pgbus_critical (job queue)
|
|
358
648
|
├── pgbus_critical_dlq (dead letter queue)
|
|
359
|
-
|
|
649
|
+
├── pgbus_mailers (job queue)
|
|
650
|
+
└── pgbus_queue_states (pause/resume + circuit breaker state)
|
|
360
651
|
```
|
|
361
652
|
|
|
362
653
|
### How it works
|
|
@@ -395,15 +686,33 @@ pgbus help # Show help
|
|
|
395
686
|
|
|
396
687
|
The dashboard is a mountable Rails engine at `/pgbus` with:
|
|
397
688
|
|
|
398
|
-
- **Overview** -- queue depths, enqueued count, active processes, failure count
|
|
399
|
-
- **Queues** -- per-queue metrics, purge actions
|
|
689
|
+
- **Overview** -- queue depths, enqueued count, active processes, failure count, throughput rate
|
|
690
|
+
- **Queues** -- per-queue metrics, purge/pause/resume actions
|
|
400
691
|
- **Jobs** -- enqueued and failed jobs, retry/discard actions
|
|
401
692
|
- **Dead letter** -- DLQ messages with retry/discard, bulk actions
|
|
402
693
|
- **Processes** -- active workers/dispatcher/consumers with heartbeat status
|
|
403
694
|
- **Events** -- registered subscribers and processed events
|
|
695
|
+
- **Outbox** -- transactional outbox entries pending publication
|
|
696
|
+
- **Locks** -- active job uniqueness locks with state (queued/executing), owner PID@hostname, age
|
|
697
|
+
- **Insights** -- throughput chart (jobs/min), status distribution donut, slowest job classes table
|
|
404
698
|
|
|
405
699
|
All tables use Turbo Frames for periodic auto-refresh without page reloads.
|
|
406
700
|
|
|
701
|
+
### Dark mode
|
|
702
|
+
|
|
703
|
+
The dashboard supports dark mode via Tailwind CSS `dark:` classes. It respects your system preference on first visit and persists your choice via localStorage. Toggle with the sun/moon button in the nav bar.
|
|
704
|
+
|
|
705
|
+
### Job stats and insights
|
|
706
|
+
|
|
707
|
+
The executor records every job completion to `pgbus_job_stats` (job class, queue, status, duration). The insights page visualizes this data with ApexCharts (loaded via CDN, zero npm dependencies).
|
|
708
|
+
|
|
709
|
+
```bash
|
|
710
|
+
rails generate pgbus:add_job_stats # Add the stats migration
|
|
711
|
+
rails generate pgbus:add_job_stats --database=pgbus
|
|
712
|
+
```
|
|
713
|
+
|
|
714
|
+
Stats collection is enabled by default (`config.stats_enabled = true`). Old stats are cleaned up by the dispatcher based on `config.stats_retention` (default: 7 days). If the migration hasn't been run yet, stat recording is silently skipped.
|
|
715
|
+
|
|
407
716
|
## Database tables
|
|
408
717
|
|
|
409
718
|
Pgbus uses these tables (created via PGMQ and migrations):
|
|
@@ -411,13 +720,19 @@ Pgbus uses these tables (created via PGMQ and migrations):
|
|
|
411
720
|
| Table | Purpose |
|
|
412
721
|
|-------|---------|
|
|
413
722
|
| `q_pgbus_*` | PGMQ job queues (managed by PGMQ) |
|
|
414
|
-
| `a_pgbus_*` | PGMQ archive tables (managed by PGMQ) |
|
|
723
|
+
| `a_pgbus_*` | PGMQ archive tables (managed by PGMQ, compacted by dispatcher) |
|
|
415
724
|
| `pgbus_processes` | Heartbeat tracking for workers/dispatcher/consumers |
|
|
416
725
|
| `pgbus_failed_events` | Failed event dispatch records |
|
|
417
726
|
| `pgbus_processed_events` | Idempotency deduplication (event_id, handler_class) |
|
|
418
727
|
| `pgbus_semaphores` | Concurrency control counting semaphores |
|
|
419
728
|
| `pgbus_blocked_executions` | Jobs waiting for a concurrency semaphore slot |
|
|
420
729
|
| `pgbus_batches` | Batch tracking with job counters and callback config |
|
|
730
|
+
| `pgbus_job_locks` | Job uniqueness locks (state, owner_pid, reaper correlation) |
|
|
731
|
+
| `pgbus_job_stats` | Job execution metrics (class, queue, status, duration) |
|
|
732
|
+
| `pgbus_queue_states` | Queue pause/resume and circuit breaker state |
|
|
733
|
+
| `pgbus_outbox_entries` | Transactional outbox entries pending publication |
|
|
734
|
+
| `pgbus_recurring_tasks` | Recurring job definitions |
|
|
735
|
+
| `pgbus_recurring_executions` | Recurring job execution history |
|
|
421
736
|
|
|
422
737
|
## Switching from another backend
|
|
423
738
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgbus
  module Api
    # JSON endpoint backing the insights dashboard charts
    # (throughput, status distribution, slowest job classes).
    class InsightsController < ApplicationController
      # Renders every insights metric as a single JSON document.
      def show
        render json: insights_payload
      end

      private

      # Gathers all chart data from the dashboard data source into one hash.
      def insights_payload
        {
          summary: data_source.job_stats_summary,
          throughput: data_source.job_throughput,
          status_counts: data_source.job_status_counts,
          slowest: data_source.slowest_job_classes
        }
      end
    end
  end
end
|
|
@@ -45,6 +45,34 @@ module Pgbus
|
|
|
45
45
|
end
|
|
46
46
|
end
|
|
47
47
|
|
|
48
|
+
# Formats a second count as a short human-readable duration using the two
# most significant units: "45s", "1m 30s", "2h 5m", or "3d 12h".
# Returns an em dash when +seconds+ is nil.
def pgbus_duration(seconds)
  return "—" unless seconds

  total = seconds.to_i
  # Check from the largest unit down; integer division drops the remainder.
  if total >= 86_400
    "#{total / 86_400}d #{(total % 86_400) / 3600}h"
  elsif total >= 3600
    "#{total / 3600}h #{(total % 3600) / 60}m"
  elsif total >= 60
    "#{total / 60}m #{total % 60}s"
  else
    "#{total}s"
  end
end
|
|
62
|
+
|
|
63
|
+
# Formats a millisecond count for display: "500ms", "1.5s", or "2.3m".
# Sub-second values stay integral; larger values round to one decimal.
# Returns an em dash when +millis+ is nil.
def pgbus_ms_duration(millis)
  return "—" unless millis

  ms = millis.to_i
  # Largest-unit-first keeps each branch a single expression.
  if ms >= 60_000
    "#{(ms / 60_000.0).round(1)}m"
  elsif ms >= 1000
    "#{(ms / 1000.0).round(1)}s"
  else
    "#{ms}ms"
  end
end
|
|
75
|
+
|
|
48
76
|
def pgbus_paused_badge(paused)
|
|
49
77
|
return unless paused
|
|
50
78
|
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgbus
  # Row-per-key lock table backing job uniqueness (`ensures_uniqueness`).
  # A lock row existing means a job with that key exists somewhere in the
  # system; crash recovery is reaper-driven (heartbeat correlation), with
  # `expires_at` as a last-resort TTL only.
  class JobLock < Pgbus::ApplicationRecord
    self.table_name = "pgbus_job_locks"

    # States:
    # queued — lock held from enqueue time (:until_executed), no worker yet
    # executing — lock held by an active worker process
    STATES = %w[queued executing].freeze

    scope :executing, -> { where(state: "executing") }
    scope :queued_locks, -> { where(state: "queued") }
    scope :expired, ->(now = Time.current) { where("expires_at < ?", now) }

    # Atomically try to acquire a lock.
    # Cleans up expired locks for this key first (crash recovery at acquire time),
    # then relies on the unique index on lock_key for atomicity: `insert` with
    # `unique_by` returns no rows when the key is already taken.
    # Returns true if acquired, false if already locked.
    def self.acquire!(lock_key, job_class:, ttl:, job_id: nil, state: "queued", owner_pid: nil, owner_hostname: nil)
      # Remove any expired lock for this key inline (last-resort TTL recovery)
      where(lock_key: lock_key).where("expires_at < ?", Time.current).delete_all

      result = insert(
        {
          lock_key: lock_key, job_class: job_class, job_id: job_id,
          state: state, owner_pid: owner_pid, owner_hostname: owner_hostname,
          expires_at: Time.current + ttl
        },
        unique_by: :lock_key
      )
      # No returned rows => the ON CONFLICT path fired => someone else holds it.
      result.rows.any?
    rescue ActiveRecord::RecordNotUnique
      false
    end

    # Transition a queued lock to executing state and claim ownership.
    # Called when a worker starts executing a job that was locked at enqueue time.
    # Also refreshes expires_at so the last-resort TTL covers the execution phase.
    def self.claim_for_execution!(lock_key, owner_pid:, owner_hostname:, ttl:)
      where(lock_key: lock_key).update_all(
        state: "executing",
        owner_pid: owner_pid,
        owner_hostname: owner_hostname,
        expires_at: Time.current + ttl
      )
    end

    # Release a lock by key (row deletion IS the release).
    def self.release!(lock_key)
      where(lock_key: lock_key).delete_all
    end

    # Check if a lock is currently held (regardless of expiry — reaper handles orphans).
    def self.locked?(lock_key)
      where(lock_key: lock_key).exists?
    end

    # Reap orphaned locks by matching (pid, hostname) against live process
    # entries: a lock in 'executing' state is orphaned when no process row
    # with a fresh heartbeat has the same pid AND hostname.
    # Returns the number of orphaned locks released (0 when none).
    # NOTE(review): loads all executing locks into memory; assumes the
    # executing set stays small — confirm before very large deployments.
    def self.reap_orphaned!
      alive_workers = ProcessEntry
        .where("last_heartbeat_at >= ?", Time.current - Process::Heartbeat::ALIVE_THRESHOLD)
        .pluck(:pid, :hostname)

      orphaned = executing.select do |lock|
        alive_workers.none? { |pid, hostname| pid == lock.owner_pid && hostname == lock.owner_hostname }
      end

      return 0 if orphaned.empty?

      where(id: orphaned.map(&:id)).delete_all
    end

    # Last-resort cleanup: delete locks whose expires_at has passed.
    # This only fires when the reaper itself can't run (e.g., entire supervisor dead).
    def self.cleanup_expired!
      expired.delete_all
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgbus
  # Per-execution job metrics (class, queue, status, duration) powering the
  # insights dashboard. Queries use PostgreSQL-specific SQL (FILTER,
  # date_trunc), so this model is Postgres-only.
  class JobStat < Pgbus::ApplicationRecord
    self.table_name = "pgbus_job_stats"

    scope :since, ->(time) { where("created_at >= ?", time) }
    scope :successful, -> { where(status: "success") }
    scope :failed, -> { where(status: "failed") }
    scope :dead_lettered, -> { where(status: "dead_lettered") }

    # Record a job execution stat. Called by the executor after each job.
    # Best-effort by design: silently no-ops when the stats table is missing,
    # and never lets a stats failure break job processing (logs at debug only).
    def self.record!(job_class:, queue_name:, status:, duration_ms:)
      return unless table_exists?

      create!(
        job_class: job_class,
        queue_name: queue_name,
        status: status,
        duration_ms: duration_ms
      )
    rescue StandardError => e
      Pgbus.logger.debug { "[Pgbus] Failed to record job stat: #{e.message}" }
    end

    # Memoized — intentionally never invalidated at runtime. If the
    # pgbus_job_stats migration runs while the app is already running,
    # a restart is required for stat recording to begin.
    # Any connection error is treated as "table absent" (memoized false).
    def self.table_exists?
      return @table_exists if defined?(@table_exists)

      @table_exists = connection.table_exists?(table_name)
    rescue StandardError
      @table_exists = false
    end

    # Throughput: jobs per minute bucketed by minute for the last N minutes.
    # Returns a Hash of { minute_timestamp => count }, ordered by minute.
    def self.throughput(minutes: 60)
      since(minutes.minutes.ago)
        .group("date_trunc('minute', created_at)")
        .order(Arel.sql("date_trunc('minute', created_at)"))
        .count
    end

    # Average duration by job class, slowest first.
    # Returns a Hash of { job_class => avg_duration_ms }.
    def self.avg_duration_by_class(minutes: 60)
      since(minutes.minutes.ago)
        .group(:job_class)
        .order(Arel.sql("AVG(duration_ms) DESC"))
        .average(:duration_ms)
    end

    # Success/failed/dead_lettered counts as { status => count }.
    def self.status_counts(minutes: 60)
      since(minutes.minutes.ago).group(:status).count
    end

    # Top N slowest job classes by average duration.
    # Returns an Array of { job_class:, count:, avg_ms:, max_ms: } hashes.
    def self.slowest_classes(limit: 10, minutes: 60)
      since(minutes.minutes.ago)
        .group(:job_class)
        .order(Arel.sql("AVG(duration_ms) DESC"))
        .limit(limit)
        .pluck(:job_class, Arel.sql("COUNT(*)"), Arel.sql("ROUND(AVG(duration_ms))"), Arel.sql("MAX(duration_ms)"))
        .map { |cls, count, avg, max| { job_class: cls, count: count.to_i, avg_ms: avg.to_i, max_ms: max.to_i } }
    end

    # Single-query aggregate summary using conditional counts
    # (Postgres `COUNT(*) FILTER (WHERE ...)`).
    # Returns { total:, success:, failed:, dead_lettered:,
    #           avg_duration_ms:, max_duration_ms: } — zeros when no rows match.
    def self.summary(minutes: 60)
      row = since(minutes.minutes.ago).pick(
        Arel.sql("COUNT(*)"),
        Arel.sql("COUNT(*) FILTER (WHERE status = 'success')"),
        Arel.sql("COUNT(*) FILTER (WHERE status = 'failed')"),
        Arel.sql("COUNT(*) FILTER (WHERE status = 'dead_lettered')"),
        Arel.sql("ROUND(AVG(duration_ms)::numeric, 1)"),
        Arel.sql("MAX(duration_ms)")
      )

      {
        total: row[0].to_i,
        success: row[1].to_i,
        failed: row[2].to_i,
        dead_lettered: row[3].to_i,
        # AVG is NULL with no rows — coalesce to 0 for the dashboard.
        avg_duration_ms: row[4]&.to_f || 0,
        max_duration_ms: row[5].to_i
      }
    end

    # Delete stats older than the given timestamp (dispatcher retention sweep).
    def self.cleanup!(older_than:)
      where("created_at < ?", older_than).delete_all
    end
  end
end
|