pgbus 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +326 -11
  3. data/app/controllers/pgbus/api/insights_controller.rb +16 -0
  4. data/app/controllers/pgbus/insights_controller.rb +10 -0
  5. data/app/controllers/pgbus/locks_controller.rb +9 -0
  6. data/app/helpers/pgbus/application_helper.rb +28 -0
  7. data/app/models/pgbus/job_lock.rb +82 -0
  8. data/app/models/pgbus/job_stat.rb +94 -0
  9. data/app/views/layouts/pgbus/application.html.erb +32 -8
  10. data/app/views/pgbus/dashboard/_stats_cards.html.erb +20 -20
  11. data/app/views/pgbus/insights/show.html.erb +161 -0
  12. data/app/views/pgbus/locks/index.html.erb +53 -0
  13. data/config/routes.rb +3 -0
  14. data/lib/generators/pgbus/add_job_locks_generator.rb +52 -0
  15. data/lib/generators/pgbus/add_job_stats_generator.rb +52 -0
  16. data/lib/generators/pgbus/add_outbox_generator.rb +1 -1
  17. data/lib/generators/pgbus/add_queue_states_generator.rb +1 -1
  18. data/lib/generators/pgbus/add_recurring_generator.rb +1 -1
  19. data/lib/generators/pgbus/install_generator.rb +1 -1
  20. data/lib/generators/pgbus/templates/add_job_locks.rb.erb +21 -0
  21. data/lib/generators/pgbus/templates/add_job_stats.rb.erb +18 -0
  22. data/lib/generators/pgbus/upgrade_pgmq_generator.rb +1 -1
  23. data/lib/pgbus/active_job/adapter.rb +58 -4
  24. data/lib/pgbus/active_job/executor.rb +45 -0
  25. data/lib/pgbus/client.rb +8 -22
  26. data/lib/pgbus/configuration.rb +6 -0
  27. data/lib/pgbus/engine.rb +1 -0
  28. data/lib/pgbus/process/consumer_priority.rb +64 -0
  29. data/lib/pgbus/process/dispatcher.rb +29 -0
  30. data/lib/pgbus/process/queue_lock.rb +87 -0
  31. data/lib/pgbus/process/supervisor.rb +6 -1
  32. data/lib/pgbus/process/wake_signal.rb +53 -0
  33. data/lib/pgbus/process/worker.rb +36 -6
  34. data/lib/pgbus/queue_factory.rb +62 -0
  35. data/lib/pgbus/uniqueness.rb +169 -0
  36. data/lib/pgbus/version.rb +1 -1
  37. data/lib/pgbus/web/data_source.rb +49 -0
  38. data/lib/pgbus.rb +1 -0
  39. metadata +17 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f7594e67d8f35115e8a8498a64c766a37bd2705f62ee0414471ac08370596d51
4
- data.tar.gz: 31048f0243cf7eddf24e49a79fee753390051637b1173534891d7155c42fc734
3
+ metadata.gz: c99eb0bd5a09f7396e64468e30ece233f9a2274882c44b3c836139dadad414df
4
+ data.tar.gz: 4a1e1264977bbb87db1c771e8364bf91e86f906c3d2f7a7d2e892730ce4634a1
5
5
  SHA512:
6
- metadata.gz: 2351357593c783a226b0dccda08f278c2ddf7ab40f2d727bf7c3d5366e3a24aa3ae79670f0a09d867f1f774350e6677e17960d9fd90b46d4be94f7bf4085ba96
7
- data.tar.gz: 2e427fc91be9934d37ec6bc21a0e1ffec5be034fdc31ead152c3c86006805778ee3ed9e5490df112c659e2f9f7363333b35889f1dc9374cf1bb0f3069ade65c4
6
+ metadata.gz: 875fb38641995f8d8af516fabd378e363c2addbc94790507a31d4bb7e9b4c24111b8c748185f30d9d56a9b50beb77d88b70d2bd977f2d6d05794c61be4ec1fca
7
+ data.tar.gz: abd4b1d81e543019ad8bbc5d3b16698c2845fe45abe0e7c96bc33b10298d6d165ca64b54a83354f958adb8e1b32abbad817de469986d5ccaf8cc02d5e9ccc56c
data/README.md CHANGED
@@ -14,6 +14,14 @@ PostgreSQL-native job processing and event bus for Rails, built on [PGMQ](https:
14
14
  - [Quick start](#quick-start)
15
15
  - [Concurrency controls](#concurrency-controls)
16
16
  - [Batches](#batches)
17
+ - [Job uniqueness](#job-uniqueness)
18
+ - [Priority queues](#priority-queues)
19
+ - [Single active consumer](#single-active-consumer)
20
+ - [Consumer priority](#consumer-priority)
21
+ - [Circuit breaker and queue pause/resume](#circuit-breaker-and-queue-pauseresume)
22
+ - [Prefetch flow control](#prefetch-flow-control)
23
+ - [Transactional outbox](#transactional-outbox)
24
+ - [Archive compaction](#archive-compaction)
17
25
  - [Configuration reference](#configuration-reference)
18
26
  - [Architecture](#architecture)
19
27
  - [CLI](#cli)
@@ -30,9 +38,18 @@ PostgreSQL-native job processing and event bus for Rails, built on [PGMQ](https:
30
38
  - **Dead letter queues** -- automatic DLQ routing after configurable retries
31
39
  - **Worker recycling** -- memory, job count, and lifetime limits prevent runaway processes
32
40
  - **LISTEN/NOTIFY** -- instant wake-up, polling as fallback only
33
- - **Idempotent events** -- deduplication via `(event_id, handler_class)` unique index
34
- - **Live dashboard** -- Turbo Frames auto-refresh, no ActionCable required
35
- - **Supervisor/worker model** -- forked processes with heartbeat monitoring
41
+ - **Idempotent events** -- deduplication via `(event_id, handler_class)` unique index with in-memory cache
42
+ - **Live dashboard** -- Turbo Frames auto-refresh with throughput rate, no ActionCable required
43
+ - **Supervisor/worker model** -- forked processes with heartbeat monitoring and lifecycle state machine
44
+ - **Priority queues** -- route jobs to priority sub-queues, highest-priority-first processing
45
+ - **Circuit breaker** -- auto-pause queues after consecutive failures, exponential backoff
46
+ - **Queue pause/resume** -- manual or automatic via dashboard
47
+ - **Prefetch flow control** -- cap in-flight messages per worker to prevent overload
48
+ - **Archive compaction** -- automatic purge of old archived messages
49
+ - **Transactional outbox** -- publish events atomically inside database transactions
50
+ - **Single active consumer** -- advisory-lock-based exclusive queue processing for strict ordering
51
+ - **Consumer priority** -- higher-priority workers get first dibs, lower-priority workers back off
52
+ - **Job uniqueness** -- prevent duplicate jobs with reaper-based crash recovery; TTL expiry only as a last resort
36
53
 
37
54
  ## Requirements
38
55
 
@@ -66,11 +83,17 @@ production:
66
83
  default_queue: default
67
84
  pool_size: 10
68
85
  max_retries: 5
86
+ prefetch_limit: 20
69
87
  workers:
70
88
  - queues: [default, mailers]
71
89
  threads: 10
90
+ consumer_priority: 10
72
91
  - queues: [critical]
73
92
  threads: 5
93
+ single_active_consumer: true
94
+ - queues: [default, mailers]
95
+ threads: 5
96
+ consumer_priority: 0 # fallback worker
74
97
  event_consumers:
75
98
  - queues: [orders, payments]
76
99
  threads: 5
@@ -319,6 +342,255 @@ end
319
342
  4. When `completed_jobs + discarded_jobs == total_jobs`, the batch status flips to `"finished"` and callback jobs are enqueued
320
343
  5. The dispatcher cleans up finished batches older than 7 days
321
344
 
345
+ ## Job uniqueness
346
+
347
+ Prevent duplicate jobs from running. Unlike `limits_concurrency` (which controls *how many* jobs with the same key run), uniqueness guarantees *at most one* job with a given key exists in the system at any time (with the `:until_executed` strategy; `:while_executing` only guarantees at most one *runs* at a time).
348
+
349
+ ```ruby
350
+ class ImportOrderJob < ApplicationJob
351
+ ensures_uniqueness strategy: :until_executed,
352
+ key: ->(order_id) { "import-order-#{order_id}" },
353
+ on_conflict: :reject
354
+
355
+ def perform(order_id)
356
+ # Only ONE instance per order_id can exist — from enqueue through completion.
357
+ # If another ImportOrderJob for this order_id is already enqueued or running,
358
+ # the duplicate is rejected immediately.
359
+ end
360
+ end
361
+ ```
362
+
363
+ ### Strategies
364
+
365
+ | Strategy | Lock acquired | Lock released | Prevents |
366
+ |----------|--------------|---------------|----------|
367
+ | `:until_executed` | At enqueue | On completion or DLQ | Duplicate enqueue AND execution |
368
+ | `:while_executing` | At execution start | On completion or DLQ | Duplicate execution only |
369
+
370
+ ### Conflict policies
371
+
372
+ | Policy | Behavior |
373
+ |--------|----------|
374
+ | `:reject` | Raise `Pgbus::JobNotUnique` (default) |
375
+ | `:discard` | Silently drop the duplicate |
376
+ | `:log` | Log a warning and drop |
377
+
378
+ ### Lock lifecycle
379
+
380
+ The lock is **not released by a timer in normal operation** — only the reaper, or a last-resort TTL, ever expires it. It is held as long as the job exists in the system:
381
+
382
+ ```text
383
+ Enqueue ──→ pgbus_job_locks (state: queued, owner_pid: nil)
384
+
385
+ Worker picks up job
386
+
387
+
388
+ claim_for_execution! (state: executing, owner_pid: PID)
389
+
390
+ ┌───────┴───────┐
391
+ ▼ ▼
392
+ Success Crash
393
+ release! (lock orphaned)
394
+ (row deleted) │
395
+
396
+ Reaper checks:
397
+ Is owner_pid in pgbus_processes
398
+ with fresh heartbeat?
399
+
400
+ ┌─────┴─────┐
401
+ No Yes
402
+ ▼ ▼
403
+ release! (keep lock,
404
+ (orphaned) job is running)
405
+ ```
406
+
407
+ **Crash recovery** works through the reaper (runs every 5 minutes in the dispatcher). It cross-references `owner_pid` in `pgbus_job_locks` against `pgbus_processes` heartbeats. If the owning worker has no fresh heartbeat, the lock is orphaned and released — the PGMQ message's visibility timeout will expire and the job will be retried by another worker.
408
+
409
+ A last-resort TTL (default 24 hours) handles the case where the entire pgbus supervisor is dead and the reaper itself can't run.
410
+
411
+ ### Uniqueness vs concurrency controls
412
+
413
+ | | `ensures_uniqueness` | `limits_concurrency` |
414
+ |---|---|---|
415
+ | **Purpose** | Prevent duplicate jobs | Limit concurrent execution slots |
416
+ | **Lock type** | Binary lock (one or none) | Counting semaphore (up to N) |
417
+ | **At enqueue** | `:until_executed` blocks duplicates | Checks semaphore, blocks/discards/raises |
418
+ | **At execution** | `:while_executing` blocks duplicate runs | Not checked (semaphore acquired at enqueue) |
419
+ | **Duplicate in queue** | `:until_executed`: impossible. `:while_executing`: allowed, only one runs | Allowed up to N, rest blocked |
420
+ | **Crash recovery** | Reaper checks heartbeats | Semaphore `expires_at` + dispatcher cleanup |
421
+ | **Use when** | "This exact job must not run twice" | "At most N of these can run at once" |
422
+
423
+ **When to use which:**
424
+ - Payment processing, order import, unique email sends → `ensures_uniqueness`
425
+ - Rate-limited API calls, resource-constrained tasks → `limits_concurrency`
426
+ - Both at once → combine them (they use separate tables, no conflicts)
427
+
428
+ ### Setup
429
+
430
+ ```bash
431
+ rails generate pgbus:add_job_locks # Add the migration
432
+ rails generate pgbus:add_job_locks --database=pgbus # For separate database
433
+ ```
434
+
435
+ ## Priority queues
436
+
437
+ Route jobs to priority sub-queues so high-priority work is processed first:
438
+
439
+ ```ruby
440
+ Pgbus.configure do |config|
441
+ config.priority_levels = 3 # Creates _p0, _p1, _p2 sub-queues per logical queue
442
+ config.default_priority = 1 # Jobs without explicit priority go to _p1
443
+ end
444
+ ```
445
+
446
+ Workers read from `_p0` (highest) first, then `_p1`, then `_p2`. Only when higher-priority sub-queues are empty does the worker read from lower ones.
447
+
448
+ Use ActiveJob's built-in `priority` attribute:
449
+
450
+ ```ruby
451
+ class CriticalAlertJob < ApplicationJob
452
+ queue_as :default
453
+ queue_with_priority 0 # Highest priority
454
+
455
+ def perform(alert_id)
456
+ # ...
457
+ end
458
+ end
459
+
460
+ class ReportJob < ApplicationJob
461
+ queue_as :default
462
+ queue_with_priority 2 # Lowest priority
463
+
464
+ def perform(report_id)
465
+ # ...
466
+ end
467
+ end
468
+ ```
469
+
470
+ When `priority_levels` is `nil` (default), priority queues are disabled and all jobs go to a single queue per logical name.
471
+
472
+ ## Single active consumer
473
+
474
+ For queues that require strict ordering, enable single active consumer mode. Only one worker process can read from a queue at a time -- others skip it and process other queues.
475
+
476
+ ```yaml
477
+ # config/pgbus.yml
478
+ production:
479
+ workers:
480
+ - queues: [ordered_events]
481
+ threads: 1
482
+ single_active_consumer: true
483
+ - queues: [ordered_events]
484
+ threads: 1
485
+ single_active_consumer: true # Standby — takes over if the first worker dies
486
+ ```
487
+
488
+ Uses PostgreSQL session-level advisory locks (`pg_try_advisory_lock`). The lock is non-blocking -- workers that can't acquire it simply skip the queue. Locks auto-release on connection close (including crashes), so failover is automatic.
489
+
490
+ ## Consumer priority
491
+
492
+ When multiple workers subscribe to the same queues, higher-priority workers process messages first. Lower-priority workers back off (3x polling interval) when a higher-priority worker is active.
493
+
494
+ ```yaml
495
+ # config/pgbus.yml
496
+ production:
497
+ workers:
498
+ - queues: [default]
499
+ threads: 10
500
+ consumer_priority: 10 # Primary — polls at base interval
501
+ - queues: [default]
502
+ threads: 5
503
+ consumer_priority: 0 # Fallback — polls at 3x interval when primary is healthy
504
+ ```
505
+
506
+ Priority is stored in heartbeat metadata. Workers check the `pgbus_processes` table to discover higher-priority peers. When a high-priority worker goes stale (no heartbeat for 5 minutes), lower-priority workers automatically resume normal polling.
507
+
508
+ ## Circuit breaker and queue pause/resume
509
+
510
+ Pgbus automatically pauses queues that fail repeatedly, preventing cascading failures.
511
+
512
+ ```ruby
513
+ Pgbus.configure do |config|
514
+ config.circuit_breaker_enabled = true # default
515
+ config.circuit_breaker_threshold = 5 # consecutive failures before tripping
516
+ config.circuit_breaker_base_backoff = 30 # seconds (doubles per trip)
517
+ config.circuit_breaker_max_backoff = 600 # 10 minute cap
518
+ end
519
+ ```
520
+
521
+ When a queue hits the failure threshold:
522
+ 1. The circuit breaker **auto-pauses** the queue with exponential backoff
523
+ 2. After the backoff expires, the queue **auto-resumes** and the trip counter resets
524
+ 3. If failures continue, each trip doubles the backoff (capped at `max_backoff`)
525
+
526
+ You can also **manually pause/resume** queues from the dashboard. The pause state is stored in the `pgbus_queue_states` table and survives restarts.
527
+
528
+ ```bash
529
+ rails generate pgbus:add_queue_states # Add the queue_states migration
530
+ rails generate pgbus:add_queue_states --database=pgbus # For separate database
531
+ ```
532
+
533
+ ## Prefetch flow control
534
+
535
+ Cap the number of in-flight (claimed but unfinished) messages per worker:
536
+
537
+ ```ruby
538
+ Pgbus.configure do |config|
539
+ config.prefetch_limit = 20 # nil = unlimited (default)
540
+ end
541
+ ```
542
+
543
+ The worker tracks in-flight messages with an atomic counter and only fetches `min(idle_threads, prefetch_available)` messages per cycle. The counter is decremented in an `ensure` block so it never gets stuck.
544
+
545
+ ## Transactional outbox
546
+
547
+ Publish events atomically inside your database transactions. A background poller moves outbox entries to PGMQ.
548
+
549
+ ```bash
550
+ rails generate pgbus:add_outbox # Add the outbox migration
551
+ rails generate pgbus:add_outbox --database=pgbus # For separate database
552
+ ```
553
+
554
+ ```ruby
555
+ Pgbus.configure do |config|
556
+ config.outbox_enabled = true
557
+ config.outbox_poll_interval = 1.0 # seconds
558
+ config.outbox_batch_size = 100
559
+ config.outbox_retention = 24 * 3600 # keep published entries for 24h
560
+ end
561
+ ```
562
+
563
+ Usage:
564
+
565
+ ```ruby
566
+ ActiveRecord::Base.transaction do
567
+ order = Order.create!(params)
568
+
569
+ # Published atomically with the order — if the transaction rolls back,
570
+ # the outbox entry is also rolled back. No lost or phantom events.
571
+ Pgbus::Outbox.publish("default", { order_id: order.id })
572
+
573
+ # For topic-based event bus:
574
+ Pgbus::Outbox.publish_event("orders.created", { order_id: order.id })
575
+ end
576
+ ```
577
+
578
+ The outbox poller uses `FOR UPDATE SKIP LOCKED` inside a transaction to claim entries, publishes them to PGMQ, and marks them as published. Failed entries are skipped and retried next cycle.
579
+
580
+ ## Archive compaction
581
+
582
+ PGMQ archive tables grow unbounded. Pgbus automatically purges old entries:
583
+
584
+ ```ruby
585
+ Pgbus.configure do |config|
586
+ config.archive_retention = 7 * 24 * 3600 # 7 days (default)
587
+ config.archive_compaction_interval = 3600 # run every hour (default)
588
+ config.archive_compaction_batch_size = 1000 # delete in batches (default)
589
+ end
590
+ ```
591
+
592
+ The dispatcher runs archive compaction as part of its maintenance loop, deleting archived messages older than `archive_retention` in batches to avoid long-running transactions.
593
+
322
594
  ## Configuration reference
323
595
 
324
596
  | Option | Default | Description |
@@ -336,27 +608,46 @@ end
336
608
  | `max_memory_mb` | `nil` | Recycle worker when memory exceeds N MB |
337
609
  | `max_worker_lifetime` | `nil` | Recycle worker after N seconds |
338
610
  | `listen_notify` | `true` | Use PGMQ's LISTEN/NOTIFY for instant wake-up |
611
+ | `prefetch_limit` | `nil` | Max in-flight messages per worker (nil = unlimited) |
339
612
  | `dispatch_interval` | `1.0` | Seconds between dispatcher maintenance ticks |
613
+ | `circuit_breaker_enabled` | `true` | Enable auto-pause on consecutive failures |
614
+ | `circuit_breaker_threshold` | `5` | Consecutive failures before tripping |
615
+ | `circuit_breaker_base_backoff` | `30` | Base backoff seconds (doubles per trip) |
616
+ | `circuit_breaker_max_backoff` | `600` | Max backoff cap in seconds |
617
+ | `priority_levels` | `nil` | Number of priority sub-queues (nil = disabled, 2-10) |
618
+ | `default_priority` | `1` | Default priority for jobs without explicit priority |
619
+ | `archive_retention` | `604800` | Seconds to keep archived messages (7 days) |
620
+ | `archive_compaction_interval` | `3600` | Seconds between archive cleanup runs |
621
+ | `archive_compaction_batch_size` | `1000` | Rows deleted per batch during compaction |
622
+ | `outbox_enabled` | `false` | Enable transactional outbox poller process |
623
+ | `outbox_poll_interval` | `1.0` | Seconds between outbox poll cycles |
624
+ | `outbox_batch_size` | `100` | Max entries per outbox poll cycle |
625
+ | `outbox_retention` | `86400` | Seconds to keep published outbox entries (1 day) |
340
626
  | `idempotency_ttl` | `604800` | Seconds to keep processed event records (7 days, cleaned hourly) |
341
627
  | `web_auth` | `nil` | Lambda for dashboard authentication |
342
628
  | `web_refresh_interval` | `5000` | Dashboard auto-refresh interval in milliseconds |
343
629
  | `web_live_updates` | `true` | Enable Turbo Frames auto-refresh on dashboard |
630
+ | `stats_enabled` | `true` | Record job execution stats for insights dashboard |
631
+ | `stats_retention` | `604800` | Seconds to keep job stats (7 days) |
344
632
 
345
633
  ## Architecture
346
634
 
347
635
  ```text
348
636
  Supervisor (fork manager)
349
- ├── Worker 1 (queues: [default, mailers], threads: 10)
350
- ├── Worker 2 (queues: [critical], threads: 5)
351
- ├── Dispatcher (maintenance: idempotency cleanup, stale process reaping)
352
- └── Consumer (event bus topics)
637
+ ├── Worker 1 (queues: [default, mailers], threads: 10, priority: 10)
638
+ ├── Worker 2 (queues: [critical], threads: 5, single_active_consumer: true)
639
+ ├── Dispatcher (maintenance: cleanup, compaction, reaping, circuit breaker)
640
+ ├── Scheduler (recurring tasks via cron)
641
+ ├── Consumer (event bus topics)
642
+ └── Outbox Poller (transactional outbox → PGMQ, when enabled)
353
643
 
354
644
  PostgreSQL + PGMQ
355
645
  ├── pgbus_default (job queue)
356
646
  ├── pgbus_default_dlq (dead letter queue)
357
647
  ├── pgbus_critical (job queue)
358
648
  ├── pgbus_critical_dlq (dead letter queue)
359
- └── pgbus_mailers (job queue)
649
+ ├── pgbus_mailers (job queue)
650
+ └── pgbus_queue_states (pause/resume + circuit breaker state)
360
651
  ```
361
652
 
362
653
  ### How it works
@@ -395,15 +686,33 @@ pgbus help # Show help
395
686
 
396
687
  The dashboard is a mountable Rails engine at `/pgbus` with:
397
688
 
398
- - **Overview** -- queue depths, enqueued count, active processes, failure count
399
- - **Queues** -- per-queue metrics, purge actions
689
+ - **Overview** -- queue depths, enqueued count, active processes, failure count, throughput rate
690
+ - **Queues** -- per-queue metrics, purge/pause/resume actions
400
691
  - **Jobs** -- enqueued and failed jobs, retry/discard actions
401
692
  - **Dead letter** -- DLQ messages with retry/discard, bulk actions
402
693
  - **Processes** -- active workers/dispatcher/consumers with heartbeat status
403
694
  - **Events** -- registered subscribers and processed events
695
+ - **Outbox** -- transactional outbox entries pending publication
696
+ - **Locks** -- active job uniqueness locks with state (queued/executing), owner PID@hostname, age
697
+ - **Insights** -- throughput chart (jobs/min), status distribution donut, slowest job classes table
404
698
 
405
699
  All tables use Turbo Frames for periodic auto-refresh without page reloads.
406
700
 
701
+ ### Dark mode
702
+
703
+ The dashboard supports dark mode via Tailwind CSS `dark:` classes. It respects your system preference on first visit and persists your choice via localStorage. Toggle with the sun/moon button in the nav bar.
704
+
705
+ ### Job stats and insights
706
+
707
+ The executor records every job completion to `pgbus_job_stats` (job class, queue, status, duration). The insights page visualizes this data with ApexCharts (loaded via CDN, zero npm dependencies).
708
+
709
+ ```bash
710
+ rails generate pgbus:add_job_stats # Add the stats migration
711
+ rails generate pgbus:add_job_stats --database=pgbus
712
+ ```
713
+
714
+ Stats collection is enabled by default (`config.stats_enabled = true`). Old stats are cleaned up by the dispatcher based on `config.stats_retention` (default: 7 days). If the migration hasn't been run yet, stat recording is silently skipped.
715
+
407
716
  ## Database tables
408
717
 
409
718
  Pgbus uses these tables (created via PGMQ and migrations):
@@ -411,13 +720,19 @@ Pgbus uses these tables (created via PGMQ and migrations):
411
720
  | Table | Purpose |
412
721
  |-------|---------|
413
722
  | `q_pgbus_*` | PGMQ job queues (managed by PGMQ) |
414
- | `a_pgbus_*` | PGMQ archive tables (managed by PGMQ) |
723
+ | `a_pgbus_*` | PGMQ archive tables (managed by PGMQ, compacted by dispatcher) |
415
724
  | `pgbus_processes` | Heartbeat tracking for workers/dispatcher/consumers |
416
725
  | `pgbus_failed_events` | Failed event dispatch records |
417
726
  | `pgbus_processed_events` | Idempotency deduplication (event_id, handler_class) |
418
727
  | `pgbus_semaphores` | Concurrency control counting semaphores |
419
728
  | `pgbus_blocked_executions` | Jobs waiting for a concurrency semaphore slot |
420
729
  | `pgbus_batches` | Batch tracking with job counters and callback config |
730
+ | `pgbus_job_locks` | Job uniqueness locks (state, owner_pid, reaper correlation) |
731
+ | `pgbus_job_stats` | Job execution metrics (class, queue, status, duration) |
732
+ | `pgbus_queue_states` | Queue pause/resume and circuit breaker state |
733
+ | `pgbus_outbox_entries` | Transactional outbox entries pending publication |
734
+ | `pgbus_recurring_tasks` | Recurring job definitions |
735
+ | `pgbus_recurring_executions` | Recurring job execution history |
421
736
 
422
737
  ## Switching from another backend
423
738
 
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pgbus
4
+ module Api
5
+ class InsightsController < ApplicationController
6
+ def show
7
+ render json: {
8
+ summary: data_source.job_stats_summary,
9
+ throughput: data_source.job_throughput,
10
+ status_counts: data_source.job_status_counts,
11
+ slowest: data_source.slowest_job_classes
12
+ }
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pgbus
4
+ class InsightsController < ApplicationController
5
+ def show
6
+ @summary = data_source.job_stats_summary
7
+ @slowest = data_source.slowest_job_classes
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pgbus
4
+ class LocksController < ApplicationController
5
+ def index
6
+ @locks = data_source.job_locks
7
+ end
8
+ end
9
+ end
@@ -45,6 +45,34 @@ module Pgbus
45
45
  end
46
46
  end
47
47
 
48
+ def pgbus_duration(seconds)
49
+ return "—" unless seconds
50
+
51
+ seconds = seconds.to_i
52
+ if seconds < 60
53
+ "#{seconds}s"
54
+ elsif seconds < 3600
55
+ "#{seconds / 60}m #{seconds % 60}s"
56
+ elsif seconds < 86_400
57
+ "#{seconds / 3600}h #{(seconds % 3600) / 60}m"
58
+ else
59
+ "#{seconds / 86_400}d #{(seconds % 86_400) / 3600}h"
60
+ end
61
+ end
62
+
63
+ def pgbus_ms_duration(millis)
64
+ return "—" unless millis
65
+
66
+ millis = millis.to_i
67
+ if millis < 1000
68
+ "#{millis}ms"
69
+ elsif millis < 60_000
70
+ "#{(millis / 1000.0).round(1)}s"
71
+ else
72
+ "#{(millis / 60_000.0).round(1)}m"
73
+ end
74
+ end
75
+
48
76
  def pgbus_paused_badge(paused)
49
77
  return unless paused
50
78
 
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pgbus
4
+ class JobLock < Pgbus::ApplicationRecord
5
+ self.table_name = "pgbus_job_locks"
6
+
7
+ # States:
8
+ # queued — lock held from enqueue time (:until_executed), no worker yet
9
+ # executing — lock held by an active worker process
10
+ STATES = %w[queued executing].freeze
11
+
12
+ scope :executing, -> { where(state: "executing") }
13
+ scope :queued_locks, -> { where(state: "queued") }
14
+ scope :expired, ->(now = Time.current) { where("expires_at < ?", now) }
15
+
16
+ # Atomically try to acquire a lock.
17
+ # Cleans up expired locks for this key first (crash recovery at acquire time).
18
+ # Returns true if acquired, false if already locked.
19
+ def self.acquire!(lock_key, job_class:, ttl:, job_id: nil, state: "queued", owner_pid: nil, owner_hostname: nil)
20
+ # Remove any expired lock for this key inline (last-resort TTL recovery)
21
+ where(lock_key: lock_key).where("expires_at < ?", Time.current).delete_all
22
+
23
+ result = insert(
24
+ {
25
+ lock_key: lock_key, job_class: job_class, job_id: job_id,
26
+ state: state, owner_pid: owner_pid, owner_hostname: owner_hostname,
27
+ expires_at: Time.current + ttl
28
+ },
29
+ unique_by: :lock_key
30
+ )
31
+ result.rows.any?
32
+ rescue ActiveRecord::RecordNotUnique
33
+ false
34
+ end
35
+
36
+ # Transition a queued lock to executing state and claim ownership.
37
+ # Called when a worker starts executing a job that was locked at enqueue time.
38
+ def self.claim_for_execution!(lock_key, owner_pid:, owner_hostname:, ttl:)
39
+ where(lock_key: lock_key).update_all(
40
+ state: "executing",
41
+ owner_pid: owner_pid,
42
+ owner_hostname: owner_hostname,
43
+ expires_at: Time.current + ttl
44
+ )
45
+ end
46
+
47
+ # Release a lock by key.
48
+ def self.release!(lock_key)
49
+ where(lock_key: lock_key).delete_all
50
+ end
51
+
52
+ # Check if a lock is currently held (regardless of expiry — reaper handles orphans).
53
+ def self.locked?(lock_key)
54
+ where(lock_key: lock_key).exists?
55
+ end
56
+
57
+ # Reap orphaned locks: locks in 'executing' state whose owner_pid
58
+ # has no healthy entry in pgbus_processes.
59
+ # Returns the number of orphaned locks released.
60
+ # Reap orphaned locks by matching (pid, hostname) against live process entries.
61
+ # A lock is orphaned if no healthy process exists with the same pid AND hostname.
62
+ def self.reap_orphaned!
63
+ alive_workers = ProcessEntry
64
+ .where("last_heartbeat_at >= ?", Time.current - Process::Heartbeat::ALIVE_THRESHOLD)
65
+ .pluck(:pid, :hostname)
66
+
67
+ orphaned = executing.select do |lock|
68
+ alive_workers.none? { |pid, hostname| pid == lock.owner_pid && hostname == lock.owner_hostname }
69
+ end
70
+
71
+ return 0 if orphaned.empty?
72
+
73
+ where(id: orphaned.map(&:id)).delete_all
74
+ end
75
+
76
+ # Last-resort cleanup: delete locks whose expires_at has passed.
77
+ # This only fires when the reaper itself can't run (e.g., entire supervisor dead).
78
+ def self.cleanup_expired!
79
+ expired.delete_all
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pgbus
4
+ class JobStat < Pgbus::ApplicationRecord
5
+ self.table_name = "pgbus_job_stats"
6
+
7
+ scope :since, ->(time) { where("created_at >= ?", time) }
8
+ scope :successful, -> { where(status: "success") }
9
+ scope :failed, -> { where(status: "failed") }
10
+ scope :dead_lettered, -> { where(status: "dead_lettered") }
11
+
12
+ # Record a job execution stat. Called by the executor after each job.
13
+ def self.record!(job_class:, queue_name:, status:, duration_ms:)
14
+ return unless table_exists?
15
+
16
+ create!(
17
+ job_class: job_class,
18
+ queue_name: queue_name,
19
+ status: status,
20
+ duration_ms: duration_ms
21
+ )
22
+ rescue StandardError => e
23
+ Pgbus.logger.debug { "[Pgbus] Failed to record job stat: #{e.message}" }
24
+ end
25
+
26
+ # Memoized — intentionally never invalidated at runtime. If the
27
+ # pgbus_job_stats migration runs while the app is already running,
28
+ # a restart is required for stat recording to begin.
29
+ def self.table_exists?
30
+ return @table_exists if defined?(@table_exists)
31
+
32
+ @table_exists = connection.table_exists?(table_name)
33
+ rescue StandardError
34
+ @table_exists = false
35
+ end
36
+
37
+ # Throughput: jobs per minute bucketed by minute for the last N minutes
38
+ def self.throughput(minutes: 60)
39
+ since(minutes.minutes.ago)
40
+ .group("date_trunc('minute', created_at)")
41
+ .order(Arel.sql("date_trunc('minute', created_at)"))
42
+ .count
43
+ end
44
+
45
+ # Average duration by job class
46
+ def self.avg_duration_by_class(minutes: 60)
47
+ since(minutes.minutes.ago)
48
+ .group(:job_class)
49
+ .order(Arel.sql("AVG(duration_ms) DESC"))
50
+ .average(:duration_ms)
51
+ end
52
+
53
+ # Success/fail/DLQ counts
54
+ def self.status_counts(minutes: 60)
55
+ since(minutes.minutes.ago).group(:status).count
56
+ end
57
+
58
+ # Top N slowest job classes by average duration
59
+ def self.slowest_classes(limit: 10, minutes: 60)
60
+ since(minutes.minutes.ago)
61
+ .group(:job_class)
62
+ .order(Arel.sql("AVG(duration_ms) DESC"))
63
+ .limit(limit)
64
+ .pluck(:job_class, Arel.sql("COUNT(*)"), Arel.sql("ROUND(AVG(duration_ms))"), Arel.sql("MAX(duration_ms)"))
65
+ .map { |cls, count, avg, max| { job_class: cls, count: count.to_i, avg_ms: avg.to_i, max_ms: max.to_i } }
66
+ end
67
+
68
+ # Single-query aggregate summary using conditional counts.
69
+ def self.summary(minutes: 60)
70
+ row = since(minutes.minutes.ago).pick(
71
+ Arel.sql("COUNT(*)"),
72
+ Arel.sql("COUNT(*) FILTER (WHERE status = 'success')"),
73
+ Arel.sql("COUNT(*) FILTER (WHERE status = 'failed')"),
74
+ Arel.sql("COUNT(*) FILTER (WHERE status = 'dead_lettered')"),
75
+ Arel.sql("ROUND(AVG(duration_ms)::numeric, 1)"),
76
+ Arel.sql("MAX(duration_ms)")
77
+ )
78
+
79
+ {
80
+ total: row[0].to_i,
81
+ success: row[1].to_i,
82
+ failed: row[2].to_i,
83
+ dead_lettered: row[3].to_i,
84
+ avg_duration_ms: row[4]&.to_f || 0,
85
+ max_duration_ms: row[5].to_i
86
+ }
87
+ end
88
+
89
+ # Cleanup old stats
90
+ def self.cleanup!(older_than:)
91
+ where("created_at < ?", older_than).delete_all
92
+ end
93
+ end
94
+ end