pgbus 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +326 -11
- data/app/controllers/pgbus/api/insights_controller.rb +16 -0
- data/app/controllers/pgbus/insights_controller.rb +10 -0
- data/app/controllers/pgbus/locks_controller.rb +9 -0
- data/app/helpers/pgbus/application_helper.rb +28 -0
- data/app/models/pgbus/job_lock.rb +82 -0
- data/app/models/pgbus/job_stat.rb +94 -0
- data/app/views/layouts/pgbus/application.html.erb +32 -8
- data/app/views/pgbus/dashboard/_stats_cards.html.erb +20 -20
- data/app/views/pgbus/insights/show.html.erb +161 -0
- data/app/views/pgbus/locks/index.html.erb +53 -0
- data/config/routes.rb +3 -0
- data/lib/generators/pgbus/add_job_locks_generator.rb +52 -0
- data/lib/generators/pgbus/add_job_stats_generator.rb +52 -0
- data/lib/generators/pgbus/add_outbox_generator.rb +1 -1
- data/lib/generators/pgbus/add_queue_states_generator.rb +1 -1
- data/lib/generators/pgbus/add_recurring_generator.rb +1 -1
- data/lib/generators/pgbus/install_generator.rb +1 -1
- data/lib/generators/pgbus/templates/add_job_locks.rb.erb +21 -0
- data/lib/generators/pgbus/templates/add_job_stats.rb.erb +18 -0
- data/lib/generators/pgbus/upgrade_pgmq_generator.rb +1 -1
- data/lib/pgbus/active_job/adapter.rb +58 -4
- data/lib/pgbus/active_job/executor.rb +45 -0
- data/lib/pgbus/client.rb +8 -22
- data/lib/pgbus/configuration.rb +6 -0
- data/lib/pgbus/engine.rb +1 -0
- data/lib/pgbus/process/consumer_priority.rb +64 -0
- data/lib/pgbus/process/dispatcher.rb +29 -0
- data/lib/pgbus/process/queue_lock.rb +87 -0
- data/lib/pgbus/process/supervisor.rb +6 -1
- data/lib/pgbus/process/wake_signal.rb +53 -0
- data/lib/pgbus/process/worker.rb +36 -6
- data/lib/pgbus/queue_factory.rb +62 -0
- data/lib/pgbus/uniqueness.rb +169 -0
- data/lib/pgbus/version.rb +1 -1
- data/lib/pgbus/web/data_source.rb +49 -0
- data/lib/pgbus.rb +1 -0
- metadata +17 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c99eb0bd5a09f7396e64468e30ece233f9a2274882c44b3c836139dadad414df
|
|
4
|
+
data.tar.gz: 4a1e1264977bbb87db1c771e8364bf91e86f906c3d2f7a7d2e892730ce4634a1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 875fb38641995f8d8af516fabd378e363c2addbc94790507a31d4bb7e9b4c24111b8c748185f30d9d56a9b50beb77d88b70d2bd977f2d6d05794c61be4ec1fca
|
|
7
|
+
data.tar.gz: abd4b1d81e543019ad8bbc5d3b16698c2845fe45abe0e7c96bc33b10298d6d165ca64b54a83354f958adb8e1b32abbad817de469986d5ccaf8cc02d5e9ccc56c
|
data/README.md
CHANGED
|
@@ -14,6 +14,14 @@ PostgreSQL-native job processing and event bus for Rails, built on [PGMQ](https:
|
|
|
14
14
|
- [Quick start](#quick-start)
|
|
15
15
|
- [Concurrency controls](#concurrency-controls)
|
|
16
16
|
- [Batches](#batches)
|
|
17
|
+
- [Job uniqueness](#job-uniqueness)
|
|
18
|
+
- [Priority queues](#priority-queues)
|
|
19
|
+
- [Single active consumer](#single-active-consumer)
|
|
20
|
+
- [Consumer priority](#consumer-priority)
|
|
21
|
+
- [Circuit breaker and queue pause/resume](#circuit-breaker-and-queue-pauseresume)
|
|
22
|
+
- [Prefetch flow control](#prefetch-flow-control)
|
|
23
|
+
- [Transactional outbox](#transactional-outbox)
|
|
24
|
+
- [Archive compaction](#archive-compaction)
|
|
17
25
|
- [Configuration reference](#configuration-reference)
|
|
18
26
|
- [Architecture](#architecture)
|
|
19
27
|
- [CLI](#cli)
|
|
@@ -30,9 +38,18 @@ PostgreSQL-native job processing and event bus for Rails, built on [PGMQ](https:
|
|
|
30
38
|
- **Dead letter queues** -- automatic DLQ routing after configurable retries
|
|
31
39
|
- **Worker recycling** -- memory, job count, and lifetime limits prevent runaway processes
|
|
32
40
|
- **LISTEN/NOTIFY** -- instant wake-up, polling as fallback only
|
|
33
|
-
- **Idempotent events** -- deduplication via `(event_id, handler_class)` unique index
|
|
34
|
-
- **Live dashboard** -- Turbo Frames auto-refresh, no ActionCable required
|
|
35
|
-
- **Supervisor/worker model** -- forked processes with heartbeat monitoring
|
|
41
|
+
- **Idempotent events** -- deduplication via `(event_id, handler_class)` unique index with in-memory cache
|
|
42
|
+
- **Live dashboard** -- Turbo Frames auto-refresh with throughput rate, no ActionCable required
|
|
43
|
+
- **Supervisor/worker model** -- forked processes with heartbeat monitoring and lifecycle state machine
|
|
44
|
+
- **Priority queues** -- route jobs to priority sub-queues, highest-priority-first processing
|
|
45
|
+
- **Circuit breaker** -- auto-pause queues after consecutive failures, exponential backoff
|
|
46
|
+
- **Queue pause/resume** -- manual or automatic via dashboard
|
|
47
|
+
- **Prefetch flow control** -- cap in-flight messages per worker to prevent overload
|
|
48
|
+
- **Archive compaction** -- automatic purge of old archived messages
|
|
49
|
+
- **Transactional outbox** -- publish events atomically inside database transactions
|
|
50
|
+
- **Single active consumer** -- advisory-lock-based exclusive queue processing for strict ordering
|
|
51
|
+
- **Consumer priority** -- higher-priority workers get first dibs, lower-priority workers back off
|
|
52
|
+
- **Job uniqueness** -- prevent duplicate jobs with reaper-based crash recovery, no TTL-driven expiry
|
|
36
53
|
|
|
37
54
|
## Requirements
|
|
38
55
|
|
|
@@ -66,11 +83,17 @@ production:
|
|
|
66
83
|
default_queue: default
|
|
67
84
|
pool_size: 10
|
|
68
85
|
max_retries: 5
|
|
86
|
+
prefetch_limit: 20
|
|
69
87
|
workers:
|
|
70
88
|
- queues: [default, mailers]
|
|
71
89
|
threads: 10
|
|
90
|
+
consumer_priority: 10
|
|
72
91
|
- queues: [critical]
|
|
73
92
|
threads: 5
|
|
93
|
+
single_active_consumer: true
|
|
94
|
+
- queues: [default, mailers]
|
|
95
|
+
threads: 5
|
|
96
|
+
consumer_priority: 0 # fallback worker
|
|
74
97
|
event_consumers:
|
|
75
98
|
- queues: [orders, payments]
|
|
76
99
|
threads: 5
|
|
@@ -319,6 +342,255 @@ end
|
|
|
319
342
|
4. When `completed_jobs + discarded_jobs == total_jobs`, the batch status flips to `"finished"` and callback jobs are enqueued
|
|
320
343
|
5. The dispatcher cleans up finished batches older than 7 days
|
|
321
344
|
|
|
345
|
+
## Job uniqueness
|
|
346
|
+
|
|
347
|
+
Prevent duplicate jobs from running. Unlike `limits_concurrency` (which controls *how many* jobs with the same key run), uniqueness guarantees *at most one* job with a given key exists in the system at any time.
|
|
348
|
+
|
|
349
|
+
```ruby
|
|
350
|
+
class ImportOrderJob < ApplicationJob
|
|
351
|
+
ensures_uniqueness strategy: :until_executed,
|
|
352
|
+
key: ->(order_id) { "import-order-#{order_id}" },
|
|
353
|
+
on_conflict: :reject
|
|
354
|
+
|
|
355
|
+
def perform(order_id)
|
|
356
|
+
# Only ONE instance per order_id can exist — from enqueue through completion.
|
|
357
|
+
# If another ImportOrderJob for this order_id is already enqueued or running,
|
|
358
|
+
# the duplicate is rejected immediately.
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
### Strategies
|
|
364
|
+
|
|
365
|
+
| Strategy | Lock acquired | Lock released | Prevents |
|
|
366
|
+
|----------|--------------|---------------|----------|
|
|
367
|
+
| `:until_executed` | At enqueue | On completion or DLQ | Duplicate enqueue AND execution |
|
|
368
|
+
| `:while_executing` | At execution start | On completion or DLQ | Duplicate execution only |
|
|
369
|
+
|
|
370
|
+
### Conflict policies
|
|
371
|
+
|
|
372
|
+
| Policy | Behavior |
|
|
373
|
+
|--------|----------|
|
|
374
|
+
| `:reject` | Raise `Pgbus::JobNotUnique` (default) |
|
|
375
|
+
| `:discard` | Silently drop the duplicate |
|
|
376
|
+
| `:log` | Log a warning and drop |
|
|
377
|
+
|
|
378
|
+
### Lock lifecycle
|
|
379
|
+
|
|
380
|
+
The lock is **never released by a timer**. It is held as long as the job exists in the system:
|
|
381
|
+
|
|
382
|
+
```text
|
|
383
|
+
Enqueue ──→ pgbus_job_locks (state: queued, owner_pid: nil)
|
|
384
|
+
│
|
|
385
|
+
Worker picks up job
|
|
386
|
+
│
|
|
387
|
+
▼
|
|
388
|
+
claim_for_execution! (state: executing, owner_pid: PID)
|
|
389
|
+
│
|
|
390
|
+
┌───────┴───────┐
|
|
391
|
+
▼ ▼
|
|
392
|
+
Success Crash
|
|
393
|
+
release! (lock orphaned)
|
|
394
|
+
(row deleted) │
|
|
395
|
+
▼
|
|
396
|
+
Reaper checks:
|
|
397
|
+
Is owner_pid in pgbus_processes
|
|
398
|
+
with fresh heartbeat?
|
|
399
|
+
│
|
|
400
|
+
┌─────┴─────┐
|
|
401
|
+
No Yes
|
|
402
|
+
▼ ▼
|
|
403
|
+
release! (keep lock,
|
|
404
|
+
(orphaned) job is running)
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
**Crash recovery** works through the reaper (runs every 5 minutes in the dispatcher). It cross-references `owner_pid` in `pgbus_job_locks` against `pgbus_processes` heartbeats. If the owning worker has no fresh heartbeat, the lock is orphaned and released — the PGMQ message's visibility timeout will expire and the job will be retried by another worker.
|
|
408
|
+
|
|
409
|
+
A last-resort TTL (default 24 hours) handles the case where the entire pgbus supervisor is dead and the reaper itself can't run.
|
|
410
|
+
|
|
411
|
+
### Uniqueness vs concurrency controls
|
|
412
|
+
|
|
413
|
+
| | `ensures_uniqueness` | `limits_concurrency` |
|
|
414
|
+
|---|---|---|
|
|
415
|
+
| **Purpose** | Prevent duplicate jobs | Limit concurrent execution slots |
|
|
416
|
+
| **Lock type** | Binary lock (one or none) | Counting semaphore (up to N) |
|
|
417
|
+
| **At enqueue** | `:until_executed` blocks duplicates | Checks semaphore, blocks/discards/raises |
|
|
418
|
+
| **At execution** | `:while_executing` blocks duplicate runs | Not checked (semaphore acquired at enqueue) |
|
|
419
|
+
| **Duplicate in queue** | `:until_executed`: impossible. `:while_executing`: allowed, only one runs | Allowed up to N, rest blocked |
|
|
420
|
+
| **Crash recovery** | Reaper checks heartbeats | Semaphore `expires_at` + dispatcher cleanup |
|
|
421
|
+
| **Use when** | "This exact job must not run twice" | "At most N of these can run at once" |
|
|
422
|
+
|
|
423
|
+
**When to use which:**
|
|
424
|
+
- Payment processing, order import, unique email sends → `ensures_uniqueness`
|
|
425
|
+
- Rate-limited API calls, resource-constrained tasks → `limits_concurrency`
|
|
426
|
+
- Both at once → combine them (they use separate tables, no conflicts)
|
|
427
|
+
|
|
428
|
+
### Setup
|
|
429
|
+
|
|
430
|
+
```bash
|
|
431
|
+
rails generate pgbus:add_job_locks # Add the migration
|
|
432
|
+
rails generate pgbus:add_job_locks --database=pgbus # For separate database
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
## Priority queues
|
|
436
|
+
|
|
437
|
+
Route jobs to priority sub-queues so high-priority work is processed first:
|
|
438
|
+
|
|
439
|
+
```ruby
|
|
440
|
+
Pgbus.configure do |config|
|
|
441
|
+
config.priority_levels = 3 # Creates _p0, _p1, _p2 sub-queues per logical queue
|
|
442
|
+
config.default_priority = 1 # Jobs without explicit priority go to _p1
|
|
443
|
+
end
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
Workers read from `_p0` (highest) first, then `_p1`, then `_p2`. Only when higher-priority sub-queues are empty does the worker read from lower ones.
|
|
447
|
+
|
|
448
|
+
Use ActiveJob's built-in `priority` attribute:
|
|
449
|
+
|
|
450
|
+
```ruby
|
|
451
|
+
class CriticalAlertJob < ApplicationJob
|
|
452
|
+
queue_as :default
|
|
453
|
+
queue_with_priority 0 # Highest priority
|
|
454
|
+
|
|
455
|
+
def perform(alert_id)
|
|
456
|
+
# ...
|
|
457
|
+
end
|
|
458
|
+
end
|
|
459
|
+
|
|
460
|
+
class ReportJob < ApplicationJob
|
|
461
|
+
queue_as :default
|
|
462
|
+
queue_with_priority 2 # Lowest priority
|
|
463
|
+
|
|
464
|
+
def perform(report_id)
|
|
465
|
+
# ...
|
|
466
|
+
end
|
|
467
|
+
end
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
When `priority_levels` is `nil` (default), priority queues are disabled and all jobs go to a single queue per logical name.
|
|
471
|
+
|
|
472
|
+
## Single active consumer
|
|
473
|
+
|
|
474
|
+
For queues that require strict ordering, enable single active consumer mode. Only one worker process can read from a queue at a time -- others skip it and process other queues.
|
|
475
|
+
|
|
476
|
+
```yaml
|
|
477
|
+
# config/pgbus.yml
|
|
478
|
+
production:
|
|
479
|
+
workers:
|
|
480
|
+
- queues: [ordered_events]
|
|
481
|
+
threads: 1
|
|
482
|
+
single_active_consumer: true
|
|
483
|
+
- queues: [ordered_events]
|
|
484
|
+
threads: 1
|
|
485
|
+
single_active_consumer: true # Standby — takes over if the first worker dies
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
Uses PostgreSQL session-level advisory locks (`pg_try_advisory_lock`). The lock is non-blocking -- workers that can't acquire it simply skip the queue. Locks auto-release on connection close (including crashes), so failover is automatic.
|
|
489
|
+
|
|
490
|
+
## Consumer priority
|
|
491
|
+
|
|
492
|
+
When multiple workers subscribe to the same queues, higher-priority workers process messages first. Lower-priority workers back off (3x polling interval) when a higher-priority worker is active.
|
|
493
|
+
|
|
494
|
+
```yaml
|
|
495
|
+
# config/pgbus.yml
|
|
496
|
+
production:
|
|
497
|
+
workers:
|
|
498
|
+
- queues: [default]
|
|
499
|
+
threads: 10
|
|
500
|
+
consumer_priority: 10 # Primary — polls at base interval
|
|
501
|
+
- queues: [default]
|
|
502
|
+
threads: 5
|
|
503
|
+
consumer_priority: 0 # Fallback — polls at 3x interval when primary is healthy
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
Priority is stored in heartbeat metadata. Workers check the `pgbus_processes` table to discover higher-priority peers. When a high-priority worker goes stale (no heartbeat for 5 minutes), lower-priority workers automatically resume normal polling.
|
|
507
|
+
|
|
508
|
+
## Circuit breaker and queue pause/resume
|
|
509
|
+
|
|
510
|
+
Pgbus automatically pauses queues that fail repeatedly, preventing cascading failures.
|
|
511
|
+
|
|
512
|
+
```ruby
|
|
513
|
+
Pgbus.configure do |config|
|
|
514
|
+
config.circuit_breaker_enabled = true # default
|
|
515
|
+
config.circuit_breaker_threshold = 5 # consecutive failures before tripping
|
|
516
|
+
config.circuit_breaker_base_backoff = 30 # seconds (doubles per trip)
|
|
517
|
+
config.circuit_breaker_max_backoff = 600 # 10 minute cap
|
|
518
|
+
end
|
|
519
|
+
```
|
|
520
|
+
|
|
521
|
+
When a queue hits the failure threshold:
|
|
522
|
+
1. The circuit breaker **auto-pauses** the queue with exponential backoff
|
|
523
|
+
2. After the backoff expires, the queue **auto-resumes** and the trip counter resets
|
|
524
|
+
3. If failures continue, each trip doubles the backoff (capped at `max_backoff`)
|
|
525
|
+
|
|
526
|
+
You can also **manually pause/resume** queues from the dashboard. The pause state is stored in the `pgbus_queue_states` table and survives restarts.
|
|
527
|
+
|
|
528
|
+
```bash
|
|
529
|
+
rails generate pgbus:add_queue_states # Add the queue_states migration
|
|
530
|
+
rails generate pgbus:add_queue_states --database=pgbus # For separate database
|
|
531
|
+
```
|
|
532
|
+
|
|
533
|
+
## Prefetch flow control
|
|
534
|
+
|
|
535
|
+
Cap the number of in-flight (claimed but unfinished) messages per worker:
|
|
536
|
+
|
|
537
|
+
```ruby
|
|
538
|
+
Pgbus.configure do |config|
|
|
539
|
+
config.prefetch_limit = 20 # nil = unlimited (default)
|
|
540
|
+
end
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
The worker tracks in-flight messages with an atomic counter and only fetches `min(idle_threads, prefetch_available)` messages per cycle. The counter is decremented in an `ensure` block so it never gets stuck.
|
|
544
|
+
|
|
545
|
+
## Transactional outbox
|
|
546
|
+
|
|
547
|
+
Publish events atomically inside your database transactions. A background poller moves outbox entries to PGMQ.
|
|
548
|
+
|
|
549
|
+
```bash
|
|
550
|
+
rails generate pgbus:add_outbox # Add the outbox migration
|
|
551
|
+
rails generate pgbus:add_outbox --database=pgbus # For separate database
|
|
552
|
+
```
|
|
553
|
+
|
|
554
|
+
```ruby
|
|
555
|
+
Pgbus.configure do |config|
|
|
556
|
+
config.outbox_enabled = true
|
|
557
|
+
config.outbox_poll_interval = 1.0 # seconds
|
|
558
|
+
config.outbox_batch_size = 100
|
|
559
|
+
config.outbox_retention = 24 * 3600 # keep published entries for 24h
|
|
560
|
+
end
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
Usage:
|
|
564
|
+
|
|
565
|
+
```ruby
|
|
566
|
+
ActiveRecord::Base.transaction do
|
|
567
|
+
order = Order.create!(params)
|
|
568
|
+
|
|
569
|
+
# Published atomically with the order — if the transaction rolls back,
|
|
570
|
+
# the outbox entry is also rolled back. No lost or phantom events.
|
|
571
|
+
Pgbus::Outbox.publish("default", { order_id: order.id })
|
|
572
|
+
|
|
573
|
+
# For topic-based event bus:
|
|
574
|
+
Pgbus::Outbox.publish_event("orders.created", { order_id: order.id })
|
|
575
|
+
end
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
The outbox poller uses `FOR UPDATE SKIP LOCKED` inside a transaction to claim entries, publishes them to PGMQ, and marks them as published. Failed entries are skipped and retried next cycle.
|
|
579
|
+
|
|
580
|
+
## Archive compaction
|
|
581
|
+
|
|
582
|
+
PGMQ archive tables grow unbounded. Pgbus automatically purges old entries:
|
|
583
|
+
|
|
584
|
+
```ruby
|
|
585
|
+
Pgbus.configure do |config|
|
|
586
|
+
config.archive_retention = 7 * 24 * 3600 # 7 days (default)
|
|
587
|
+
config.archive_compaction_interval = 3600 # run every hour (default)
|
|
588
|
+
config.archive_compaction_batch_size = 1000 # delete in batches (default)
|
|
589
|
+
end
|
|
590
|
+
```
|
|
591
|
+
|
|
592
|
+
The dispatcher runs archive compaction as part of its maintenance loop, deleting archived messages older than `archive_retention` in batches to avoid long-running transactions.
|
|
593
|
+
|
|
322
594
|
## Configuration reference
|
|
323
595
|
|
|
324
596
|
| Option | Default | Description |
|
|
@@ -336,27 +608,46 @@ end
|
|
|
336
608
|
| `max_memory_mb` | `nil` | Recycle worker when memory exceeds N MB |
|
|
337
609
|
| `max_worker_lifetime` | `nil` | Recycle worker after N seconds |
|
|
338
610
|
| `listen_notify` | `true` | Use PGMQ's LISTEN/NOTIFY for instant wake-up |
|
|
611
|
+
| `prefetch_limit` | `nil` | Max in-flight messages per worker (nil = unlimited) |
|
|
339
612
|
| `dispatch_interval` | `1.0` | Seconds between dispatcher maintenance ticks |
|
|
613
|
+
| `circuit_breaker_enabled` | `true` | Enable auto-pause on consecutive failures |
|
|
614
|
+
| `circuit_breaker_threshold` | `5` | Consecutive failures before tripping |
|
|
615
|
+
| `circuit_breaker_base_backoff` | `30` | Base backoff seconds (doubles per trip) |
|
|
616
|
+
| `circuit_breaker_max_backoff` | `600` | Max backoff cap in seconds |
|
|
617
|
+
| `priority_levels` | `nil` | Number of priority sub-queues (nil = disabled, 2-10) |
|
|
618
|
+
| `default_priority` | `1` | Default priority for jobs without explicit priority |
|
|
619
|
+
| `archive_retention` | `604800` | Seconds to keep archived messages (7 days) |
|
|
620
|
+
| `archive_compaction_interval` | `3600` | Seconds between archive cleanup runs |
|
|
621
|
+
| `archive_compaction_batch_size` | `1000` | Rows deleted per batch during compaction |
|
|
622
|
+
| `outbox_enabled` | `false` | Enable transactional outbox poller process |
|
|
623
|
+
| `outbox_poll_interval` | `1.0` | Seconds between outbox poll cycles |
|
|
624
|
+
| `outbox_batch_size` | `100` | Max entries per outbox poll cycle |
|
|
625
|
+
| `outbox_retention` | `86400` | Seconds to keep published outbox entries (1 day) |
|
|
340
626
|
| `idempotency_ttl` | `604800` | Seconds to keep processed event records (7 days, cleaned hourly) |
|
|
341
627
|
| `web_auth` | `nil` | Lambda for dashboard authentication |
|
|
342
628
|
| `web_refresh_interval` | `5000` | Dashboard auto-refresh interval in milliseconds |
|
|
343
629
|
| `web_live_updates` | `true` | Enable Turbo Frames auto-refresh on dashboard |
|
|
630
|
+
| `stats_enabled` | `true` | Record job execution stats for insights dashboard |
|
|
631
|
+
| `stats_retention` | `604800` | Seconds to keep job stats (7 days) |
|
|
344
632
|
|
|
345
633
|
## Architecture
|
|
346
634
|
|
|
347
635
|
```text
|
|
348
636
|
Supervisor (fork manager)
|
|
349
|
-
├── Worker 1 (queues: [default, mailers], threads: 10)
|
|
350
|
-
├── Worker 2 (queues: [critical], threads: 5)
|
|
351
|
-
├── Dispatcher (maintenance:
|
|
352
|
-
|
|
637
|
+
├── Worker 1 (queues: [default, mailers], threads: 10, priority: 10)
|
|
638
|
+
├── Worker 2 (queues: [critical], threads: 5, single_active_consumer: true)
|
|
639
|
+
├── Dispatcher (maintenance: cleanup, compaction, reaping, circuit breaker)
|
|
640
|
+
├── Scheduler (recurring tasks via cron)
|
|
641
|
+
├── Consumer (event bus topics)
|
|
642
|
+
└── Outbox Poller (transactional outbox → PGMQ, when enabled)
|
|
353
643
|
|
|
354
644
|
PostgreSQL + PGMQ
|
|
355
645
|
├── pgbus_default (job queue)
|
|
356
646
|
├── pgbus_default_dlq (dead letter queue)
|
|
357
647
|
├── pgbus_critical (job queue)
|
|
358
648
|
├── pgbus_critical_dlq (dead letter queue)
|
|
359
|
-
|
|
649
|
+
├── pgbus_mailers (job queue)
|
|
650
|
+
└── pgbus_queue_states (pause/resume + circuit breaker state)
|
|
360
651
|
```
|
|
361
652
|
|
|
362
653
|
### How it works
|
|
@@ -395,15 +686,33 @@ pgbus help # Show help
|
|
|
395
686
|
|
|
396
687
|
The dashboard is a mountable Rails engine at `/pgbus` with:
|
|
397
688
|
|
|
398
|
-
- **Overview** -- queue depths, enqueued count, active processes, failure count
|
|
399
|
-
- **Queues** -- per-queue metrics, purge actions
|
|
689
|
+
- **Overview** -- queue depths, enqueued count, active processes, failure count, throughput rate
|
|
690
|
+
- **Queues** -- per-queue metrics, purge/pause/resume actions
|
|
400
691
|
- **Jobs** -- enqueued and failed jobs, retry/discard actions
|
|
401
692
|
- **Dead letter** -- DLQ messages with retry/discard, bulk actions
|
|
402
693
|
- **Processes** -- active workers/dispatcher/consumers with heartbeat status
|
|
403
694
|
- **Events** -- registered subscribers and processed events
|
|
695
|
+
- **Outbox** -- transactional outbox entries pending publication
|
|
696
|
+
- **Locks** -- active job uniqueness locks with state (queued/executing), owner PID@hostname, age
|
|
697
|
+
- **Insights** -- throughput chart (jobs/min), status distribution donut, slowest job classes table
|
|
404
698
|
|
|
405
699
|
All tables use Turbo Frames for periodic auto-refresh without page reloads.
|
|
406
700
|
|
|
701
|
+
### Dark mode
|
|
702
|
+
|
|
703
|
+
The dashboard supports dark mode via Tailwind CSS `dark:` classes. It respects your system preference on first visit and persists your choice via localStorage. Toggle with the sun/moon button in the nav bar.
|
|
704
|
+
|
|
705
|
+
### Job stats and insights
|
|
706
|
+
|
|
707
|
+
The executor records every job completion to `pgbus_job_stats` (job class, queue, status, duration). The insights page visualizes this data with ApexCharts (loaded via CDN, zero npm dependencies).
|
|
708
|
+
|
|
709
|
+
```bash
|
|
710
|
+
rails generate pgbus:add_job_stats # Add the stats migration
|
|
711
|
+
rails generate pgbus:add_job_stats --database=pgbus
|
|
712
|
+
```
|
|
713
|
+
|
|
714
|
+
Stats collection is enabled by default (`config.stats_enabled = true`). Old stats are cleaned up by the dispatcher based on `config.stats_retention` (default: 7 days). If the migration hasn't been run yet, stat recording is silently skipped.
|
|
715
|
+
|
|
407
716
|
## Database tables
|
|
408
717
|
|
|
409
718
|
Pgbus uses these tables (created via PGMQ and migrations):
|
|
@@ -411,13 +720,19 @@ Pgbus uses these tables (created via PGMQ and migrations):
|
|
|
411
720
|
| Table | Purpose |
|
|
412
721
|
|-------|---------|
|
|
413
722
|
| `q_pgbus_*` | PGMQ job queues (managed by PGMQ) |
|
|
414
|
-
| `a_pgbus_*` | PGMQ archive tables (managed by PGMQ) |
|
|
723
|
+
| `a_pgbus_*` | PGMQ archive tables (managed by PGMQ, compacted by dispatcher) |
|
|
415
724
|
| `pgbus_processes` | Heartbeat tracking for workers/dispatcher/consumers |
|
|
416
725
|
| `pgbus_failed_events` | Failed event dispatch records |
|
|
417
726
|
| `pgbus_processed_events` | Idempotency deduplication (event_id, handler_class) |
|
|
418
727
|
| `pgbus_semaphores` | Concurrency control counting semaphores |
|
|
419
728
|
| `pgbus_blocked_executions` | Jobs waiting for a concurrency semaphore slot |
|
|
420
729
|
| `pgbus_batches` | Batch tracking with job counters and callback config |
|
|
730
|
+
| `pgbus_job_locks` | Job uniqueness locks (state, owner_pid, reaper correlation) |
|
|
731
|
+
| `pgbus_job_stats` | Job execution metrics (class, queue, status, duration) |
|
|
732
|
+
| `pgbus_queue_states` | Queue pause/resume and circuit breaker state |
|
|
733
|
+
| `pgbus_outbox_entries` | Transactional outbox entries pending publication |
|
|
734
|
+
| `pgbus_recurring_tasks` | Recurring job definitions |
|
|
735
|
+
| `pgbus_recurring_executions` | Recurring job execution history |
|
|
421
736
|
|
|
422
737
|
## Switching from another backend
|
|
423
738
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgbus
  module Api
    # JSON endpoint backing the insights dashboard charts
    # (throughput, status distribution, slowest job classes).
    class InsightsController < ApplicationController
      # Renders every insights metric as a single JSON document.
      def show
        render json: insights_payload
      end

      private

      # Gathers all chart data from the dashboard data source into one hash.
      def insights_payload
        {
          summary: data_source.job_stats_summary,
          throughput: data_source.job_throughput,
          status_counts: data_source.job_status_counts,
          slowest: data_source.slowest_job_classes
        }
      end
    end
  end
end
|
|
@@ -45,6 +45,34 @@ module Pgbus
|
|
|
45
45
|
end
|
|
46
46
|
end
|
|
47
47
|
|
|
48
|
+
# Formats a second count as a short human-readable duration using the two
# most significant units: "45s", "1m 30s", "2h 5m", or "3d 12h".
# Returns an em dash when +seconds+ is nil.
def pgbus_duration(seconds)
  return "—" unless seconds

  total = seconds.to_i
  # Check from the largest unit down; integer division drops the remainder.
  if total >= 86_400
    "#{total / 86_400}d #{(total % 86_400) / 3600}h"
  elsif total >= 3600
    "#{total / 3600}h #{(total % 3600) / 60}m"
  elsif total >= 60
    "#{total / 60}m #{total % 60}s"
  else
    "#{total}s"
  end
end
|
|
62
|
+
|
|
63
|
+
# Formats a millisecond count for display: "500ms", "1.5s", or "2.3m".
# Sub-second values stay integral; larger values round to one decimal.
# Returns an em dash when +millis+ is nil.
def pgbus_ms_duration(millis)
  return "—" unless millis

  ms = millis.to_i
  # Largest-unit-first keeps each branch a single expression.
  if ms >= 60_000
    "#{(ms / 60_000.0).round(1)}m"
  elsif ms >= 1000
    "#{(ms / 1000.0).round(1)}s"
  else
    "#{ms}ms"
  end
end
|
|
75
|
+
|
|
48
76
|
def pgbus_paused_badge(paused)
|
|
49
77
|
return unless paused
|
|
50
78
|
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgbus
  # Row-per-key lock table backing job uniqueness (`ensures_uniqueness`).
  # A lock row existing means a job with that key exists somewhere in the
  # system; crash recovery is reaper-driven (heartbeat correlation), with
  # `expires_at` as a last-resort TTL only.
  class JobLock < Pgbus::ApplicationRecord
    self.table_name = "pgbus_job_locks"

    # States:
    # queued — lock held from enqueue time (:until_executed), no worker yet
    # executing — lock held by an active worker process
    STATES = %w[queued executing].freeze

    scope :executing, -> { where(state: "executing") }
    scope :queued_locks, -> { where(state: "queued") }
    scope :expired, ->(now = Time.current) { where("expires_at < ?", now) }

    # Atomically try to acquire a lock.
    # Cleans up expired locks for this key first (crash recovery at acquire time),
    # then relies on the unique index on lock_key for atomicity: `insert` with
    # `unique_by` returns no rows when the key is already taken.
    # Returns true if acquired, false if already locked.
    def self.acquire!(lock_key, job_class:, ttl:, job_id: nil, state: "queued", owner_pid: nil, owner_hostname: nil)
      # Remove any expired lock for this key inline (last-resort TTL recovery)
      where(lock_key: lock_key).where("expires_at < ?", Time.current).delete_all

      result = insert(
        {
          lock_key: lock_key, job_class: job_class, job_id: job_id,
          state: state, owner_pid: owner_pid, owner_hostname: owner_hostname,
          expires_at: Time.current + ttl
        },
        unique_by: :lock_key
      )
      # No returned rows => the ON CONFLICT path fired => someone else holds it.
      result.rows.any?
    rescue ActiveRecord::RecordNotUnique
      false
    end

    # Transition a queued lock to executing state and claim ownership.
    # Called when a worker starts executing a job that was locked at enqueue time.
    # Also refreshes expires_at so the last-resort TTL covers the execution phase.
    def self.claim_for_execution!(lock_key, owner_pid:, owner_hostname:, ttl:)
      where(lock_key: lock_key).update_all(
        state: "executing",
        owner_pid: owner_pid,
        owner_hostname: owner_hostname,
        expires_at: Time.current + ttl
      )
    end

    # Release a lock by key (row deletion IS the release).
    def self.release!(lock_key)
      where(lock_key: lock_key).delete_all
    end

    # Check if a lock is currently held (regardless of expiry — reaper handles orphans).
    def self.locked?(lock_key)
      where(lock_key: lock_key).exists?
    end

    # Reap orphaned locks by matching (pid, hostname) against live process
    # entries: a lock in 'executing' state is orphaned when no process row
    # with a fresh heartbeat has the same pid AND hostname.
    # Returns the number of orphaned locks released (0 when none).
    # NOTE(review): loads all executing locks into memory; assumes the
    # executing set stays small — confirm before very large deployments.
    def self.reap_orphaned!
      alive_workers = ProcessEntry
        .where("last_heartbeat_at >= ?", Time.current - Process::Heartbeat::ALIVE_THRESHOLD)
        .pluck(:pid, :hostname)

      orphaned = executing.select do |lock|
        alive_workers.none? { |pid, hostname| pid == lock.owner_pid && hostname == lock.owner_hostname }
      end

      return 0 if orphaned.empty?

      where(id: orphaned.map(&:id)).delete_all
    end

    # Last-resort cleanup: delete locks whose expires_at has passed.
    # This only fires when the reaper itself can't run (e.g., entire supervisor dead).
    def self.cleanup_expired!
      expired.delete_all
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Pgbus
  # Per-execution job metrics (class, queue, status, duration) powering the
  # insights dashboard. Queries use PostgreSQL-specific SQL (FILTER,
  # date_trunc), so this model is Postgres-only.
  class JobStat < Pgbus::ApplicationRecord
    self.table_name = "pgbus_job_stats"

    scope :since, ->(time) { where("created_at >= ?", time) }
    scope :successful, -> { where(status: "success") }
    scope :failed, -> { where(status: "failed") }
    scope :dead_lettered, -> { where(status: "dead_lettered") }

    # Record a job execution stat. Called by the executor after each job.
    # Best-effort by design: silently no-ops when the stats table is missing,
    # and never lets a stats failure break job processing (logs at debug only).
    def self.record!(job_class:, queue_name:, status:, duration_ms:)
      return unless table_exists?

      create!(
        job_class: job_class,
        queue_name: queue_name,
        status: status,
        duration_ms: duration_ms
      )
    rescue StandardError => e
      Pgbus.logger.debug { "[Pgbus] Failed to record job stat: #{e.message}" }
    end

    # Memoized — intentionally never invalidated at runtime. If the
    # pgbus_job_stats migration runs while the app is already running,
    # a restart is required for stat recording to begin.
    # Any connection error is treated as "table absent" (memoized false).
    def self.table_exists?
      return @table_exists if defined?(@table_exists)

      @table_exists = connection.table_exists?(table_name)
    rescue StandardError
      @table_exists = false
    end

    # Throughput: jobs per minute bucketed by minute for the last N minutes.
    # Returns a Hash of { minute_timestamp => count }, ordered by minute.
    def self.throughput(minutes: 60)
      since(minutes.minutes.ago)
        .group("date_trunc('minute', created_at)")
        .order(Arel.sql("date_trunc('minute', created_at)"))
        .count
    end

    # Average duration by job class, slowest first.
    # Returns a Hash of { job_class => avg_duration_ms }.
    def self.avg_duration_by_class(minutes: 60)
      since(minutes.minutes.ago)
        .group(:job_class)
        .order(Arel.sql("AVG(duration_ms) DESC"))
        .average(:duration_ms)
    end

    # Success/failed/dead_lettered counts as { status => count }.
    def self.status_counts(minutes: 60)
      since(minutes.minutes.ago).group(:status).count
    end

    # Top N slowest job classes by average duration.
    # Returns an Array of { job_class:, count:, avg_ms:, max_ms: } hashes.
    def self.slowest_classes(limit: 10, minutes: 60)
      since(minutes.minutes.ago)
        .group(:job_class)
        .order(Arel.sql("AVG(duration_ms) DESC"))
        .limit(limit)
        .pluck(:job_class, Arel.sql("COUNT(*)"), Arel.sql("ROUND(AVG(duration_ms))"), Arel.sql("MAX(duration_ms)"))
        .map { |cls, count, avg, max| { job_class: cls, count: count.to_i, avg_ms: avg.to_i, max_ms: max.to_i } }
    end

    # Single-query aggregate summary using conditional counts
    # (Postgres `COUNT(*) FILTER (WHERE ...)`).
    # Returns { total:, success:, failed:, dead_lettered:,
    #           avg_duration_ms:, max_duration_ms: } — zeros when no rows match.
    def self.summary(minutes: 60)
      row = since(minutes.minutes.ago).pick(
        Arel.sql("COUNT(*)"),
        Arel.sql("COUNT(*) FILTER (WHERE status = 'success')"),
        Arel.sql("COUNT(*) FILTER (WHERE status = 'failed')"),
        Arel.sql("COUNT(*) FILTER (WHERE status = 'dead_lettered')"),
        Arel.sql("ROUND(AVG(duration_ms)::numeric, 1)"),
        Arel.sql("MAX(duration_ms)")
      )

      {
        total: row[0].to_i,
        success: row[1].to_i,
        failed: row[2].to_i,
        dead_lettered: row[3].to_i,
        # AVG is NULL with no rows — coalesce to 0 for the dashboard.
        avg_duration_ms: row[4]&.to_f || 0,
        max_duration_ms: row[5].to_i
      }
    end

    # Delete stats older than the given timestamp (dispatcher retention sweep).
    def self.cleanup!(older_than:)
      where("created_at < ?", older_than).delete_all
    end
  end
end
|