npm - rollbridge - Versions diffs - 0.1.4 → 0.1.6 - Mend

rollbridge 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/README.md +137 -4
package/TODO.md +47 -45
package/docs/cli.md +169 -6
package/docs/config.md +160 -3
package/docs/logging.md +77 -0
package/docs/nginx.md +104 -0
package/docs/releasing.md +53 -0
package/docs/tensorbuzz-runbook.md +129 -0
package/docs/velocious.md +238 -0
package/docs/workers.md +115 -0
package/package.json +3 -2
package/src/cli.js +317 -1
package/src/config.js +240 -6
package/src/daemon.js +284 -4
package/src/doctor.js +177 -0
package/src/event-log.js +47 -0
package/src/managed-process.js +287 -22
package/src/process-memory.js +110 -0
package/src/recover.js +134 -0
package/src/release-group.js +80 -21
package/src/state-store.js +103 -0
package/src/system-ids.js +71 -0
package/src/template.js +32 -0
package/test/completion.test.js +64 -0
package/test/config-validation.test.js +267 -0
package/test/doctor.test.js +205 -3
package/test/event-log.test.js +46 -0
package/test/fixtures/memory-hog.js +19 -0
package/test/managed-process.test.js +376 -0
package/test/process-memory.test.js +40 -0
package/test/recover.test.js +162 -0
package/test/release-group.test.js +22 -0
package/test/rollbridge.test.js +716 -6
package/test/state-store.test.js +69 -0
package/test/system-ids.test.js +24 -0
package/scripts/release-patch.js +0 -83

package/docs/velocious.md ADDED Viewed

@@ -0,0 +1,238 @@
+# Velocious deployment guide
+A Velocious backend typically runs four kinds of process: **Beacon** (the
+message broker other processes connect to), **background-jobs-main** (the job
+coordinator), **background-jobs-worker** (runs the jobs), and the **web/API**
+server. This guide maps each to a Rollbridge process policy, shows a complete
+`rollbridge.js`, and explains startup ordering and what happens on a deploy.
+A production version of this config lives at
+[`examples/tensorbuzz.com.js`](../examples/tensorbuzz.com.js).
+## Process mapping
+| Velocious process | Policy | Why |
+| --- | --- | --- |
+| `beacon` | `service` | A shared broker the other processes connect to. It should survive deploys and keep a **stable port**, so workers and the web process always reach the same Beacon. |
+| `background-jobs-main` | `service` (or `singleton`) | The job coordinator. Run it as a `service` when it should outlive releases on a stable port; run it as a `singleton` when it must run the latest release's code after every deploy (see [Choosing the jobs-main policy](#choosing-the-jobs-main-policy)). |
+| `background-jobs-worker` | `companion` | Release-scoped: one set of workers per active release, started before the web process and running that release's code. |
+| `web` | `proxied` | Receives external HTTP/WebSocket traffic, is health-checked before traffic switches, and is drained on the next deploy. Exactly one process is `proxied`. |
+See [README → Process Policies](../README.md#process-policies) for the full
+semantics of each policy and [`docs/config.md`](config.md) for every field.
+## Example `rollbridge.js`
+```js
+// rollbridge.js
+export default {
+  application: "tensorbuzz",
+  control: {path: "/tmp/rollbridge-tensorbuzz.sock"},
+  proxy: {
+    host: "127.0.0.1",
+    port: 4500,          // the stable port Nginx points at
+    healthPath: "/ping",
+    healthTimeoutMs: 30000,
+    drainTimeoutMs: 60000,
+    forceStopTimeoutMs: 10000
+  },
+  processes: [
+    // Shared broker — one daemon-wide instance on a stable port.
+    {
+      id: "beacon",
+      policy: "service",
+      cwd: "{{releasePath}}/backend",
+      env: {NODE_ENV: "production", VELOCIOUS_BEACON_PORT: "{{port}}"},
+      command: "npx velocious beacon",
+      port: 7330
+    },
+    // Job coordinator — waits for Beacon, stable port other jobs processes use.
+    {
+      id: "background-jobs-main",
+      policy: "service",
+      cwd: "{{releasePath}}/backend",
+      env: {
+        NODE_ENV: "production",
+        VELOCIOUS_BEACON_PORT: "{{ports.beacon}}",
+        VELOCIOUS_BACKGROUND_JOBS_PORT: "{{port}}"
+      },
+      command: "wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- npx velocious background-jobs-main",
+      port: 7331
+    },
+    // Workers — one set per release; raise gracefulStopMs to let in-flight
+    // jobs finish during a deploy.
+    {
+      id: "background-jobs-worker",
+      policy: "companion",
+      cwd: "{{releasePath}}/backend",
+      env: {
+        NODE_ENV: "production",
+        VELOCIOUS_BEACON_PORT: "{{ports.beacon}}",
+        VELOCIOUS_BACKGROUND_JOBS_PORT: "{{ports.background-jobs-main}}"
+      },
+      command: "wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- wait-for-it 127.0.0.1:{{ports.background-jobs-main}} --strict -- npx velocious background-jobs-worker",
+      gracefulStopMs: 60000
+    },
+    // Web/API — the one proxied process.
+    {
+      id: "web",
+      policy: "proxied",
+      cwd: "{{releasePath}}/backend",
+      env: {
+        NODE_ENV: "production",
+        VELOCIOUS_BEACON_PORT: "{{ports.beacon}}",
+        VELOCIOUS_BACKGROUND_JOBS_PORT: "{{ports.background-jobs-main}}"
+      },
+      command: "wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- wait-for-it 127.0.0.1:{{ports.background-jobs-main}} --strict -- npx velocious server --host 127.0.0.1 --port {{port}}",
+      port: {from: 14500, to: 14599},
+      health: {path: "/ping", timeoutMs: 30000, intervalMs: 500}
+    }
+  ]
+}
+```
+## Wiring processes together
+Beacon and `background-jobs-main` get **fixed** ports (`7330`, `7331`) because
+they are `service`s — a stable port lets every release's workers and web process
+find them. The proxied `web` process gets a **range** (`{from: 14500, to:
+14599}`); Rollbridge allocates a free port per release so the old and new web
+releases can run side by side during the drain.
+Cross-reference ports with `{{ports.<id>}}` and pass them to Velocious through
+`env`. Rollbridge also injects `ROLLBRIDGE_<ID>_PORT` for every process (e.g.
+`ROLLBRIDGE_BACKGROUND_JOBS_MAIN_PORT`), so you can read ports from the
+environment instead of templating if you prefer — see
+[`docs/config.md`](config.md#injected-environment-variables).
+### Startup ordering
+Only the `proxied` process is health-checked, so dependent processes must wait
+for their dependencies themselves. Two mechanisms combine:
+1. **Policy ordering.** On each deploy Rollbridge starts `service`s first, then
+   the release's `companion`s, then the `proxied` process (see
+   [README → Deploy ordering](../README.md#deploy-ordering)).
+2. **Readiness gating.** `wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- …`
+   blocks the command until Beacon's port accepts connections, so
+   `background-jobs-main`, the worker, and `web` don't start talking to Beacon
+   before it is listening. The worker and `web` chain a second `wait-for-it` on
+   `{{ports.background-jobs-main}}` for the same reason. `wait-for-it` is a small
+   standalone script (install it on the host); any equivalent port-wait works.
+## Deploying
+Drive deploys through the Rollbridge CLI — Rollbridge ships no deploy-tool
+plugins (see [`docs/deploy-recipes.md`](deploy-recipes.md) for shell/CI/Capistrano
+recipes). The minimal step after a release directory is prepared:
+```bash
+release_path=/srv/tensorbuzz/releases/20260523120000  # prepared by your pipeline
+# Run backwards-compatible migrations BEFORE switching traffic: the old and new
+# web releases overlap during the drain.
+(cd "$release_path/backend" && npx velocious db:migrate)
+rollbridge deploy \
+  --ensure-daemon \
+  --config /etc/rollbridge/rollbridge.js \
+  --release-path "$release_path" \
+  --revision "$(git -C "$release_path/backend" rev-parse HEAD)"
+```
+`rollbridge deploy` starts the new release's worker and web process,
+health-checks `web` on its `{{port}}`/`/ping`, switches traffic, then drains and
+stops the previous release. It exits non-zero (leaving the previous release
+active) if the new release fails to start or health-check, so a failed deploy
+never promotes a broken release.
+## Background jobs across a deploy
+The worker is a `companion`, so each release runs its own workers:
+- On deploy, the **new** release's workers start (running the new code) before
+  traffic switches; the **old** release's workers are stopped when that release
+  is drained and retired — the worker's `stopSignal`, then `SIGKILL` after
+  `gracefulStopMs`.
+- Set `stopSignal` to the signal your worker drains on and `gracefulStopMs` to at
+  least your longest in-flight job, so a job gets time to finish before the
+  forced kill. Set `replicas` to run a pool of workers.
+See [`docs/workers.md`](workers.md) for the full safe background-job deployment
+pattern (companion + `replicas` + `stopSignal`/`lifecycle` hooks +
+`gracefulStopMs`), the old/new worker overlap, and `nonBlockingDrain` to start the
+old workers' drain immediately when a release is retired.
+### Worker recipe
+A complete `background-jobs-worker` entry that runs a pool and finishes in-flight
+jobs across a deploy:
+```js
+{
+  id: "background-jobs-worker",
+  policy: "companion",
+  cwd: "{{releasePath}}/backend",
+  env: {
+    NODE_ENV: "production",
+    VELOCIOUS_ENV: "production",
+    VELOCIOUS_BEACON_PORT: "{{ports.beacon}}",
+    VELOCIOUS_BACKGROUND_JOBS_PORT: "{{ports.background-jobs-main}}"
+  },
+  command: "wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- wait-for-it 127.0.0.1:{{ports.background-jobs-main}} --strict -- npx velocious background-jobs-worker",
+  replicas: 4,
+  gracefulStopMs: 60000
+}
+```
+- `replicas: 4` runs four worker instances (`background-jobs-worker#0` … `#3`).
+  Each receives `ROLLBRIDGE_REPLICA_INDEX`/`ROLLBRIDGE_REPLICA_COUNT`, which you
+  can use to shard work.
+- On deploy the new release's workers start before traffic switches; the old
+  release's workers receive `SIGTERM` (the default `stopSignal`) when the old
+  release is retired, then `SIGKILL` after `gracefulStopMs` — so size
+  `gracefulStopMs` to your longest job. Both releases' workers briefly consume the
+  shared queue, so keep job code backwards-compatible and jobs idempotent.
+If your worker quiesces on a command or a non-default signal, add a `lifecycle`
+block — Rollbridge runs `quietCommand`, drains for up to `drainTimeoutMs`, then
+stops. For example, send a quiet signal to the worker's process group before the
+drain:
+```js
+lifecycle: {quietCommand: "kill -TSTP -$ROLLBRIDGE_PID", drainTimeoutMs: 60000}
+```
+### Choosing the jobs-main policy
+`background-jobs-main` is duplicate-unsafe (you never want two coordinators), so
+it is either a `service` or a `singleton` — never a `companion`:
+- **`service`** — keeps running across deploys on its stable port. Workers from
+  every release talk to the same coordinator, so there's no coordination gap on
+  deploy. The trade-off: a `service` keeps running the **release it was started
+  from** and only adopts the latest release's template if it crashes and
+  restarts (or the daemon restarts). If `background-jobs-main` itself needs the
+  newest code immediately after every deploy, this is the wrong policy.
+- **`singleton`** — Rollbridge stops the old instance and then starts the new
+  one on each deploy, so it always runs the latest release's code and two copies
+  never overlap. The trade-off: a brief coordination gap while it restarts.
+Beacon is a broker rather than code that changes per release, so `service` is
+almost always right for it.
+## Verifying
+After a deploy, `rollbridge status` should show `beacon` and
+`background-jobs-main` as long-lived `service`s with unchanged ports across
+deploys, one `background-jobs-worker` for the active release, and the `web`
+process as `proxied`, along with its connection counts. Use
+[`rollbridge logs --process <id>`](cli.md) to read recent output from any
+process, and [`docs/troubleshooting.md`](troubleshooting.md) for health-check,
+port, and draining problems.
+For the front end, point Nginx at the stable `proxy.port` (here `4500`), never at
+a release's web port — see [`docs/nginx.md`](nginx.md).

package/docs/workers.md ADDED Viewed

@@ -0,0 +1,115 @@
+# Background-job worker deployment
+This guide covers deploying background-job workers (or any non-HTTP worker pool)
+with Rollbridge so that in-flight jobs finish across a deploy. It uses features
+that exist today, including the command-based `lifecycle` hooks and
+`nonBlockingDrain` covered in the final sections.
+## Run workers as a `companion`
+Give each worker the `companion` policy. Companions are **release-scoped**: every
+release starts its own workers running that release's code, and a release's
+workers are stopped only when that release is retired (drained) after a newer
+release takes over. They start **before** the `proxied` web process, so they're
+ready before traffic switches.
+```js
+{
+  id: "worker",
+  policy: "companion",
+  cwd: "{{releasePath}}",
+  command: "npx velocious background-jobs-worker"
+}
+```
+## Scale the pool with `replicas`
+Set `replicas` to run several identical workers (port-less companions only).
+Each instance runs as `worker#0`, `worker#1`, … and gets
+`ROLLBRIDGE_REPLICA_INDEX` / `ROLLBRIDGE_REPLICA_COUNT` (and `{{replicaIndex}}` /
+`{{replicaCount}}`), so an instance can claim a distinct shard, queue, or lock:
+```js
+{id: "worker", policy: "companion", command: "npx velocious background-jobs-worker", replicas: 4}
+```
+Restart the pool with `rollbridge restart --process worker` (all replicas) or a
+single instance with `rollbridge restart --process worker#0`.
+## Finish in-flight jobs on stop (`stopSignal` + `gracefulStopMs`)
+When Rollbridge stops a worker — during a deploy's drain, a `rollbridge restart`,
+or shutdown — it sends the worker's **`stopSignal`** (default `SIGTERM`), waits up
+to **`gracefulStopMs`**, then `SIGKILL`s it if it hasn't exited. That window is
+the worker's chance to finish its current job and exit cleanly.
+- Set `stopSignal` to the signal your worker quiets/drains on. Many job runners
+  finish the current job and exit on `SIGTERM` (the default); some use `SIGINT`
+  or `SIGQUIT`. Use the one your worker treats as "drain and exit".
+- Set `gracefulStopMs` to at least your longest job's duration, so a job in
+  progress is not cut off by the `SIGKILL` fallback.
+```js
+{
+  id: "worker",
+  policy: "companion",
+  command: "npx velocious background-jobs-worker",
+  replicas: 4,
+  stopSignal: "SIGTERM",
+  gracefulStopMs: 60000
+}
+```
+## What happens across a deploy
+1. The new release's workers start (running the **new** code) before traffic
+   switches to the new web process.
+2. Both old and new workers run while the previous release drains, so **both
+   code versions consume the shared queue at once.** Keep job code
+   backwards-compatible across a deploy — the same rule as database migrations.
+3. When the previous release is retired (its HTTP/WebSocket connections close or
+   `proxy.drainTimeoutMs` elapses), its workers are stopped: `stopSignal`, then
+   `SIGKILL` after `gracefulStopMs`.
+Because old workers are retired on the release's **connection** drain (not on
+their own job queue draining), a job still running when the release is retired
+gets only the `gracefulStopMs` window to finish. Keep jobs **idempotent and
+safe to retry** so a job interrupted at the `SIGKILL` fallback can run again.
+## Command-based lifecycle hooks
+For workers that quiesce or drain via a command rather than a single signal, set
+a `lifecycle` block. When Rollbridge gracefully stops the worker it runs
+`quietCommand` (stop accepting new work), then drains (`drainCommand`, or waits up
+to `drainTimeoutMs` for the worker to exit), then `stopCommand` or `stopSignal`,
+then `SIGKILL` after `gracefulStopMs`. Each hook gets `ROLLBRIDGE_PID` and is
+bounded by a timeout, so a slow hook can't wedge a deploy.
+```js
+{
+  id: "worker",
+  policy: "companion",
+  command: "npx velocious background-jobs-worker",
+  replicas: 4,
+  lifecycle: {quietCommand: "kill -TSTP -$ROLLBRIDGE_PID", drainTimeoutMs: 60000}
+}
+```
+See [`docs/config.md`](config.md#processeslifecycle) for the hook reference.
+## Non-blocking drain
+By default a retired release's workers are stopped only after the proxied
+process's connections have drained. Set `nonBlockingDrain: true` on a worker
+companion whose work is independent of the web process (a job worker on a shared
+queue) to start its graceful stop **immediately** when the release is retired —
+in parallel with the connection drain. The new release's workers handle new work
+while the old workers finish their in-flight jobs:
+```js
+{id: "worker", policy: "companion", command: "…", nonBlockingDrain: true, gracefulStopMs: 60000}
+```
+See [`docs/config.md`](config.md) for `stopSignal`, `replicas`, and
+`gracefulStopMs`, and [`docs/velocious.md`](velocious.md) for a full Velocious
+deployment (Beacon, jobs-main, workers, web) example.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "rollbridge",
-  "version": "0.1.4",
+  "version": "0.1.6",
   "description": "Zero-downtime process supervisor and local traffic switcher for deploy-managed apps.",
   "keywords": [
     "deploy",
@@ -28,7 +28,7 @@
   "scripts": {
     "all-checks": "npm run typecheck && npm run lint && npm test",
     "lint": "eslint",
-    "release:patch": "node scripts/release-patch.js",
+    "release:patch": "release-patch",
     "test": "node --test test/*.test.js",
     "typecheck": "tsc --noEmit"
   },
@@ -46,6 +46,7 @@
     "eslint": "^10.4.0",
     "eslint-plugin-jsdoc": "^62.9.0",
     "globals": "^17.6.0",
+    "release-patch": "^1.0.0",
     "typescript": "^6.0.3"
   }
 }