npm - rollbridge - Versions diffs - 0.1.4 → 0.1.5 - Mend

rollbridge 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +33 -0
package/TODO.md +5 -5
package/docs/cli.md +23 -0
package/docs/config.md +21 -1
package/docs/nginx.md +104 -0
package/docs/velocious.md +200 -0
package/package.json +3 -2
package/src/cli.js +27 -0
package/src/config.js +73 -2
package/src/daemon.js +77 -0
package/src/managed-process.js +65 -9
package/src/release-group.js +9 -0
package/test/config-validation.test.js +40 -0
package/test/managed-process.test.js +86 -0
package/test/rollbridge.test.js +194 -1
package/scripts/release-patch.js +0 -83

package/README.md CHANGED Viewed

@@ -91,6 +91,18 @@ after the process starts before the first health probe — like a readiness
 probe's initial delay, useful for apps with a known boot time. The delay runs
 before the `health.timeoutMs` window begins.
+Set a process's `restart` policy to control automatic restarts after a crash.
+`restart.maxRestarts` caps how many restarts are allowed within `restart.windowMs`
+before Rollbridge gives up and leaves the process `failed` (`maxRestarts: 0`
+disables restarts entirely), while `restart.backoffFactor` — with an optional
+`restart.maxDelayMs` cap — backs off the `restartDelayMs` delay on each successive
+restart. With no `restart` block, a crashed process keeps restarting after
+`restartDelayMs`, as before. See [`docs/config.md`](docs/config.md#processesrestart).
+```js
+restart: {maxRestarts: 5, windowMs: 60000, backoffFactor: 2, maxDelayMs: 30000}
+```
 Set `releaseRetention` to bound how many stopped (drained) releases the daemon
 keeps in memory and reports in `status`. `keep` (default `10`) retains the most
 recent stopped releases; `maxAgeMs` (default `0`, disabled) also prunes stopped
@@ -133,6 +145,10 @@ fails the process start with a clear error, so typos surface immediately.
 Production-ready examples live in `examples/`, including
 `examples/tensorbuzz.com.js` for the current TensorBuzz backend deployment.
+See [`docs/velocious.md`](docs/velocious.md) for a Velocious deployment guide —
+how Beacon, background-jobs-main, background-jobs-worker, and the web process map
+to Rollbridge policies, with startup ordering and deploy behavior.
 See [`docs/config.md`](docs/config.md) for the full config reference — every
 field, its default, validation rules, template variables, and the environment
 variables Rollbridge injects.
@@ -349,6 +365,15 @@ Stop the active release:
 rollbridge stop --config rollbridge.js
 ```
+Restart non-proxied processes in place — all of them, one by id, or a policy
+group (the proxied process is never restarted; use `deploy` for that):
+```bash
+rollbridge restart --config rollbridge.js                      # all non-proxied processes
+rollbridge restart --config rollbridge.js --process background-jobs-worker
+rollbridge restart --config rollbridge.js --policy companion
+```
 Shut down the daemon and managed processes:
 ```bash
@@ -371,6 +396,10 @@ location / {
 }
 ```
+See [`docs/nginx.md`](docs/nginx.md) for the full guide — WebSocket upgrade
+headers, timeouts for long-lived connections, forwarded headers, and common
+failure modes (502/503, dropped WebSockets).
 ## Running under systemd
 Run the long-lived daemon as a systemd service so it starts on boot and is
@@ -417,6 +446,10 @@ Maintainers can publish a patch release from the latest default branch:
 npm run release:patch
 ```
+The release script owns the package version bump, lockfile update, default-branch
+commit, push, and npm publish. Do not run `npm version` manually before running
+it.
 ## License
 Rollbridge is released under the [MIT License](LICENSE).

package/TODO.md CHANGED Viewed

@@ -26,9 +26,9 @@ This roadmap tracks planned Rollbridge features and documentation. Rollbridge sh
   - [ ] Restart memory-heavy workers gracefully when possible, with a forced stop timeout.
   - [ ] Add tests with a fixture process that allocates memory above the configured limit.
 - [ ] Worker auto-restart and restart policy controls.
-  - [ ] Add config for max restarts, restart window, exponential backoff, and disabled restart behavior.
+  - [x] Add config for max restarts, restart window, exponential backoff, and disabled restart behavior (per-process `restart` policy).
   - [ ] Distinguish crash restarts, deploy replacements, manual restarts, and memory restarts in status/events.
-  - [ ] Add a `restart` CLI command for a single process, a policy group, or all non-proxied workers.
+  - [x] Add a `restart` CLI command for a single process, a policy group, or all non-proxied workers.
   - [ ] Keep restart behavior safe for job workers by using lifecycle hooks before termination.
 - [ ] Graceful job-worker lifecycle.
   - [ ] Add generic lifecycle hooks such as `quietCommand`, `drainCommand`, `drainTimeoutMs`, and `stopCommand`.
@@ -81,7 +81,7 @@ This roadmap tracks planned Rollbridge features and documentation. Rollbridge sh
 - [x] Add npm package metadata such as repository, license, bugs, and homepage.
 - [x] Add systemd service examples for the Rollbridge daemon.
 - [x] Add tests for malformed control socket JSON and unknown control commands.
-- [ ] Add tests for duplicate IDs and singleton replacement failure behavior.
+- [x] Add tests for duplicate IDs and singleton replacement failure behavior.
 - [x] Add tests for proxy behavior when the active release exits unexpectedly.
 ## Documentation TODO
@@ -91,8 +91,8 @@ This roadmap tracks planned Rollbridge features and documentation. Rollbridge sh
 - [x] Expand process policy docs with deployment examples for `proxied`, `companion`, `singleton`, and `service`.
 - [ ] Document memory checks and auto-restart behavior after the feature lands.
 - [ ] Document worker lifecycle hooks and safe background-job deployment patterns after the feature lands.
-- [ ] Add a Velocious deployment guide with Beacon, background-jobs-main, background-jobs-worker, and web process examples.
-- [ ] Add an Nginx guide with WebSocket headers, timeouts, and common failure modes.
+- [x] Add a Velocious deployment guide with Beacon, background-jobs-main, background-jobs-worker, and web process examples (`docs/velocious.md`).
+- [x] Add an Nginx guide with WebSocket headers, timeouts, and common failure modes (`docs/nginx.md`).
 - [x] Add deploy-tool recipes that call Rollbridge CLI commands directly (`docs/deploy-recipes.md`).
 - [x] Add a Capistrano recipe showing shell commands only; do not add a Capistrano plugin or Rollbridge-specific Capistrano tasks (`docs/deploy-recipes.md`).
 - [ ] Add a TensorBuzz-specific runbook for current production ports, external services, deploy ordering, and rollback constraints.

package/docs/cli.md CHANGED Viewed

@@ -100,6 +100,29 @@ Stops the active release (or the release named by `--release-id`) and prints the
 updated status JSON. With no active release, the proxy answers `503` until the
 next deploy.
+## `restart`
+```
+rollbridge restart [--config <path>] [--process <id>] [--policy <policy>]
+```
+Restarts **non-proxied** processes and prints `{"restarted": [<ids>]}`. Like
+`systemctl restart`, a running process is bounced (stop, then start) and a
+crashed or stopped one is revived — so this is also how you bring back a process
+that exhausted its `restart` budget (see [`config.md`](config.md#processesrestart)).
+Selectors:
+- no selector — restart every non-proxied process (companions, singletons, and services);
+- `--process <id>` — restart only that process;
+- `--policy <companion|singleton|service>` — restart only processes with that policy.
+The proxied process is never restarted in place — that would drop traffic.
+Targeting it (by id or `--policy proxied`) is an error; use `rollbridge deploy`
+for a zero-downtime replacement. `--process <id>` with an id that is not a
+managed process (unknown, or a companion with no active release) is also an
+error. Restarting a `service` bounces a shared broker (for example Velocious
+Beacon), which briefly disrupts every process that depends on it.
 ## `shutdown`
 ```

package/docs/config.md CHANGED Viewed

@@ -68,9 +68,28 @@ release records; the deploy tool still owns on-disk release directories.
 | `port` | number or `{from, to}` | unset | Port (or range) allocated per release. **Required for the `proxied` process.** A plain number `n` means the fixed port `n` (`{from: n, to: n}`). |
 | `health` | object or `false` | enabled with defaults | Health check for the `proxied` process; set `false` to disable (see below). |
 | `gracefulStopMs` | number | `proxy.forceStopTimeoutMs` | `SIGTERM`→`SIGKILL` window for this process. |
-| `restartDelayMs` | number | `1000` | Delay before restarting this process after a crash. |
+| `restartDelayMs` | number | `1000` | Base delay before restarting this process after a crash (the backoff base; see `restart`). |
+| `restart` | object | unlimited restarts, constant delay | Automatic-restart policy: cap, rolling window, and backoff (see below). |
 | `outputLines` | positive integer | `50` | Recent stdout/stderr lines retained per process and reported by `status`/`logs`. |
+### `processes[].restart`
+Controls automatic restarts of a crashed process (a release's active processes
+and daemon-wide `service`s). The base delay is the process's `restartDelayMs`;
+when the policy's limit is reached the process is left `failed` and not
+restarted again.
+| Field | Type | Default | Description |
+| --- | --- | --- | --- |
+| `restart.maxRestarts` | non-negative integer | unset (unlimited) | Maximum automatic restarts allowed within `windowMs` before Rollbridge stops restarting the process. `0` disables automatic restarts entirely. |
+| `restart.windowMs` | non-negative number | `0` (process lifetime) | Rolling window over which `maxRestarts` is counted and after which the backoff resets. `0` counts over the process's whole lifetime. |
+| `restart.backoffFactor` | number ≥ 1 | `1` (constant) | Multiplier applied to `restartDelayMs` on each successive restart in the window: `delay = restartDelayMs × backoffFactor ^ n`. `1` keeps a constant delay. |
+| `restart.maxDelayMs` | non-negative number | `0` (no cap) | Upper bound on the backed-off delay. `0` means no cap. |
+With the defaults a crashed process restarts indefinitely after `restartDelayMs`.
+Pair `backoffFactor`/`windowMs` to back off and self-heal after a clean run, or
+set `maxRestarts` to give up on a process stuck in a crash loop.
 ### `processes[].health`
 Only the `proxied` process is health-checked (before traffic switches to a new
@@ -126,3 +145,4 @@ Rollbridge sets these in every managed process's environment (the process's own
 - `port` must be a positive port number or an ascending `{from, to}` range.
 - `control.mode` must be an octal mode between `0` and `0o777`.
 - `outputLines` must be a positive integer and `releaseRetention.keep` a non-negative integer; `health.startDelayMs` and `releaseRetention.maxAgeMs` must be non-negative numbers.
+- `restart.maxRestarts` must be a non-negative integer (omit it for unlimited restarts); `restart.backoffFactor` must be a number ≥ 1; `restart.windowMs` and `restart.maxDelayMs` must be non-negative numbers.

package/docs/nginx.md ADDED Viewed

@@ -0,0 +1,104 @@
+# Nginx guide
+Nginx should always proxy to the **stable Rollbridge proxy port**
+(`proxy.host:proxy.port`), never directly to a release process — release ports
+are allocated per deploy and change. Rollbridge forwards both HTTP and WebSocket
+traffic to the active release and drains old connections across deploys.
+## Server block
+```nginx
+# Maps the Upgrade header so WebSocket requests get "Connection: upgrade" and
+# normal requests (no Upgrade header) get "Connection: close".
+map $http_upgrade $connection_upgrade {
+  default upgrade;
+  ''      close;
+}
+server {
+  listen 443 ssl;
+  server_name app.example.com;
+  # ssl_certificate / ssl_certificate_key ...
+  location / {
+    proxy_pass http://127.0.0.1:8182;   # Rollbridge proxy.host:proxy.port
+    # WebSocket upgrade
+    proxy_http_version 1.1;
+    proxy_set_header Upgrade $http_upgrade;
+    proxy_set_header Connection $connection_upgrade;
+    # Pass the real client through to the app
+    proxy_set_header Host $host;
+    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_set_header X-Real-IP $remote_addr;
+    # Long-lived connections (WebSocket/SSE) — see "Timeouts" below
+    proxy_read_timeout 3600s;
+    proxy_send_timeout 3600s;
+  }
+}
+```
+The repository README shows a minimal version of this block; the additions here
+matter for production.
+## WebSocket headers
+Rollbridge's proxy has WebSocket support enabled, so the only requirement is that
+Nginx forwards the upgrade handshake:
+- `proxy_http_version 1.1` — WebSocket upgrades require HTTP/1.1 (the default is 1.0).
+- `proxy_set_header Upgrade $http_upgrade;` and `proxy_set_header Connection $connection_upgrade;` — forward the upgrade. Using the `map` above is preferred over a hard-coded `Connection "upgrade"`, so non-WebSocket requests aren't forced into an upgrade.
+If these are missing, WebSocket clients fail to connect (the handshake never
+completes) while plain HTTP still works.
+## Timeouts
+Nginx's `proxy_read_timeout`/`proxy_send_timeout` default to **60s**. An idle
+WebSocket (or a slow streaming response) is closed once that elapses, so
+long-lived connections silently drop after a minute unless you raise them — set
+them on the relevant `location` (or globally) to a value above your longest idle
+period.
+Related Rollbridge timeouts (configured in `rollbridge.js`, not Nginx):
+- `proxy.healthTimeoutMs` gates how long a new release has to become healthy
+  before a deploy aborts — it does not affect request timeouts.
+- `proxy.drainTimeoutMs` is how long Rollbridge keeps an old release alive for
+  in-flight connections during a deploy. Keep Nginx's `proxy_read_timeout` for
+  WebSocket locations comfortably above it so the front end doesn't cut
+  connections Rollbridge is still draining.
+## Forwarded headers
+Set `X-Forwarded-For`, `X-Forwarded-Proto`, and `Host` so the app behind
+Rollbridge sees the real client and scheme. Rollbridge proxies with
+`X-Forwarded-*` enabled, but it can only forward what Nginx provides — terminate
+TLS at Nginx and pass `X-Forwarded-Proto $scheme` so the app knows the original
+request was HTTPS.
+For Server-Sent Events or other streamed responses, also disable response
+buffering on that location so events flush immediately:
+```nginx
+location /events {
+  proxy_pass http://127.0.0.1:8182;
+  proxy_http_version 1.1;
+  proxy_buffering off;
+  proxy_read_timeout 3600s;
+}
+```
+## Common failure modes
+| Symptom | Cause | Fix |
+| --- | --- | --- |
+| `502 Bad Gateway` | Rollbridge can't reach the active release's process (it crashed or is restarting); Rollbridge returns `Bad gateway` and Nginx relays it. | Check `rollbridge status` / `rollbridge logs --process <id>` (see [troubleshooting.md](troubleshooting.md)). The process auto-restarts on its port. |
+| `503` / `No active release` | No release is active — before the first deploy, or after `rollbridge stop`. | Deploy a release (`rollbridge deploy`). |
+| WebSocket drops after ~60s | `proxy_read_timeout` left at the 60s default. | Raise `proxy_read_timeout`/`proxy_send_timeout` on the WebSocket location. |
+| WebSocket never connects (plain HTTP works) | Missing `proxy_http_version 1.1` or the `Upgrade`/`Connection` headers. | Add the WebSocket directives shown above. |
+| `504 Gateway Timeout` | A slow response exceeded `proxy_read_timeout`. | Raise the timeout, or speed up the endpoint. |
+| Connections cut mid-deploy | Nginx `proxy_read_timeout` shorter than `proxy.drainTimeoutMs`. | Raise the Nginx timeout above `proxy.drainTimeoutMs`. |

package/docs/velocious.md ADDED Viewed

@@ -0,0 +1,200 @@
+# Velocious deployment guide
+A Velocious backend typically runs four kinds of process: **Beacon** (the
+message broker other processes connect to), **background-jobs-main** (the job
+coordinator), **background-jobs-worker** (runs the jobs), and the **web/API**
+server. This guide maps each to a Rollbridge process policy, shows a complete
+`rollbridge.js`, and explains startup ordering and what happens on a deploy.
+A production version of this config lives at
+[`examples/tensorbuzz.com.js`](../examples/tensorbuzz.com.js).
+## Process mapping
+| Velocious process | Policy | Why |
+| --- | --- | --- |
+| `beacon` | `service` | A shared broker the other processes connect to. It should survive deploys and keep a **stable port**, so workers and the web process always reach the same Beacon. |
+| `background-jobs-main` | `service` (or `singleton`) | The job coordinator. Run it as a `service` when it should outlive releases on a stable port; run it as a `singleton` when it must run the latest release's code after every deploy (see [Choosing the jobs-main policy](#choosing-the-jobs-main-policy)). |
+| `background-jobs-worker` | `companion` | Release-scoped: one set of workers per active release, started before the web process and running that release's code. |
+| `web` | `proxied` | Receives external HTTP/WebSocket traffic, is health-checked before traffic switches, and is drained on the next deploy. Exactly one process is `proxied`. |
+See [README → Process Policies](../README.md#process-policies) for the full
+semantics of each policy and [`docs/config.md`](config.md) for every field.
+## Example `rollbridge.js`
+```js
+// rollbridge.js
+export default {
+  application: "tensorbuzz",
+  control: {path: "/tmp/rollbridge-tensorbuzz.sock"},
+  proxy: {
+    host: "127.0.0.1",
+    port: 4500,          // the stable port Nginx points at
+    healthPath: "/ping",
+    healthTimeoutMs: 30000,
+    drainTimeoutMs: 60000,
+    forceStopTimeoutMs: 10000
+  },
+  processes: [
+    // Shared broker — one daemon-wide instance on a stable port.
+    {
+      id: "beacon",
+      policy: "service",
+      cwd: "{{releasePath}}/backend",
+      env: {NODE_ENV: "production", VELOCIOUS_BEACON_PORT: "{{port}}"},
+      command: "npx velocious beacon",
+      port: 7330
+    },
+    // Job coordinator — waits for Beacon and listens on a stable port that the other jobs processes use.
+    {
+      id: "background-jobs-main",
+      policy: "service",
+      cwd: "{{releasePath}}/backend",
+      env: {
+        NODE_ENV: "production",
+        VELOCIOUS_BEACON_PORT: "{{ports.beacon}}",
+        VELOCIOUS_BACKGROUND_JOBS_PORT: "{{port}}"
+      },
+      command: "wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- npx velocious background-jobs-main",
+      port: 7331
+    },
+    // Workers — one set per release; raise gracefulStopMs to let in-flight
+    // jobs finish during a deploy.
+    {
+      id: "background-jobs-worker",
+      policy: "companion",
+      cwd: "{{releasePath}}/backend",
+      env: {
+        NODE_ENV: "production",
+        VELOCIOUS_BEACON_PORT: "{{ports.beacon}}",
+        VELOCIOUS_BACKGROUND_JOBS_PORT: "{{ports.background-jobs-main}}"
+      },
+      command: "wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- wait-for-it 127.0.0.1:{{ports.background-jobs-main}} --strict -- npx velocious background-jobs-worker",
+      gracefulStopMs: 60000
+    },
+    // Web/API — the one proxied process.
+    {
+      id: "web",
+      policy: "proxied",
+      cwd: "{{releasePath}}/backend",
+      env: {
+        NODE_ENV: "production",
+        VELOCIOUS_BEACON_PORT: "{{ports.beacon}}",
+        VELOCIOUS_BACKGROUND_JOBS_PORT: "{{ports.background-jobs-main}}"
+      },
+      command: "wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- wait-for-it 127.0.0.1:{{ports.background-jobs-main}} --strict -- npx velocious server --host 127.0.0.1 --port {{port}}",
+      port: {from: 14500, to: 14599},
+      health: {path: "/ping", timeoutMs: 30000, intervalMs: 500}
+    }
+  ]
+}
+```
+## Wiring processes together
+Beacon and `background-jobs-main` get **fixed** ports (`7330`, `7331`) because
+they are `service`s — a stable port lets every release's workers and web process
+find them. The proxied `web` process gets a **range** (`{from: 14500, to:
+14599}`); Rollbridge allocates a free port per release so the old and new web
+releases can run side by side during the drain.
+Cross-reference ports with `{{ports.<id>}}` and pass them to Velocious through
+`env`. Rollbridge also injects `ROLLBRIDGE_<ID>_PORT` for every process (e.g.
+`ROLLBRIDGE_BACKGROUND_JOBS_MAIN_PORT`), so you can read ports from the
+environment instead of templating if you prefer — see
+[`docs/config.md`](config.md#injected-environment-variables).
+### Startup ordering
+Only the `proxied` process is health-checked, so dependent processes must wait
+for their dependencies themselves. Two mechanisms combine:
+1. **Policy ordering.** On each deploy Rollbridge starts `service`s first, then
+   the release's `companion`s, then the `proxied` process (see
+   [README → Deploy ordering](../README.md#deploy-ordering)).
+2. **Readiness gating.** `wait-for-it 127.0.0.1:{{ports.beacon}} --strict -- …`
+   blocks the command until Beacon's port accepts connections, so
+   `background-jobs-main`, the worker, and `web` don't start talking to Beacon
+   before it is listening. `wait-for-it` is a small standalone script (install it
+   on the host); any equivalent port-wait works.
+## Deploying
+Drive deploys through the Rollbridge CLI — Rollbridge ships no deploy-tool
+plugins (see [`docs/deploy-recipes.md`](deploy-recipes.md) for shell/CI/Capistrano
+recipes). The minimal step after a release directory is prepared:
+```bash
+release_path=/srv/tensorbuzz/releases/20260523120000  # prepared by your pipeline
+# Run backwards-compatible migrations BEFORE switching traffic: the old and new
+# web releases overlap during the drain.
+(cd "$release_path/backend" && npx velocious db:migrate)
+rollbridge deploy \
+  --ensure-daemon \
+  --config /etc/rollbridge/rollbridge.js \
+  --release-path "$release_path" \
+  --revision "$(git -C "$release_path/backend" rev-parse HEAD)"
+```
+`rollbridge deploy` starts the new release's worker and web process,
+health-checks `web` on its `{{port}}`/`/ping`, switches traffic, then drains and
+stops the previous release. It exits non-zero (leaving the previous release
+active) if the new release fails to start or health-check, so a failed deploy
+never promotes a broken release.
+## Background jobs across a deploy
+The worker is a `companion`, so each release runs its own workers:
+- On deploy, the **new** release's workers start (running the new code) before
+  traffic switches; the **old** release's workers are stopped when that release
+  is drained and retired — `SIGTERM`, then `SIGKILL` after `gracefulStopMs`.
+- Set `gracefulStopMs` on the worker to at least your longest in-flight job so a
+  job gets time to finish on `SIGTERM` before the forced kill. The example uses
+  `60000` (60s).
+> **Planned:** graceful job-worker draining via lifecycle hooks
+> (`quietCommand`/`drainCommand`/`stopCommand` and a non-blocking drain mode so
+> new workers start while old workers finish) is on the
+> [roadmap](../TODO.md#major-features) and not yet implemented. Until then, the
+> `gracefulStopMs` window above is the mechanism for letting in-flight jobs
+> finish.
+### Choosing the jobs-main policy
+`background-jobs-main` is duplicate-unsafe (you never want two coordinators), so
+it is either a `service` or a `singleton` — never a `companion`:
+- **`service`** — keeps running across deploys on its stable port. Workers from
+  every release talk to the same coordinator, so there's no coordination gap on
+  deploy. The trade-off: a `service` keeps running the **release it was started
+  from** and only adopts the latest release's template if it crashes and
+  restarts (or the daemon restarts). If `background-jobs-main` itself needs the
+  newest code immediately after every deploy, this is the wrong policy.
+- **`singleton`** — Rollbridge stops the old instance and then starts the new
+  one on each deploy, so it always runs the latest release's code and two copies
+  never overlap. The trade-off: a brief coordination gap while it restarts.
+Beacon is a broker rather than code that changes per release, so `service` is
+almost always right for it.
+## Verifying
+After a deploy, `rollbridge status` should show `beacon` and
+`background-jobs-main` as long-lived `service`s with unchanged ports across
+deploys, one `background-jobs-worker` for the active release, and the `web`
+process `proxied` with its connection counts. Use
+[`rollbridge logs --process <id>`](cli.md) to read recent output from any
+process, and [`docs/troubleshooting.md`](troubleshooting.md) for health-check,
+port, and draining problems.
+For the front end, point Nginx at the stable `proxy.port` (here `4500`), never at
+a release's web port — see [`docs/nginx.md`](nginx.md).

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "rollbridge",
-  "version": "0.1.4",
+  "version": "0.1.5",
   "description": "Zero-downtime process supervisor and local traffic switcher for deploy-managed apps.",
   "keywords": [
     "deploy",
@@ -28,7 +28,7 @@
   "scripts": {
     "all-checks": "npm run typecheck && npm run lint && npm test",
     "lint": "eslint",
-    "release:patch": "node scripts/release-patch.js",
+    "release:patch": "release-patch",
     "test": "node --test test/*.test.js",
     "typecheck": "tsc --noEmit"
   },
@@ -46,6 +46,7 @@
     "eslint": "^10.4.0",
     "eslint-plugin-jsdoc": "^62.9.0",
     "globals": "^17.6.0",
+    "release-patch": "^1.0.0",
     "typescript": "^6.0.3"
   }
 }

package/src/cli.js CHANGED Viewed

@@ -136,6 +136,33 @@ export async function runCli(argv) {
       console.log(JSON.stringify(response, null, 2))
     })
+  program
+    .command("restart")
+    .description("Restart running non-proxied processes (by id, by policy, or all).")
+    .option("-c, --config <path>", "Config file path (defaults to rollbridge.js)")
+    .option("--process <id>", "Restart only the process with this id")
+    .option("--policy <policy>", "Restart only processes with this policy (companion, singleton, or service)")
+    .action(async (options) => {
+      if (options.policy !== undefined && !["companion", "service", "singleton"].includes(options.policy)) {
+        console.error("--policy must be one of: companion, singleton, service.")
+        process.exitCode = 1
+        return
+      }
+      const configPath = await resolveConfigPath(options.config)
+      const config = await loadConfig(configPath)
+      const response = await sendControlCommand({
+        command: {
+          command: "restart",
+          policy: options.policy,
+          processId: options.process
+        },
+        path: config.control.path
+      })
+      console.log(JSON.stringify(response, null, 2))
+    })
   program
     .command("shutdown")
     .option("-c, --config <path>", "Config file path (defaults to rollbridge.js)")

package/src/config.js CHANGED Viewed

@@ -9,7 +9,8 @@ import {pathToFileURL} from "node:url"
  * @typedef {{from: number, to: number}} PortRange
  * @typedef {{path: string, startDelayMs: number, timeoutMs: number, intervalMs: number}} HealthConfig
  * @typedef {"proxied" | "companion" | "singleton" | "service"} ProcessPolicy
- * @typedef {{cwd?: string, env: Record<string, string>, gracefulStopMs: number, health?: HealthConfig, id: string, outputLines: number, policy: ProcessPolicy, port?: PortRange, restartDelayMs: number, command: string}} ProcessConfig
+ * @typedef {{backoffFactor: number, maxDelayMs: number, maxRestarts: number | undefined, windowMs: number}} RestartConfig
+ * @typedef {{cwd?: string, env: Record<string, string>, gracefulStopMs: number, health?: HealthConfig, id: string, outputLines: number, policy: ProcessPolicy, port?: PortRange, restart: RestartConfig, restartDelayMs: number, command: string}} ProcessConfig
  * @typedef {{mode?: number, path: string}} ControlConfig
  * @typedef {{drainTimeoutMs: number, forceStopTimeoutMs: number, healthPath: string, healthTimeoutMs: number, host: string, port: number, upstreamHost: string}} ProxyConfig
  * @typedef {{keep: number, maxAgeMs: number}} ReleaseRetentionConfig
@@ -175,7 +176,7 @@ function normalizeProcess(value, index, proxy, issues) {
   if (!isPlainObject(value)) {
     issues.push({fix: `Define processes[${index}] as a mapping with id, policy, and command.`, message: `processes[${index}] must be an object`})
-    return {command: "", cwd: undefined, env: {}, gracefulStopMs: proxy.forceStopTimeoutMs, health: undefined, id: "", outputLines: 50, policy: "companion", port: undefined, restartDelayMs: 1000}
+    return {command: "", cwd: undefined, env: {}, gracefulStopMs: proxy.forceStopTimeoutMs, health: undefined, id: "", outputLines: 50, policy: "companion", port: undefined, restart: defaultRestartConfig(), restartDelayMs: 1000}
   }
   const source = value
@@ -190,10 +191,80 @@ function normalizeProcess(value, index, proxy, issues) {
     outputLines: normalizeOutputLines(source.outputLines, `processes[${index}].outputLines`, issues),
     policy: normalizePolicy(source.policy, `processes[${index}].policy`, issues),
     port: normalizePortRange(source.port, `processes[${index}].port`, issues),
+    restart: normalizeRestart(source.restart, `processes[${index}].restart`, issues),
     restartDelayMs: normalizeNumber(source.restartDelayMs, `processes[${index}].restartDelayMs`, issues, {default: 1000})
   }
 }
+/**
+ * Builds the restart policy used when a process has no usable `restart` block.
+ *
+ * A fresh object is returned on every call, so callers never share one instance.
+ * The values mean: `maxRestarts: undefined` (no cap), `windowMs: 0` (counted over
+ * the process lifetime), `backoffFactor: 1` (constant delay), and `maxDelayMs: 0`
+ * (no delay cap).
+ * @returns {RestartConfig} Default restart policy: unlimited restarts with a constant delay.
+ */
+function defaultRestartConfig() {
+  return {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}
+}
+/**
+ * Normalizes a process's raw `restart` policy into a complete RestartConfig.
+ *
+ * A missing (`undefined`/`null`) policy yields the defaults without recording an
+ * issue. A value that is not a plain object records an issue and also yields the
+ * defaults. Otherwise each field is normalized on its own; problems are pushed
+ * onto `issues` rather than thrown.
+ *
+ * `windowMs` and `maxDelayMs` are first read as numbers (defaulting to `0`) and
+ * then passed through `nonNegativeOrDefault` with a fallback of `0`.
+ * NOTE(review): the final `false` argument to `nonNegativeOrDefault` is defined
+ * outside this hunk — confirm its meaning against that helper.
+ * @param {JsonValue} value - Raw restart policy.
+ * @param {string} key - Config key, used as the prefix in issue messages (for example `processes[0].restart`).
+ * @param {ConfigIssue[]} issues - Issue collector; validation problems are appended here.
+ * @returns {RestartConfig} Normalized restart policy.
+ */
+function normalizeRestart(value, key, issues) {
+  if (value === undefined || value === null) return defaultRestartConfig()
+  if (!isPlainObject(value)) {
+    issues.push({fix: `Set ${key} to a mapping with maxRestarts, windowMs, backoffFactor, and maxDelayMs.`, message: `${key} must be an object`})
+    return defaultRestartConfig()
+  }
+  const windowMs = normalizeNumber(value.windowMs, `${key}.windowMs`, issues, {default: 0})
+  const maxDelayMs = normalizeNumber(value.maxDelayMs, `${key}.maxDelayMs`, issues, {default: 0})
+  return {
+    backoffFactor: normalizeBackoffFactor(value.backoffFactor, `${key}.backoffFactor`, issues),
+    maxDelayMs: nonNegativeOrDefault(maxDelayMs, `${key}.maxDelayMs`, issues, 0, false),
+    maxRestarts: normalizeMaxRestarts(value.maxRestarts, `${key}.maxRestarts`, issues),
+    windowMs: nonNegativeOrDefault(windowMs, `${key}.windowMs`, issues, 0, false)
+  }
+}
+/**
+ * Validates the `restart.maxRestarts` cap.
+ *
+ * `undefined` and `null` mean "no cap" and return `undefined` without an issue.
+ * Anything that is not a non-negative integer (a non-number, a fraction, a
+ * negative value) records an issue and also returns `undefined`, so an invalid
+ * cap falls back to unlimited restarts. `0` is valid and is returned as-is; it
+ * disables automatic restarts.
+ * @param {JsonValue} value - Raw maximum restart count.
+ * @param {string} key - Config key, used in the issue message.
+ * @param {ConfigIssue[]} issues - Issue collector; a validation problem is appended here.
+ * @returns {number | undefined} Restart cap, or undefined for unlimited restarts.
+ */
+function normalizeMaxRestarts(value, key, issues) {
+  if (value === undefined || value === null) return undefined
+  if (typeof value !== "number" || !Number.isInteger(value) || value < 0) {
+    issues.push({fix: `Set ${key} to a non-negative integer (0 disables automatic restarts), or omit it for unlimited restarts.`, message: `${key} must be a non-negative integer`})
+    return undefined
+  }
+  return value
+}
+/**
+ * Validates the `restart.backoffFactor` multiplier.
+ *
+ * The raw value is first read through `normalizeNumber` with a default of `1`.
+ * A resulting factor below `1` records an issue and is replaced by `1`, so the
+ * returned multiplier never shrinks the restart delay.
+ * @param {JsonValue} value - Raw backoff factor.
+ * @param {string} key - Config key, used in the issue message.
+ * @param {ConfigIssue[]} issues - Issue collector; a validation problem is appended here.
+ * @returns {number} Backoff multiplier (>= 1; 1 keeps a constant delay).
+ */
+function normalizeBackoffFactor(value, key, issues) {
+  const factor = normalizeNumber(value, key, issues, {default: 1})
+  if (factor < 1) {
+    issues.push({fix: `Set ${key} to a number >= 1 (1 keeps a constant delay; 2 doubles the delay each restart).`, message: `${key} must be a number greater than or equal to 1`})
+    return 1
+  }
+  return factor
+}
 /**
  * @param {JsonValue} value - Raw output retention value.
  * @param {string} key - Config key.

package/src/daemon.js CHANGED Viewed

@@ -233,6 +233,13 @@ export default class RollbridgeDaemon {
       return this.status()
     }
+    if (commandName === "restart") {
+      return await this.restartProcesses({
+        policy: stringOrUndefined(data.policy),
+        processId: stringOrUndefined(data.processId)
+      })
+    }
     if (commandName === "shutdown") {
       setImmediate(() => {
         this.shutdown().catch((error) => {
@@ -365,6 +372,7 @@ export default class RollbridgeDaemon {
         env: nextDefinition.env,
         logger: nextDefinition.logger,
         outputLines: nextDefinition.outputLines,
+        restart: nextDefinition.restart,
         restartDelayMs: nextDefinition.restartDelayMs,
         shouldRestart: nextDefinition.shouldRestart,
         stopTimeoutMs: nextDefinition.stopTimeoutMs
@@ -394,6 +402,75 @@ export default class RollbridgeDaemon {
     }
   }
+  /**
+   * Restarts the non-proxied processes picked by the selector, one after another.
+   *
+   * Every target is stopped and then started again, so a running process is bounced and a
+   * crashed or stopped one is brought back up. With neither `processId` nor `policy` given,
+   * all non-proxied processes are targeted.
+   *
+   * Selecting the proxied process (by id or by policy) is rejected: restarting it in place
+   * would drop traffic, and a deploy is the zero-downtime way to replace it.
+   * @param {{policy?: string, processId?: string}} selector - Restart selector; restarts all non-proxied processes when both are omitted.
+   * @returns {Promise<Record<string, JsonValue>>} The ids that were restarted.
+   */
+  async restartProcesses({policy, processId} = {}) {
+    const selectsById = processId !== undefined
+    const selectsProxied = policy === "proxied" || (selectsById && this.isProxiedId(processId))
+    if (selectsProxied) {
+      throw new Error('The proxied process cannot be restarted in place; use "rollbridge deploy" for a zero-downtime replacement.')
+    }
+    const targets = this.collectRestartTargets({policy, processId})
+    // An explicit id that matches nothing is an error; an empty policy match is not.
+    if (selectsById && targets.length === 0) {
+      throw new Error(`No managed process with id "${processId}" to restart.`)
+    }
+    const restarted = /** @type {string[]} */ ([])
+    for (const {id, process: managed} of targets) {
+      this.logger("process restart requested", {processId: id})
+      await managed.stop()
+      await managed.start()
+      restarted.push(id)
+    }
+    return {restarted}
+  }
+  /**
+   * Resolves a restart selector to the managed process instances it covers.
+   *
+   * Configured processes are taken in config order. Proxied processes are always left out,
+   * as are processes that fail the id or policy filter and processes for which
+   * `findProcessInstance` returns nothing. The instance's current state is not checked here.
+   * @param {{policy?: string, processId?: string}} selector - Restart selector.
+   * @returns {{id: string, process: import("./managed-process.js").default}[]} Non-proxied process instances matching the selector.
+   */
+  collectRestartTargets({policy, processId}) {
+    return this.config.processes.flatMap((candidate) => {
+      const excluded = candidate.policy === "proxied" ||
+        (processId !== undefined && candidate.id !== processId) ||
+        (policy !== undefined && candidate.policy !== policy)
+      if (excluded) return []
+      const instance = this.findProcessInstance(candidate)
+      return instance ? [{id: candidate.id, process: instance}] : []
+    })
+  }
+  /**
+   * Looks up the managed instance for a process definition in the registry that matches its
+   * policy: `services` for "service", `singletons` for "singleton", and the active release
+   * for every other policy.
+   * @param {import("./config.js").ProcessConfig} processConfig - Process definition.
+   * @returns {import("./managed-process.js").default | undefined} The instance, or undefined when none is registered (or there is no active release).
+   */
+  findProcessInstance(processConfig) {
+    const {id, policy} = processConfig
+    switch (policy) {
+      case "service":
+        return this.services.get(id)
+      case "singleton":
+        return this.singletons.get(id)
+      default:
+        if (!this.activeRelease) return undefined
+        return this.activeRelease.getProcess(id)
+    }
+  }
+  /**
+   * Checks the configured process list for a proxied process with the given id.
+   * @param {string} id - Process id.
+   * @returns {boolean} True when the id belongs to the proxied process.
+   */
+  isProxiedId(id) {
+    for (const {id: configuredId, policy} of this.config.processes) {
+      if (configuredId === id && policy === "proxied") return true
+    }
+    return false
+  }
   /**
    * @param {string | undefined} releaseId - Release id, or active release when omitted.
    * @returns {Promise<void>} Resolves when stopped.

package/src/managed-process.js CHANGED Viewed

@@ -8,7 +8,7 @@ import {spawn} from "node:child_process"
  * @typedef {"starting" | "running" | "stopping" | "stopped" | "failed"} ManagedProcessState
  * @typedef {import("node:child_process").ChildProcess["signalCode"]} ProcessExitSignal
  * @typedef {{at: string, line: string, stream: "stdout" | "stderr"}} ManagedProcessLog
- * @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, outputLines: number, restartDelayMs: number, shouldRestart: () => boolean, stopTimeoutMs: number}} ManagedProcessDefinition
+ * @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopTimeoutMs: number}} ManagedProcessDefinition
  * @typedef {{command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, logs: ManagedProcessLog[], pid: number | undefined, restarts: number, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
  */
@@ -21,11 +21,12 @@ export default class ManagedProcess extends EventEmitter {
    * @param {string} args.id - Process id.
    * @param {(message: string, data?: Record<string, JsonValue>) => void} args.logger - Logger callback.
    * @param {number} args.outputLines - Recent stdout/stderr lines to retain and report.
+   * @param {import("./config.js").RestartConfig} [args.restart] - Restart policy (defaults to unlimited restarts with a constant delay).
    * @param {number} args.restartDelayMs - Restart delay.
    * @param {() => boolean} args.shouldRestart - Restart policy callback.
    * @param {number} args.stopTimeoutMs - Stop timeout.
    */
-  constructor({command, cwd, env, id, logger, outputLines, restartDelayMs, shouldRestart, stopTimeoutMs}) {
+  constructor({command, cwd, env, id, logger, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopTimeoutMs}) {
     super()
     this.command = command
@@ -34,12 +35,14 @@ export default class ManagedProcess extends EventEmitter {
     this.id = id
     this.logger = logger
     this.outputLines = outputLines
+    this.restart = restart
     this.restartDelayMs = restartDelayMs
     this.shouldRestart = shouldRestart
     this.stopTimeoutMs = stopTimeoutMs
     this.state = /** @type {ManagedProcessState} */ ("stopped")
     this.logs = /** @type {ManagedProcessLog[]} */ ([])
     this.restarts = 0
+    this.recentRestarts = /** @type {number[]} */ ([])
     this.startedAtMs = /** @type {number | undefined} */ (undefined)
     this.intentionalStop = false
     this.restartTimer = undefined
@@ -106,6 +109,7 @@ export default class ManagedProcess extends EventEmitter {
     this.env = definition.env
     this.logger = definition.logger
     this.outputLines = definition.outputLines
+    this.restart = definition.restart
     this.restartDelayMs = definition.restartDelayMs
     this.shouldRestart = definition.shouldRestart
     this.stopTimeoutMs = definition.stopTimeoutMs
@@ -146,14 +150,66 @@ export default class ManagedProcess extends EventEmitter {
     this.emit("exit", {code, signal})
     if (!wasIntentional && this.shouldRestart()) {
-      this.restartTimer = setTimeout(() => {
-        this.restartTimer = undefined
-        this.restarts += 1
-        this.start().catch((error) => {
-          this.logger("process restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
-        })
-      }, this.restartDelayMs)
+      this.scheduleRestart()
+    }
+  }
+  /**
+   * Decides whether a crashed process gets another automatic restart and, if so, queues it.
+   *
+   * Restarts are counted in `recentRestarts` (one timestamp per scheduled restart). When
+   * `windowMs` is positive, timestamps older than the window are dropped before counting.
+   * Once the count reaches `maxRestarts`, the limit is logged and nothing is queued.
+   * @returns {void}
+   */
+  scheduleRestart() {
+    const policy = this.restart
+    const unlimited = policy.maxRestarts === undefined
+    // No cap and no backoff: every restart uses the attempt-0 delay, so nothing has to be
+    // counted. restartDelayFor(0) is still used so that a maxDelayMs cap is applied.
+    if (unlimited && policy.backoffFactor === 1) {
+      const constantDelay = this.restartDelayFor(0)
+      this.queueRestart(constantDelay)
+      return
+    }
+    const scheduledAt = Date.now()
+    if (policy.windowMs > 0) {
+      const windowStart = scheduledAt - policy.windowMs
+      this.recentRestarts = this.recentRestarts.filter((restartedAt) => restartedAt > windowStart)
     }
+    const counted = this.recentRestarts.length
+    if (!unlimited && counted >= policy.maxRestarts) {
+      this.logger("restart limit reached", {id: this.id, maxRestarts: policy.maxRestarts, windowMs: policy.windowMs})
+      return
+    }
+    // The delay is derived from the count before this restart is recorded.
+    const delay = this.restartDelayFor(counted)
+    this.recentRestarts.push(scheduledAt)
+    this.queueRestart(delay)
+  }
+  /**
+   * Computes the delay for the next restart: `restartDelayMs` multiplied by
+   * `backoffFactor` raised to `attempt`. A positive `maxDelayMs` caps the result; 0 means no cap.
+   * @param {number} attempt - Number of restarts already counted in the current window.
+   * @returns {number} Backed-off restart delay in milliseconds, capped by maxDelayMs when set.
+   */
+  restartDelayFor(attempt) {
+    const {backoffFactor, maxDelayMs} = this.restart
+    const uncapped = this.restartDelayMs * backoffFactor ** attempt
+    if (maxDelayMs > 0) return Math.min(uncapped, maxDelayMs)
+    return uncapped
+  }
+  /**
+   * Arms the restart timer. When it fires, the timer handle is cleared, the restart counter
+   * is incremented and the process is started again; a failed start is logged, not thrown.
+   *
+   * Node.js timers treat any delay above 2147483647 ms (2 ** 31 - 1) as 1 ms. An uncapped
+   * exponential backoff that grew past that ceiling would therefore collapse into an
+   * immediate restart loop, so the delay is clamped to the largest value the timer honors.
+   * @param {number} delayMs - Delay before the restart attempt.
+   * @returns {void}
+   */
+  queueRestart(delayMs) {
+    // Largest delay setTimeout honors; anything above it is treated as 1 ms.
+    const maxTimerDelayMs = 2 ** 31 - 1
+    const timerDelayMs = Math.min(delayMs, maxTimerDelayMs)
+    this.restartTimer = setTimeout(() => {
+      this.restartTimer = undefined
+      this.restarts += 1
+      this.start().catch((error) => {
+        this.logger("process restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
+      })
+    }, timerDelayMs)
   }
   /**

package/src/release-group.js CHANGED Viewed

@@ -80,6 +80,14 @@ export default class ReleaseGroup extends EventEmitter {
     }
   }
+  /**
+   * Looks up one of this release's managed processes by its id.
+   * @param {string} id - Process id.
+   * @returns {ManagedProcess | undefined} This release's managed process with the given id, if present.
+   */
+  getProcess(id) {
+    const managed = this.processes.get(id)
+    return managed
+  }
   /**
    * Logs process diagnostics before failed startup cleanup stops and removes the release processes.
    * @param {Error | string} error - Startup failure.
@@ -170,6 +178,7 @@ export default class ReleaseGroup extends EventEmitter {
       id: processConfig.id,
       logger: (message, data = {}) => this.logger(message, {processId: processConfig.id, releaseId: this.releaseId, ...data}),
       outputLines: processConfig.outputLines,
+      restart: processConfig.restart,
       restartDelayMs: processConfig.restartDelayMs,
       shouldRestart: options.shouldRestart || (() => this.state === "active" || this.state === "starting"),
       stopTimeoutMs: processConfig.gracefulStopMs

package/test/config-validation.test.js CHANGED Viewed

@@ -86,6 +86,46 @@ test("validateConfig defaults outputLines and accepts a positive override", () =
   assert.equal(config.processes[1].outputLines, 5)
 })
+test("validateConfig defaults the restart policy, accepts overrides, and rejects bad values", () => {
+  /**
+   * Validates a minimal one-process config in which only the `restart` block varies.
+   * @param {import("../src/json.js").JsonValue} restart - Restart policy under test, or undefined to omit it.
+   * @returns {{config: import("../src/config.js").RollbridgeConfig, issues: import("../src/config.js").ConfigIssue[]}} Validation result.
+   */
+  const validateRestart = (restart) => validateConfig({
+    application: "demo",
+    control: {path: "/tmp/demo.sock"},
+    processes: [{command: "run web", id: "web", policy: "proxied", port: {from: 18000, to: 18099}, restart}],
+    proxy: {host: "127.0.0.1", port: 8182}
+  })
+  // With no restart block the policy falls back to its defaults: no restart cap
+  // (maxRestarts undefined), a backoff factor of 1, and no window or delay cap (both 0).
+  const defaulted = validateRestart(undefined)
+  assert.deepEqual(defaulted.issues, [])
+  assert.deepEqual(defaulted.config.processes[0].restart, {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0})
+  // A fully specified, valid policy is accepted without issues and kept as given.
+  const custom = validateRestart({backoffFactor: 2, maxDelayMs: 30000, maxRestarts: 5, windowMs: 60000})
+  assert.deepEqual(custom.issues, [])
+  assert.deepEqual(custom.config.processes[0].restart, {backoffFactor: 2, maxDelayMs: 30000, maxRestarts: 5, windowMs: 60000})
+  // maxRestarts: 0 disables automatic restarts.
+  const disabled = validateRestart({maxRestarts: 0})
+  assert.deepEqual(disabled.issues, [])
+  assert.equal(disabled.config.processes[0].restart.maxRestarts, 0)
+  // Every out-of-range field is reported with its own message.
+  const invalid = validateRestart({backoffFactor: 0.5, maxDelayMs: -1, maxRestarts: -2, windowMs: -3})
+  const messages = invalid.issues.map((issue) => issue.message)
+  assert.ok(messages.includes("processes[0].restart.backoffFactor must be a number greater than or equal to 1"), JSON.stringify(messages))
+  assert.ok(messages.includes("processes[0].restart.maxRestarts must be a non-negative integer"), JSON.stringify(messages))
+  assert.ok(messages.includes("processes[0].restart.maxDelayMs must be a non-negative number"), JSON.stringify(messages))
+  assert.ok(messages.includes("processes[0].restart.windowMs must be a non-negative number"), JSON.stringify(messages))
+  // A fractional maxRestarts is rejected (it must be a whole number of restarts).
+  assert.ok(validateRestart({maxRestarts: 1.5}).issues.some((issue) => issue.message === "processes[0].restart.maxRestarts must be a non-negative integer"))
+})
 test("validateConfig rejects a non-positive-integer outputLines with a fix", () => {
   const {issues} = validateConfig({
     application: "demo",

package/test/managed-process.test.js CHANGED Viewed

@@ -104,3 +104,89 @@ test("counts automatic restarts and reports startedAt and uptime while running",
     await managed.stop()
   }
 })
+/**
+ * Creates (without starting) a managed process that runs the crasher fixture under the
+ * given restart policy. The restart delay is 10 ms and `shouldRestart` always returns
+ * true, so whether a restart happens is decided by the policy alone.
+ * @param {import("../src/config.js").RestartConfig} restart - Restart policy.
+ * @returns {ManagedProcess} Managed process.
+ */
+function buildCrasher(restart) {
+  const crasherCommand = [process.execPath, crasherPath].map((part) => JSON.stringify(part)).join(" ")
+  const definition = {
+    command: crasherCommand,
+    cwd: undefined,
+    env: {},
+    id: "crasher",
+    logger: () => {},
+    outputLines: 50,
+    restart,
+    restartDelayMs: 10,
+    shouldRestart: () => true,
+    stopTimeoutMs: 500
+  }
+  return new ManagedProcess(definition)
+}
+test("does not auto-restart when the restart policy is disabled (maxRestarts: 0)", async () => {
+  const managed = buildCrasher({backoffFactor: 1, maxDelayMs: 0, maxRestarts: 0, windowMs: 0})
+  try {
+    await managed.start()
+    // The fixture exits ~40ms after start; with restarts disabled it should stay failed.
+    await waitFor(() => managed.status().state === "failed")
+    // Wait well past the 10 ms restart delay used by buildCrasher, so a wrongly scheduled
+    // restart would have fired before the assertions run.
+    await new Promise((resolve) => setTimeout(resolve, 100))
+    assert.equal(managed.status().restarts, 0)
+    assert.equal(managed.status().state, "failed")
+  } finally {
+    await managed.stop()
+  }
+})
+test("stops auto-restarting once maxRestarts within the window is reached", async () => {
+  // The 60 s window is far longer than the test runs, so every restart counts toward the cap.
+  const managed = buildCrasher({backoffFactor: 1, maxDelayMs: 0, maxRestarts: 2, windowMs: 60000})
+  try {
+    await managed.start()
+    // It restarts at most twice within the window, then gives up and stays failed.
+    await waitFor(() => managed.status().restarts === 2 && managed.status().state === "failed")
+    // Leave time for an unwanted third restart to show up before checking the counters.
+    await new Promise((resolve) => setTimeout(resolve, 100))
+    assert.equal(managed.status().restarts, 2)
+    assert.equal(managed.status().state, "failed")
+  } finally {
+    await managed.stop()
+  }
+})
+test("applies exponential backoff to restart delays, capped by maxDelayMs", () => {
+  // The processes are never started; restartDelayFor is called directly on the instances.
+  const capped = buildCrasher({backoffFactor: 2, maxDelayMs: 500, maxRestarts: undefined, windowMs: 0})
+  // restartDelayMs (10) * 2 ** attempt, capped at 500.
+  assert.equal(capped.restartDelayFor(0), 10)
+  assert.equal(capped.restartDelayFor(1), 20)
+  assert.equal(capped.restartDelayFor(2), 40)
+  assert.equal(capped.restartDelayFor(6), 500) // 10 * 64 = 640, capped to 500
+  assert.equal(capped.restartDelayFor(7), 500)
+  // maxDelayMs: 0 means no cap.
+  const uncapped = buildCrasher({backoffFactor: 3, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0})
+  assert.equal(uncapped.restartDelayFor(0), 10)
+  assert.equal(uncapped.restartDelayFor(2), 90)
+})
+test("the unlimited constant-delay fast path still applies maxDelayMs", () => {
+  // restartDelayMs (10) above maxDelayMs (5), with no backoff and unlimited restarts.
+  const managed = buildCrasher({backoffFactor: 1, maxDelayMs: 5, maxRestarts: undefined, windowMs: 0})
+  assert.equal(managed.restartDelayFor(0), 5)
+  /** @type {number | undefined} */
+  let queued
+  // Replace queueRestart with a stub that records the requested delay instead of arming a timer.
+  managed.queueRestart = (delayMs) => { queued = delayMs }
+  managed.scheduleRestart()
+  assert.equal(queued, 5)
+})

package/test/rollbridge.test.js CHANGED Viewed

@@ -141,6 +141,38 @@ test("singleton processes restart without overlap during deploy", async () => {
   }
 })
+test("a failed singleton replacement surfaces the error after stopping the old singleton", async () => {
+  // The singleton's working directory is per-release; only the v1 directory exists, so
+  // the v2 replacement cannot spawn (ENOENT on cwd) and its start() rejects.
+  const fixture = await createFixture({includeSingleton: true, singletonCwd: "{{releasePath}}/{{releaseId}}"})
+  const daemon = await startDaemon(fixture.config)
+  // NOTE(review): this assumes the cwd template above expands to <fixture.root>/<releaseId>;
+  // confirm against the config templating.
+  await fs.mkdir(path.join(fixture.root, "v1"))
+  try {
+    await daemon.deploy({releaseId: "v1", releasePath: fixture.root, revision: "v1"})
+    await waitFor(async () => (await processEvents(fixture.singletonLogPath)).some((event) => event.event === "start" && event.releaseId === "v1"))
+    // The new release's singleton fails to start, so the deploy surfaces the error.
+    await assert.rejects(() => daemon.deploy({releaseId: "v2", releasePath: fixture.root, revision: "v2"}))
+    // The old singleton is stopped before the new one is started, so two copies never
+    // overlap — even when the replacement then fails.
+    await waitFor(async () => (await processEvents(fixture.singletonLogPath)).some((event) => event.event === "stop" && event.releaseId === "v1"))
+    const status = daemon.status()
+    // Traffic switches before singletons are replaced, so the new release is already active,
+    // but its singleton is left failed with no replacement running.
+    assert.equal(status.activeReleaseId, "v2")
+    assert.equal(status.singletons.length, 1)
+    assert.equal(status.singletons[0].process.state, "failed")
+  } finally {
+    // Shut down the daemon and delete the fixture directory whatever the outcome.
+    await daemon.shutdown()
+    await fs.rm(fixture.root, {force: true, recursive: true})
+  }
+})
 test("service processes start before releases and restart with the latest deploy template", async () => {
   const fixture = await createFixture({includeService: true, webDependsOnService: true})
   const daemon = await startDaemon(fixture.config)
@@ -173,6 +205,137 @@ test("service processes start before releases and restart with the latest deploy
   }
 })
+test("restart bounces a single process by id", async () => {
+  const fixture = await createFixture({includeService: true})
+  const daemon = await startDaemon(fixture.config)
+  try {
+    await daemon.deploy({releaseId: "v1", releasePath: fixture.root, revision: "v1"})
+    const before = pidsById(daemon.status())
+    const result = await daemon.restartProcesses({processId: "beacon"})
+    assert.deepEqual(result.restarted, ["beacon"])
+    const after = pidsById(daemon.status())
+    // A changed pid shows that the old process was stopped and a new one was spawned.
+    assert.ok(before.beacon && after.beacon, "beacon should have a pid before and after")
+    assert.notEqual(after.beacon, before.beacon)
+  } finally {
+    await daemon.shutdown()
+    await fs.rm(fixture.root, {force: true, recursive: true})
+  }
+})
+test("restart with no selector bounces every non-proxied process but not the proxied one", async () => {
+  const fixture = await createFixture({includeCompanion: true, includeService: true, includeSingleton: true})
+  const daemon = await startDaemon(fixture.config)
+  try {
+    await daemon.deploy({releaseId: "v1", releasePath: fixture.root, revision: "v1"})
+    const before = pidsById(daemon.status())
+    const result = await daemon.restartProcesses()
+    const restarted = /** @type {string[]} */ (result.restarted)
+    // Compare a sorted copy so the assertion does not depend on the restart order.
+    assert.deepEqual([...restarted].sort(), ["beacon", "jobs-main", "worker"])
+    const after = pidsById(daemon.status())
+    // The proxied process keeps its pid; every other process gets a new one.
+    assert.equal(after.web, before.web, "proxied process should not be restarted")
+    assert.notEqual(after.beacon, before.beacon)
+    assert.notEqual(after["jobs-main"], before["jobs-main"])
+    assert.notEqual(after.worker, before.worker)
+  } finally {
+    await daemon.shutdown()
+    await fs.rm(fixture.root, {force: true, recursive: true})
+  }
+})
+test("restart --policy targets only processes with that policy", async () => {
+  const fixture = await createFixture({includeCompanion: true, includeService: true})
+  const daemon = await startDaemon(fixture.config)
+  try {
+    await daemon.deploy({releaseId: "v1", releasePath: fixture.root, revision: "v1"})
+    const before = pidsById(daemon.status())
+    const result = await daemon.restartProcesses({policy: "companion"})
+    assert.deepEqual(result.restarted, ["worker"])
+    const after = pidsById(daemon.status())
+    // Only the companion ("worker") gets a new pid; the "beacon" service keeps its pid.
+    assert.notEqual(after.worker, before.worker)
+    assert.equal(after.beacon, before.beacon, "the service should be left running")
+  } finally {
+    await daemon.shutdown()
+    await fs.rm(fixture.root, {force: true, recursive: true})
+  }
+})
+test("restart refuses the proxied process and reports unknown ids", async () => {
+  const fixture = await createFixture()
+  const daemon = await startDaemon(fixture.config)
+  try {
+    await daemon.deploy({releaseId: "v1", releasePath: fixture.root, revision: "v1"})
+    // Selecting the proxied process is refused both by id ("web") and by policy.
+    await assert.rejects(() => daemon.restartProcesses({processId: "web"}), /proxied process cannot be restarted/)
+    await assert.rejects(() => daemon.restartProcesses({policy: "proxied"}), /proxied process cannot be restarted/)
+    // An id that matches no managed process is reported as an error rather than ignored.
+    await assert.rejects(() => daemon.restartProcesses({processId: "missing"}), /No managed process with id "missing"/)
+  } finally {
+    await daemon.shutdown()
+    await fs.rm(fixture.root, {force: true, recursive: true})
+  }
+})
+test("restart revives a stopped process instead of erroring", async () => {
+  const fixture = await createFixture({includeCompanion: true})
+  const daemon = await startDaemon(fixture.config)
+  try {
+    await daemon.deploy({releaseId: "v1", releasePath: fixture.root, revision: "v1"})
+    // Simulate the worker having exited (e.g. crashed and exhausted its restart budget).
+    const worker = daemon.activeRelease?.getProcess("worker")
+    assert.ok(worker, "worker process should exist")
+    await worker.stop()
+    assert.equal(worker.status().state, "stopped")
+    // restartProcesses stops and then starts each target, so a stopped process ends up running.
+    const result = await daemon.restartProcesses({processId: "worker"})
+    assert.deepEqual(result.restarted, ["worker"])
+    assert.equal(worker.status().state, "running")
+    assert.ok(worker.status().pid)
+  } finally {
+    await daemon.shutdown()
+    await fs.rm(fixture.root, {force: true, recursive: true})
+  }
+})
+test("the restart control command bounces a process over the socket", async () => {
+  const fixture = await createFixture({includeService: true})
+  const daemon = await startDaemon(fixture.config)
+  try {
+    await daemon.deploy({releaseId: "v1", releasePath: fixture.root, revision: "v1"})
+    const before = pidsById(daemon.status())
+    // Send the restart through the control socket instead of calling the daemon method directly.
+    const response = await sendControlCommand({
+      command: {command: "restart", processId: "beacon"},
+      path: fixture.config.control.path
+    })
+    assert.deepEqual(response.restarted, ["beacon"])
+    assert.notEqual(pidsById(daemon.status()).beacon, before.beacon)
+  } finally {
+    await daemon.shutdown()
+    await fs.rm(fixture.root, {force: true, recursive: true})
+  }
+})
 test("control socket accepts deploy and status commands", async () => {
   const fixture = await createFixture()
   const daemon = await startDaemon(fixture.config)
@@ -336,7 +499,7 @@ test("deploy can ensure the daemon before sending the release command", async ()
 })
 /**
- * @param {{includeService?: boolean, includeSingleton?: boolean, proxyHost?: string, webCommand?: string, webDependsOnService?: boolean, webHealthTimeoutMs?: number}} [options] - Fixture options.
+ * @param {{includeCompanion?: boolean, includeService?: boolean, includeSingleton?: boolean, proxyHost?: string, singletonCwd?: string, webCommand?: string, webDependsOnService?: boolean, webHealthTimeoutMs?: number}} [options] - Fixture options.
  * @returns {Promise<{config: import("../src/config.js").RollbridgeConfig, root: string, serviceLogPath: string, singletonLogPath: string}>} Fixture data.
  */
 async function createFixture(options = {}) {
@@ -359,6 +522,14 @@ async function createFixture(options = {}) {
     })
   }
+  if (options.includeCompanion) {
+    processes.push({
+      command: `${JSON.stringify(process.execPath)} -e ${JSON.stringify("setInterval(() => {}, 1000)")}`,
+      id: "worker",
+      policy: "companion"
+    })
+  }
   processes.push({
     command: options.webCommand || (options.webDependsOnService
       ? `${JSON.stringify(process.execPath)} ${JSON.stringify(dependentAppPath)}`
@@ -376,6 +547,7 @@ async function createFixture(options = {}) {
   if (options.includeSingleton) {
     processes.push({
       command: `${JSON.stringify(process.execPath)} ${JSON.stringify(singletonAppPath)}`,
+      ...(options.singletonCwd ? {cwd: options.singletonCwd} : {}),
       env: {
         ROLLBRIDGE_SINGLETON_LOG: singletonLogPath
       },
@@ -466,6 +638,27 @@ function statusRelease(daemon, releaseId) {
   return release
 }
+/**
+ * Builds an id-to-pid lookup from a daemon status payload.
+ *
+ * Processes of releases whose state is "active" are recorded first, then services, then
+ * singletons; when an id occurs more than once, the later entry wins.
+ * @param {import("../src/daemon.js").DaemonStatus} status - Daemon status payload.
+ * @returns {Record<string, number | undefined>} Process id to current pid.
+ */
+function pidsById(status) {
+  /** @type {Record<string, number | undefined>} */
+  const pidLookup = {}
+  const activeReleases = status.releases.filter((release) => release.state === "active")
+  for (const {processes} of activeReleases) {
+    for (const {id, pid} of processes) pidLookup[id] = pid
+  }
+  for (const {id, process: managed} of [...status.services, ...status.singletons]) {
+    pidLookup[id] = managed.pid
+  }
+  return pidLookup
+}
 /**
  * @param {string} logPath - Log path.
  * @returns {Promise<Array<{event: string, pid: number, releaseId: string}>>} Events.

package/scripts/release-patch.js DELETED Viewed

@@ -1,83 +0,0 @@
-#!/usr/bin/env node
-import {execFileSync} from "node:child_process"
-/**
- * Runs a command synchronously with the parent's stdio attached.
- *
- * The child gets the current environment plus GIT_EDITOR="true" and
- * GIT_MERGE_AUTOEDIT="no".
- * @param {string} command - Command to run.
- * @param {string[]} [args] - Command arguments.
- * @returns {void}
- */
-function run(command, args = []) {
-  const childEnv = {...process.env, GIT_EDITOR: "true", GIT_MERGE_AUTOEDIT: "no"}
-  execFileSync(command, args, {env: childEnv, stdio: "inherit"})
-}
-/**
- * Runs a command synchronously and captures what it prints to stdout as UTF-8 text.
- * @param {string} command - Command to run.
- * @param {string[]} [args] - Command arguments.
- * @returns {string} Trimmed stdout.
- */
-function output(command, args = []) {
-  const stdout = execFileSync(command, args, {encoding: "utf8"})
-  return stdout.trim()
-}
-/**
- * Asks the `origin` remote which branch its HEAD points to.
- * @returns {string} GitHub remote default branch name.
- * @throws {Error} When the `git ls-remote --symref` output has no `ref: refs/heads/...` line for HEAD.
- */
-function defaultBranch() {
-  const symref = output("git", ["ls-remote", "--symref", "origin", "HEAD"])
-  const headRef = /^ref: refs\/heads\/(.+)\s+HEAD$/m.exec(symref)
-  if (headRef === null) throw new Error("Unable to determine origin default branch")
-  const [, branchName] = headRef
-  return branchName
-}
-/**
- * Checks for a local branch by verifying `refs/heads/<branch>` with `git rev-parse`;
- * any failure of that command is taken to mean the branch does not exist.
- * @param {string} branch - Branch name.
- * @returns {boolean} True when the local branch exists.
- */
-function localBranchExists(branch) {
-  const ref = `refs/heads/${branch}`
-  try {
-    output("git", ["rev-parse", "--verify", ref])
-  } catch {
-    return false
-  }
-  return true
-}
-/**
- * Fetches `origin`, checks out the remote's default branch (creating a local branch from
- * `origin/<branch>` when none exists yet) and fast-forwards it to `origin/<branch>`.
- * @returns {string} Updated default branch name.
- */
-function updateLocalDefaultBranch() {
-  run("git", ["fetch", "origin"])
-  const branch = defaultBranch()
-  const upstream = `origin/${branch}`
-  const checkoutArgs = localBranchExists(branch) ? ["checkout", branch] : ["checkout", "-b", branch, upstream]
-  run("git", checkoutArgs)
-  run("git", ["merge", "--ff-only", upstream])
-  return branch
-}
-// Release flow. The steps run strictly in order, and each `run` throws if its command fails.
-// If `npm whoami` fails (presumably because no user is logged in), start `npm login`.
-try {
-  execFileSync("npm", ["whoami"], {stdio: "ignore"})
-} catch {
-  run("npm", ["login"])
-}
-// Work from an up-to-date checkout of the remote's default branch.
-const branch = updateLocalDefaultBranch()
-// Bump the patch version without letting npm create a git commit or tag.
-run("npm", ["version", "patch", "--no-git-tag-version"])
-// NOTE(review): presumably this refreshes package-lock.json with the new version — confirm.
-run("npm", ["install"])
-run("git", ["add", "package.json", "package-lock.json"])
-run("git", ["commit", "-m", "chore: bump patch version"])
-run("git", ["push", "origin", branch])
-// Publish only after the version bump has been pushed.
-run("npm", ["publish"])