npm - cdk-local - Versions diffs - 0.66.0 → 0.67.0 - Mend

cdk-local 0.66.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md +1 -1
package/dist/cli.js +2 -2
package/dist/index.js +1 -1
package/dist/internal.js +1 -1
package/dist/local-list-9jAE7ClA.d.ts.map +1 -1
package/dist/{local-list-CyW86HDN.js → local-list-faPgnDlc.js} +416 -64
package/dist/{local-list-CyW86HDN.js.map → local-list-faPgnDlc.js.map} +1 -1
package/package.json +1 -1

package/dist/{local-list-CyW86HDN.js → local-list-faPgnDlc.js} RENAMED Viewed

@@ -20968,6 +20968,7 @@ async function startEcsService(service, options, runState) {
 	for (let i = 0; i < replicaCount; i++) {
 		const instance = {
 			index: i,
+			generation: 0,
 			state: createEcsRunState(),
 			restartCount: 0,
 			shuttingDown: false,
@@ -21112,8 +21113,11 @@ function buildNetworkAliasesByContainer(service) {
 */
 async function bootReplica(service, options, instance) {
 	const logger = getLogger().child("ecs-service");
-	const perReplicaCluster = `${options.taskOptions.cluster}-svc-${service.serviceLogicalId.toLowerCase()}-r${instance.index}`;
-	const ownerKeyPrefix = `${service.serviceLogicalId}:r${instance.index}`;
+	const gen = instance.generation;
+	const genSuffix = gen > 0 ? `-g${gen}` : "";
+	const ownerKeyGenSuffix = gen > 0 ? `:g${gen}` : "";
+	const perReplicaCluster = `${options.taskOptions.cluster}-svc-${service.serviceLogicalId.toLowerCase()}-r${instance.index}${genSuffix}`;
+	const ownerKeyPrefix = `${service.serviceLogicalId}:r${instance.index}${ownerKeyGenSuffix}`;
 	const addHostFlags = options.discovery?.registry ? options.discovery.registry.buildAddHostFlags(ownerKeyPrefix) : [];
 	const sharedNetwork = options.discovery?.sharedNetwork;
 	const networkAliasesByContainer = buildNetworkAliasesByContainer(service);
@@ -21264,6 +21268,266 @@ function unregisterReplicaFromFrontDoor(instance, frontDoor) {
 	instance.frontDoorOwnerKey = void 0;
 }
 /**
+* Phase 2 of issue #214 — shadow-replica readiness probe budget. Tested
+* with busybox httpd's ~50ms listen window and a non-trivial Node
+* Express startup (~1-3s) — 10s caps the rare slow-start app without
+* blocking the roll for typo'd configurations that will never listen.
+*
+* Mutable so unit tests can shrink the timeout window without
+* standing up a real clock; production callers leave the defaults.
+* Exposed via {@link __setShadowReadyConfig} below.
+*/
+// Total budget, in milliseconds (1e4 = 10 s), handed to the shadow TCP-ready probe as `timeoutMs`.
+let shadowReadyTimeoutMs = 1e4;
+// Pause, in milliseconds, between consecutive probe attempts; handed to the probe as `intervalMs`.
+let shadowReadyIntervalMs = 100;
+/**
+* Phase 2 of issue #214 — per-replica rolling reload primitive used by
+* `cdkl start-service --watch`. Boots one fresh "shadow" replica under a
+* bumped generation suffix, atomically swaps Cloud Map / front-door
+* registrations off the old replica, then stops and cleans up the old
+* container.
+*
+* Sequence:
+*   1. Locate the old replica by `oldReplicaIndex`. A missing slot
+*      throws; a replica that is already shutting down is skipped
+*      with a warning (no throw). The reloader must not race itself
+*      across overlapping firings, which the emulator's `reloadChain`
+*      serializer guarantees externally.
+*   2. Allocate a shadow {@link ServiceReplicaInstance} with the same
+*      logical `index` and `generation = old.generation + 1`. Appended
+*      to `runState.replicas` so a SIGTERM mid-roll tears it down too.
+*   3. `bootReplica(newService, newOptions, shadow)` boots the new
+*      container, publishes Cloud Map handles under the bumped
+*      generation suffix, and registers the shadow in the front-door
+*      pool. The OLD replica's handles + pool entry stay live during
+*      this window so consumers never see a gap.
+*   4. Atomically swap: unregister old's Cloud Map handles, drop its
+*      front-door pool entry, mark `oldInstance.shuttingDown = true`
+*      so the watcher exits. The shadow is already serving by this
+*      point.
+*   5. `cleanupEcsRun(oldInstance.state)` tears the old container +
+*      network down. The shadow remains in `runState.replicas`.
+*   6. Start the shadow's watcher so restart-on-exit is wired the
+*      same as Phase 1's boot loop.
+*
+* Failure modes:
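+*   - `bootReplica` fails: keep the old replica serving. Best-effort
+*     teardown of partial shadow state. Re-throws so the reloader can
+*     log and continue with the remaining replicas. Exception: when
+*     the effective replica count is 1 the old replica is torn down
+*     BEFORE the shadow boots (to avoid a host-port collision), so a
+*     failed boot on that path leaves no replica serving.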
+*   - Old shutdown fails: surfaced via the logger; the shadow is
+*     already live so the service stays available.
+*
+* @internal — wired only by the emulator's reload pathway.
+*/
+async function rollServiceReplica(args) {
+	const { controller, oldReplicaIndex, newService, newOptions } = args;
+	const logger = getLogger().child("ecs-service");
+	const oldInstance = controller.runState.replicas[oldReplicaIndex];
+	// A slot index with no replica behind it is treated as a caller error: throw.
+	if (!oldInstance) throw new EcsServiceRunnerError(`rollServiceReplica: no replica at index ${oldReplicaIndex} (replicas=${controller.runState.replicas.length}).`);
+	// A replica already flagged as shutting down is skipped with a warning; this path returns without throwing.
+	if (oldInstance.shuttingDown) {
+		logger.warn(`Rolling replica r${oldInstance.index} (gen ${oldInstance.generation}): retired by its own watcher mid-roll (essential container exited). Skipping this slot; save again to re-boot it.`);
+		return;
+	}
+	// An effective replica count of exactly 1 selects the teardown-first ordering
+	// (the info log below attributes this to avoiding a host-port collision).
+	const teardownOldFirst = computeReplicaCount(newService.desiredCount, newOptions.maxTasks) === 1;
+	// The shadow keeps the old replica's logical index and bumps the generation by one.
+	const shadow = {
+		index: oldInstance.index,
+		generation: oldInstance.generation + 1,
+		state: createEcsRunState(),
+		restartCount: 0,
+		shuttingDown: false,
+		inFlightBoot: void 0,
+		cloudMapHandles: [],
+		frontDoorOwnerKey: void 0
+	};
+	// Track the shadow in the shared replica list before it boots, so anything
+	// walking `runState.replicas` during the roll also sees the shadow.
+	controller.runState.replicas.push(shadow);
+	if (teardownOldFirst) {
+		// Teardown-first path: drop the old registrations, flag the old replica as
+		// shutting down, clean it up and remove it from the list BEFORE booting the shadow.
+		logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): single-replica + host-port publish — tearing old down before shadow boot to avoid host-port collision.`);
+		if (newOptions.discovery) {
+			// Unregister failures are ignored per handle so one bad handle does not abort the roll.
+			for (const handle of oldInstance.cloudMapHandles) try {
+				newOptions.discovery.registry.unregister(handle);
+			} catch {}
+			oldInstance.cloudMapHandles = [];
+		}
+		unregisterReplicaFromFrontDoor(oldInstance, newOptions.frontDoor);
+		oldInstance.shuttingDown = true;
+		try {
+			await cleanupEcsRun(oldInstance.state, { keepRunning: newOptions.taskOptions.keepRunning });
+		} catch (err) {
+			// A failed cleanup only warns; the shadow boot is still attempted.
+			logger.warn(`Rolling replica ${oldInstance.index}: cleanup of old (gen ${oldInstance.generation}) failed: ${err instanceof Error ? err.message : String(err)}. Attempting shadow boot anyway.`);
+		}
+		const oldIdx = controller.runState.replicas.indexOf(oldInstance);
+		if (oldIdx !== -1) controller.runState.replicas.splice(oldIdx, 1);
+	} else logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): booting shadow before retiring old.`);
+	// Boot and readiness probe run as one promise, exposed on `shadow.inFlightBoot` while pending.
+	const bootPromise = (async () => {
+		await bootReplica(newService, newOptions, shadow);
+		await waitForReplicaTcpReady(newService, shadow, {
+			timeoutMs: shadowReadyTimeoutMs,
+			intervalMs: shadowReadyIntervalMs
+		});
+	})();
+	shadow.inFlightBoot = bootPromise;
+	try {
+		await bootPromise;
+	} catch (err) {
+		// Roll back the shadow: remove it from the list and clean up whatever it started.
+		// The shadow cleanup passes `keepRunning: false` rather than the user's task option.
+		const shadowIdx = controller.runState.replicas.indexOf(shadow);
+		if (shadowIdx !== -1) controller.runState.replicas.splice(shadowIdx, 1);
+		try {
+			await cleanupEcsRun(shadow.state, { keepRunning: false });
+		} catch {}
+		// On the teardown-first path the old replica is already gone at this point.
+		if (teardownOldFirst) logger.error(`Rolling replica ${shadow.index}: shadow boot failed and the old replica was already torn down for the single-replica path. Save again with a clean boot to re-start the service.`);
+		throw err;
+	} finally {
+		shadow.inFlightBoot = void 0;
+	}
+	if (teardownOldFirst) {
+		// The old replica was retired above; only the shadow's watcher remains to be started.
+		watchReplica(newService, newOptions, shadow, controller.runState);
+		logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): single-replica reload complete.`);
+		return;
+	}
+	// Shadow-first path: the shadow has booted, so retire the old replica now.
+	if (newOptions.discovery) {
+		for (const handle of oldInstance.cloudMapHandles) try {
+			newOptions.discovery.registry.unregister(handle);
+		} catch {}
+		oldInstance.cloudMapHandles = [];
+	}
+	unregisterReplicaFromFrontDoor(oldInstance, newOptions.frontDoor);
+	// Disconnect from the shared network before cleanup; a failure here is logged at debug and ignored.
+	await disconnectOldFromSharedNetwork(oldInstance).catch((err) => {
+		logger.debug(`Rolling replica ${oldInstance.index}: shared-network disconnect of old (gen ${oldInstance.generation}) failed: ${err instanceof Error ? err.message : String(err)}. Proceeding with cleanup (the docker-rm step still tears it down).`);
+	});
+	oldInstance.shuttingDown = true;
+	try {
+		await cleanupEcsRun(oldInstance.state, { keepRunning: newOptions.taskOptions.keepRunning });
+	} catch (err) {
+		// Cleanup failure is surfaced as a warning only; the roll still completes.
+		logger.warn(`Rolling replica ${oldInstance.index}: cleanup of old (gen ${oldInstance.generation}) failed: ${err instanceof Error ? err.message : String(err)}. The shadow is live; the stale container may need a manual \`docker rm\`.`);
+	}
+	const oldIdx = controller.runState.replicas.indexOf(oldInstance);
+	if (oldIdx !== -1) controller.runState.replicas.splice(oldIdx, 1);
+	// Start the shadow's watcher only after the old replica has been removed from the list.
+	watchReplica(newService, newOptions, shadow, controller.runState);
+	logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): swap complete; old retired.`);
+}
+/**
+* Phase 2 of issue #214 — disconnect every container of the dying
+* replica from the shared service network BEFORE `cleanupEcsRun`'s
+* `docker stop → docker rm` sequence. Docker's embedded DNS strips an
+* alias the instant a container is disconnected, so a peer resolving
+* the service's Service Connect / Cloud Map alias right after this
+* step never picks the dying container's IP — closing the race window
+* where the alias points at an IP whose app is already gone. Best-
+* effort: a disconnect failure logs at debug and `cleanupEcsRun`'s
+* `docker rm -f` will still tear the network membership down.
+*
+* No-op for replicas that aren't on a shared network (the defensive
+* "per-replica /24" fallback path); the per-replica network is
+* destroyed by `cleanupEcsRun` directly.
+*/
+async function disconnectOldFromSharedNetwork(oldInstance) {
+	const network = oldInstance.state.network;
+	// Nothing to do when the replica has no network record, or the record is not flagged `ownedByCaller`.
+	if (!network || !network.ownedByCaller) return;
+	const logger = getLogger().child("ecs-service");
+	const networkName = network.networkName;
+	// Disconnect order: the sidecar (when one is recorded), then every started container.
+	const targets = [];
+	if (network.sidecarContainerId) targets.push(network.sidecarContainerId);
+	for (const c of oldInstance.state.startedContainers) targets.push(c.id);
+	for (const id of targets) try {
+		await dockerNetworkDisconnectImpl(networkName, id);
+	} catch (err) {
+		// Best-effort: keep going with the remaining containers, but leave a debug
+		// trace instead of swallowing the failure silently.
+		logger.debug(`Rolling replica ${oldInstance.index}: shared-network disconnect of container ${id} from '${networkName}' failed: ${err instanceof Error ? err.message : String(err)}. Continuing with the remaining containers.`);
+	}
+}
+/**
+* Production `docker network disconnect --force <network> <id>` impl,
+* extracted as a test-overridable function so the rolling-primitive
+* unit test can assert this step actually ran (the reviewer flagged
+* that the test mock previously took the `!ownedByCaller` early-return
+* path and silently never entered the disconnect branch).
+*/
+const defaultDockerNetworkDisconnectImpl = async (networkName, containerId) => {
+	// Modules are loaded lazily, in the same order on every call.
+	const childProcess = await import("node:child_process");
+	const util = await import("node:util");
+	const dockerCmdModule = await import("./docker-cmd-voNPrcRh.js");
+	const { getDockerCmd } = dockerCmdModule.t;
+	const execFileAsync = util.promisify(childProcess.execFile);
+	// Equivalent CLI: <docker> network disconnect --force <networkName> <containerId>
+	const argv = ["network", "disconnect", "--force", networkName, containerId];
+	await execFileAsync(getDockerCmd(), argv);
+};
+// Mutable binding that callers go through; starts out as the real docker implementation.
+let dockerNetworkDisconnectImpl = defaultDockerNetworkDisconnectImpl;
+/**
+* Phase 2 of issue #214 — shadow-replica TCP readiness probe used by
+* {@link rollServiceReplica} before the atomic registry swap. Polls the
+* essential container's first port mapping (the one Cloud Map / Service
+* Connect publishes) via TCP-connect on the shadow's docker network IP,
+* retrying every `intervalMs` until either the connect succeeds or the
+* timeout elapses.
+*
+* The probe is best-effort: a timeout logs a warn but DOES NOT throw.
+* Swapping anyway is the lesser evil — the dying old replica's image
+* is about to be torn down, and the shadow's new image is the user's
+* intent. A timed-out probe usually means the app inside the new image
+* has a startup bug; the user will see the connection failures on
+* their probe / curl and fix the app, then save again. Failing the
+* roll here would leave the OLD replica running on stale code with no
+* recovery path other than `^C`.
+*
+* Exposed for the unit test pattern: the probe's `connect` impl is
+* injectable via {@link __setTcpProbeImpl} so the rolling-primitive
+* unit test can avoid any real TCP socket.
+*/
+async function waitForReplicaTcpReady(service, shadow, opts) {
+	const log = getLogger().child("ecs-service");
+	// No network name recorded for the shadow: nothing to probe.
+	const netName = shadow.state.network?.networkName;
+	if (!netName) return;
+	// Probe target: the first container flagged essential, otherwise the first container.
+	const { containers } = service.task;
+	const probeTarget = containers.find((c) => c.essential) ?? containers[0];
+	if (!probeTarget || probeTarget.portMappings.length === 0) return;
+	const running = shadow.state.startedContainers.find((c) => c.name === probeTarget.name);
+	if (!running) return;
+	let ip;
+	try {
+		ip = await getContainerNetworkIp(running.id, netName);
+	} catch (err) {
+		logger_warn: {
+			log.warn(`Shadow replica r${shadow.index} (gen ${shadow.generation}): TCP-ready probe could not resolve docker IP: ${err instanceof Error ? err.message : String(err)}. Proceeding with swap.`);
+		}
+		return;
+	}
+	if (!ip) return;
+	const port = probeTarget.portMappings[0].containerPort;
+	const giveUpAt = Date.now() + opts.timeoutMs;
+	let lastFailure;
+	// Retry until a connect succeeds or the budget is used up; the deadline is checked before each attempt.
+	while (Date.now() < giveUpAt) {
+		let accepted = false;
+		try {
+			await tcpProbeImpl(ip, port);
+			log.debug(`Shadow replica r${shadow.index} (gen ${shadow.generation}): TCP probe ${ip}:${port} accepted; proceeding with swap.`);
+			accepted = true;
+		} catch (err) {
+			lastFailure = err instanceof Error ? err.message : String(err);
+		}
+		if (accepted) return;
+		await sleep(opts.intervalMs);
+	}
+	// Budget exhausted: warn, then return normally so the caller proceeds with the swap.
+	log.warn(`Shadow replica r${shadow.index} (gen ${shadow.generation}): TCP probe ${ip}:${port} did not accept within ${opts.timeoutMs}ms (last: ${lastFailure ?? "n/a"}). Swapping anyway — the new image is the user intent. Initial requests after the swap may 502 until the app finishes binding.`);
+}
+/**
+* Default TCP-connect probe used by {@link waitForReplicaTcpReady}.
+* Opens a socket to `host:port` and resolves on `connect`; rejects on
+* any error. The socket is destroyed immediately on connect — we don't
+* want to keep a connection open or send any bytes.
+*/
+// `connectTimeoutMs` (optional, default 1000) bounds a single connect attempt.
+// Without it, a connect to an address that never answers could stay pending far
+// longer than the caller's overall probe budget, because the caller only checks
+// its deadline between attempts.
+const defaultTcpProbeImpl = async (host, port, connectTimeoutMs = 1e3) => {
+	const { createConnection } = await import("node:net");
+	await new Promise((resolve, reject) => {
+		const socket = createConnection({
+			host,
+			port
+		});
+		// Destroy the socket on every failure path so no handle outlives the probe.
+		const onError = (err) => {
+			socket.destroy();
+			reject(err);
+		};
+		// The socket idle timer also runs while connecting; the callback fires on the 'timeout' event.
+		socket.setTimeout(connectTimeoutMs, () => {
+			onError(new Error(`TCP connect to ${host}:${port} timed out after ${connectTimeoutMs}ms`));
+		});
+		socket.once("connect", () => {
+			// Connected: that is all the probe needs; close without sending any bytes.
+			socket.destroy();
+			resolve();
+		});
+		socket.once("error", onError);
+	});
+};
+// Mutable binding that the readiness probe goes through; starts out as the real socket implementation.
+let tcpProbeImpl = defaultTcpProbeImpl;
+/**
 * Long-running watcher loop for one replica. Polls the essential
 * container's exit code via `docker wait`; on exit, decides whether to
 * restart per `restartPolicy` + applies exponential backoff. The loop
@@ -23488,40 +23752,43 @@ async function runEcsServiceEmulator(targets, options, strategy, extraStateProvi
 	}
 }
 /**
-* Phase 1 of issue #214 — refuse a `--watch` run when the resolved service's
-* effective replica count (`min(template DesiredCount, --max-tasks)`) is > 1.
-* The Phase 1 reload pathway tears the single replica down before booting the
-* new one; multi-replica services would therefore drop multiple connections at
-* once and lose any in-memory state. Multi-replica rolling reload is Phase 2
-* of issue #214. Exposed for the unit test that locks the gating logic
-* (the integ test only covers the single-replica happy path).
+* Phase 2 of issue #214 — multi-replica rolling reload cycle for
+* `cdkl start-service --watch`. Mirrors start-api's `reloadAllServers`
+* shape but per-ECS-service, replacing Phase 1's "tear single replica
+* down, boot fresh" sequence with a per-replica rolling loop so the
+* service stays available end-to-end:
 *
-* @internal
-*/
-function assertSingleReplicaForWatch(service, options) {
-	if (options.watch !== true) return;
-	const effective = computeReplicaCount(service.desiredCount, options.maxTasks);
-	if (effective > 1) throw new LocalStartServiceError(`--watch is single-replica only in v1; service '${service.serviceName}' resolves to ${effective} replica(s) (template DesiredCount=${service.desiredCount}, --max-tasks=${options.maxTasks}). Lower --max-tasks to 1, drop the DesiredCount in your CDK code, or drop --watch. Multi-replica rolling reload is Phase 2 of issue #214.`);
-}
-/**
-* Phase 1 of issue #214 — single-replica rebuild-on-change reload cycle.
-* Mirrors start-api's `reloadAllServers` shape but per-ECS-service:
-*
-*   1. Re-runs `synthesizer.synthesize(synthOpts)` once (failure → warn +
-*      keep every replica serving).
+*   1. Re-runs `synthesizer.synthesize(synthOpts)` once (failure → warn
+*      + keep every replica serving).
 *   2. Re-runs `strategy.resolveBoots(stacks, resolvedTargets)` so a
-*      target that disappears from the CDK code is detected (warn + keep
-*      previous).
+*      target that disappears from the CDK code is detected (warn +
+*      keep previous).
 *   3. Refreshes `cloudMapIndexByStack` from the new stacks so a peer
 *      service's namespace / discovery-name rename is picked up by the
-*      next replica's Cloud Map publish.
-*   4. Per-target: tear the existing controller down, boot a fresh one
-*      against the new stacks. A per-target boot failure logs warn and
-*      leaves that target dark until the next save with a clean boot.
-*
-* Phase 1 trade-off: each target's replica briefly stops (between old
-* `controller.shutdown()` and new `bootOneTarget()` finishing). Phase 2 of
-* #214 swaps that for a rolling deploy across multiple replicas.
+*      next shadow replica's Cloud Map publish.
+*   4. Per-target:
+*      a. Resolves the new (service, runnerOpts) pair against the new
+*         stacks (cross-stack env / assume-task-role / `--env-vars`
+*         all re-resolved fresh).
+*      b. For each existing replica `i` in 0..min(old, new) - 1:
+*         {@link rollServiceReplica} boots a shadow replica with the
+*         new image under a bumped generation suffix, atomically swaps
+*         Cloud Map / front-door registrations, then stops + cleans up
+*         the old replica. Sequential — only one replica is mid-swap
+*         at a time, so peer services + the front-door pool always
+*         have at least N-1 live endpoints during a roll.
+*
+* Phase 2 trade-off: when the effective replica count changes mid-roll
+* (the user bumped `DesiredCount` or `--max-tasks` flips a clamp),
+* the rolling pathway keeps the existing replicas on the new image
+* but does not scale up / down to match the new count. A warn surfaces
+* so the user can `^C` + re-launch to scale; a richer "scale + roll"
+* mode is left to a follow-up under #214.
+*
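+* Per-replica boot failure during the roll: the OLD replica stays
+* live (the shadow was torn down by `rollServiceReplica`), the
+* remaining replicas are still rolled, and the failure is surfaced
+* via the logger so the user can fix the source + save again. The
+* exception is a service whose effective replica count is 1:
+* `rollServiceReplica` retires the old replica BEFORE booting the
+* shadow there, so a failed boot leaves that service with no
+* replica serving.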
 */
 async function reloadAllServices(args) {
 	const { perTarget, synthesizer, synthOpts, strategy, resolvedTargets, cloudMapIndexByStack, options, discovery, skipPull, extraStateProviders, profileCredsFile, frontDoorByService, logger } = args;
@@ -23544,29 +23811,82 @@ async function reloadAllServices(args) {
 	for (const pt of perTarget) {
 		const newBoot = newBootByTarget.get(pt.boot.target);
 		if (!newBoot) {
-			logger.warn(`Reload: target '${pt.boot.target}' no longer resolves to a service in the synthesized app; keeping the previous replica serving.`);
+			logger.warn(`Reload: target '${pt.boot.target}' no longer resolves to a service in the synthesized app; keeping the previous replica(s) serving.`);
 			continue;
 		}
-		const oldController = pt.controller;
-		try {
-			if (oldController) await oldController.shutdown();
-		} catch (err) {
-			logger.warn(`Reload: shutdown of previous '${pt.boot.target}' controller failed (${err instanceof Error ? err.message : String(err)}); attempting re-boot anyway.`);
+		const controller = pt.controller;
+		if (!controller) {
+			logger.warn(`Reload: target '${pt.boot.target}' has no live controller (previous boot likely failed); skipping roll. \`^C\` and re-run start-service to recover.`);
+			continue;
 		}
-		const newRunState = createServiceRunState();
-		let newController;
+		await rollOneTarget({
+			controller,
+			newBoot,
+			stacks,
+			options,
+			discovery,
+			skipPull,
+			extraStateProviders,
+			profileCredsFile,
+			frontDoorPools: frontDoorByService.get(newBoot.target),
+			suppressLoadBalancerWarning: strategy.suppressLoadBalancerWarning === true,
+			logger
+		});
+	}
+	logger.info("Reload complete.");
+}
+/**
+* Phase 2 of issue #214 — roll every replica of one target through the
+* new task descriptor sequentially. Extracted from {@link reloadAllServices}
+* so the per-target try/catch logic (synth-failure / resolve-failure /
+* per-replica boot-failure) stays uniform and readable.
+*
+* State-provider lifetime mirrors {@link bootOneTarget}: a fresh
+* `LocalStateProvider` is created at the top, disposed in `finally`,
+* even when the resolve / roll throws.
+*/
+async function rollOneTarget(args) {
+	const { controller, newBoot, stacks, options, discovery, skipPull, extraStateProviders, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning, logger } = args;
+	const candidate = pickCandidateStack(parseEcsTarget(newBoot.target).stackPattern, stacks);
+	// A fresh state provider per roll; it is disposed in the `finally` below on every exit path.
+	const stateProvider = createLocalStateProvider(options, candidate?.stackName ?? "", await resolveCfnFallbackRegion(options, candidate?.region), extraStateProviders);
+	try {
+		let resolved;
 		try {
-			newController = await bootOneTarget(newBoot, newRunState, stacks, options, discovery, skipPull, extraStateProviders, profileCredsFile, frontDoorByService.get(newBoot.target), strategy.suppressLoadBalancerWarning === true);
+			// `quiet: true` suppresses the per-target banners that the initial boot prints.
+			resolved = await resolveServiceAndRunnerOpts(newBoot, stacks, options, discovery, skipPull, stateProvider, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning, { quiet: true });
 		} catch (err) {
-			if (err instanceof LocalStartServiceError) logger.error(`Reload of '${pt.boot.target}' was rejected: ${err.message}`);
-			else logger.error(`Reload: re-boot of '${pt.boot.target}' failed (${err instanceof Error ? err.message : String(err)}). The previous replica was torn down; save again with a clean boot to re-start it, or ^C and re-run start-service.`);
-			await Promise.allSettled(newRunState.replicas.map((r) => cleanupEcsRun(r.state, { keepRunning: false }).catch(() => void 0)));
-			continue;
+			// Resolution failed before anything was touched: nothing is rolled for this target.
+			const reason = err instanceof Error ? err.message : String(err);
+			logger.error(`Reload of '${newBoot.target}' was rejected: ${reason}. Existing replica(s) keep serving.`);
+			return;
+		}
+		const { service: newService, runnerOpts: newRunnerOpts } = resolved;
+		// Snapshot the live replicas up front; `rollServiceReplica` mutates `runState.replicas` as it goes.
+		const oldReplicas = controller.runState.replicas.filter((r) => !r.shuttingDown);
+		if (oldReplicas.length === 0) {
+			logger.warn(`Reload of '${newBoot.target}': no live replicas to roll (all shutting down). \`^C\` and re-run start-service to recover.`);
+			return;
+		}
+		// Compare against the EFFECTIVE replica count (DesiredCount clamped by --max-tasks), not the raw
+		// DesiredCount; otherwise a service clamped by --max-tasks would warn on every single save.
+		const effectiveCount = computeReplicaCount(newService.desiredCount, newRunnerOpts.maxTasks);
+		if (effectiveCount !== oldReplicas.length) logger.warn(`Reload of '${newBoot.target}': effective replica count ${effectiveCount} (DesiredCount=${newService.desiredCount}, --max-tasks=${newRunnerOpts.maxTasks}) does not match the ${oldReplicas.length} live replica(s); rolling existing replicas only — scale changes during --watch are not yet supported. \`^C\` and re-run start-service to apply the new replica count.`);
+		logger.info(`Reload of '${newBoot.target}': rolling ${oldReplicas.length} replica(s) one at a time (start new shadow → swap registrations → stop old).`);
+		for (const oldInstance of oldReplicas) {
+			// Re-resolve the array position each time: earlier rolls push and splice entries.
+			const idx = controller.runState.replicas.indexOf(oldInstance);
+			if (idx === -1) {
+				logger.warn(`Reload of '${newBoot.target}': replica r${oldInstance.index} (gen ${oldInstance.generation}) vanished before its roll; skipping.`);
+				continue;
+			}
+			try {
+				await rollServiceReplica({
+					controller,
+					oldReplicaIndex: idx,
+					newService,
+					newOptions: newRunnerOpts
+				});
+			} catch (err) {
+				const reason = err instanceof Error ? err.message : String(err);
+				// `rollServiceReplica` retires the old replica before booting the shadow when the effective
+				// count is 1, so only claim "keeps serving" on the shadow-first path.
+				const aftermath = effectiveCount === 1 ? "The old replica had already been torn down (teardown-first path for an effective replica count of 1), so that slot is no longer serving" : "The old replica keeps serving";
+				logger.error(`Reload of '${newBoot.target}' replica r${oldInstance.index}: ${reason}. ${aftermath}; remaining replicas will still be rolled.`);
+			}
 		}
-		pt.runState = newRunState;
-		pt.controller = newController;
+	} finally {
+		if (stateProvider) stateProvider.dispose();
 	}
-	logger.info("Reload complete.");
 }
 async function bootOneTarget(boot, runState, stacks, options, discovery, skipPull, extraStateProviders, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning) {
 	const candidate = pickCandidateStack(parseEcsTarget(boot.target).stackPattern, stacks);
@@ -23578,14 +23898,41 @@ async function bootOneTarget(boot, runState, stacks, options, discovery, skipPul
 	}
 }
 async function runOneTarget(boot, runState, stacks, options, discovery, skipPull, stateProvider, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning) {
+	// Resolve the boot into its (service, runner options) pair, then hand both to the service runner.
+	const resolved = await resolveServiceAndRunnerOpts(boot, stacks, options, discovery, skipPull, stateProvider, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning);
+	return startEcsService(resolved.service, resolved.runnerOpts, runState);
+}
+/**
+* Resolve a {@link ServiceBoot} to its `(ResolvedEcsService, ServiceRunnerOptions)`
+* pair. Shared by the initial boot path (`runOneTarget`) and the
+* rolling-reload pathway of Phase 2 of issue #214 (`rollOneTarget`,
+* which `reloadAllServices` invokes once per target).
+*
+* Walks the same steps the original `runOneTarget` body did:
+*   1. Build the per-target image-resolution context (resolves
+*      asset / `Fn::Sub` / `--from-cfn-stack` overlays for image URIs).
+*   2. Resolve the ECS service target into a {@link ResolvedEcsService}.
+*   3. Apply the cross-stack env / secret resolver when the task
+*      references `Fn::ImportValue` / `Fn::GetStackOutput` across
+*      stacks.
+*   4. Resolve task-role credentials when `--assume-task-role` is set.
+*   5. Resolve `--env-vars` overrides.
+*   6. Compose {@link ServiceRunnerOptions} (including the shared
+*      `discovery` + per-service `frontDoor` pools the rolling
+*      reload depends on for atomic registry swaps).
+*
+* Side effects: logs the target descriptor + Service Connect /
+* ServiceRegistries banners ONLY on the initial boot. The reload
+* pathway calls this on every save; the banners would otherwise
+* spam the console once per save. Pass `quiet: true` to skip them.
+*/
+async function resolveServiceAndRunnerOpts(boot, stacks, options, discovery, skipPull, stateProvider, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning, opts = {}) {
 	const logger = getLogger();
 	const target = boot.target;
+	const quiet = opts.quiet === true;
 	const imageContext = await buildEcsImageResolutionContext(target, stacks, options, stateProvider);
 	const service = resolveEcsServiceTarget(target, stacks, imageContext, { suppressLoadBalancerWarning });
-	logger.info(`Target: ${service.stack.stackName}/${service.serviceLogicalId} (service=${service.serviceName}, desiredCount=${service.desiredCount}, task=${service.task.taskDefinitionLogicalId})`);
-	assertSingleReplicaForWatch(service, options);
-	if (service.serviceConnect) logger.info(`Service Connect: namespace='${service.serviceConnect.namespaceName}', ${service.serviceConnect.services.length} service(s) registered for peer discovery.`);
-	if (service.serviceRegistries.length > 0) logger.info(`Cloud Map: ${service.serviceRegistries.length} ServiceRegistry binding(s).`);
+	if (!quiet) logger.info(`Target: ${service.stack.stackName}/${service.serviceLogicalId} (service=${service.serviceName}, desiredCount=${service.desiredCount}, task=${service.task.taskDefinitionLogicalId})`);
+	if (!quiet && service.serviceConnect) logger.info(`Service Connect: namespace='${service.serviceConnect.namespaceName}', ${service.serviceConnect.services.length} service(s) registered for peer discovery.`);
+	if (!quiet && service.serviceRegistries.length > 0) logger.info(`Cloud Map: ${service.serviceRegistries.length} ServiceRegistry binding(s).`);
 	const taskNeeds = detectEcsImageResolutionNeeds(stacks.find((s) => s.stackName === service.stack.stackName) ?? service.stack);
 	if (stateProvider && taskNeeds.needsCrossStackResolver) {
 		const consumerRegion = options.region ?? process.env["AWS_REGION"] ?? process.env["AWS_DEFAULT_REGION"] ?? service.stack.region ?? "us-east-1";
@@ -23634,13 +23981,16 @@ async function runOneTarget(boot, runState, stacks, options, discovery, skipPull
 		containerPath: profileCredsFile.containerPath,
 		profileName: profileCredsFile.profileName
 	};
-	return startEcsService(service, {
-		maxTasks: options.maxTasks,
-		restartPolicy: options.restartPolicy,
-		taskOptions: taskOpts,
-		discovery,
-		...frontDoorPools && frontDoorPools.length > 0 ? { frontDoor: { pools: frontDoorPools } } : {}
-	}, runState);
+	return {
+		service,
+		runnerOpts: {
+			maxTasks: options.maxTasks,
+			restartPolicy: options.restartPolicy,
+			taskOptions: taskOpts,
+			discovery,
+			...frontDoorPools && frontDoorPools.length > 0 ? { frontDoor: { pools: frontDoorPools } } : {}
+		}
+	};
 }
 /**
 * Stand up one host-side reverse-proxy server PER LISTENER PORT from the
@@ -24048,9 +24398,11 @@ function addCommonEcsServiceOptions(cmd) {
 * a leaf compute runner, symmetric with `invoke` / `run-task`.
 *
 * `supportsWatch: true` opts this strategy into the emulator's `--watch`
-* reload pathway (Phase 1 of issue #214 — single-replica rebuild-on-change).
-* `start-alb`'s strategy intentionally does NOT set this so a `--watch` flag
-* never leaks into the ALB-front-door path (Phase 3).
+* reload pathway (Phase 1 + Phase 2 of issue #214 — per-replica rolling
+* deploy: shadow boot under a bumped generation suffix, TCP-ready probe,
+* atomic Cloud Map / front-door swap, retire old). `start-alb`'s strategy
+* intentionally does NOT set this so a `--watch` flag never leaks into
+* the ALB-front-door path (Phase 3).
 */
 function serviceStrategy() {
 	return {
@@ -24103,7 +24455,7 @@ function createLocalStartServiceCommand(opts = {}) {
 * advertise a flag one of its consumers does not honor.
 */
 function addStartServiceSpecificOptions(cmd) {
-	return cmd.addOption(new Option("--host-port <containerPort=hostPort...>", "Publish a container port on a specific host port (e.g. 80=8080); repeatable. Default: host port == container port. Use this on macOS to map a privileged container port (< 1024) to a non-privileged host port and avoid the Docker Desktop admin-password prompt. (Single-replica services only — multi-replica services do not publish host ports.)")).addOption(new Option("--watch", "Hot-reload: re-synth + re-resolve every booted service and replace its single replica when the CDK app's source changes (honors cdk.json watch.include/exclude; cdk.out, node_modules, .git are always excluded). Single-replica services only in v1 — a service with effective replica count > 1 errors out (multi-replica rolling deploy is Phase 2 of issue #214). Off by default; the previous replica keeps serving when synth fails mid-reload.").default(false));
+	return cmd.addOption(new Option("--host-port <containerPort=hostPort...>", "Publish a container port on a specific host port (e.g. 80=8080); repeatable. Default: host port == container port. Use this on macOS to map a privileged container port (< 1024) to a non-privileged host port and avoid the Docker Desktop admin-password prompt. (Single-replica services only — multi-replica services do not publish host ports.)")).addOption(new Option("--watch", "Hot-reload: re-synth + per-replica rolling deploy when the CDK source changes (honors cdk.json watch.include/exclude; cdk.out, node_modules, .git are always excluded). Each replica is rolled one at a time — boot a shadow under a bumped generation suffix, wait for its container port to accept a TCP connection, atomically swap Service-Connect / Cloud Map registrations, then retire the old container — so peer services see zero connection refusals across the reload even on multi-replica services. Off by default; existing replica(s) keep serving when synth fails mid-reload.").default(false));
 }
 //#endregion
@@ -25052,4 +25404,4 @@ function addListSpecificOptions(cmd) {
 //#endregion
 export { buildHttpApiV2Event as $, resolveRuntimeFileExtension as $t, createWatchPredicates as A, AGENTCORE_HTTP_PROTOCOL as An, A2A_PATH as At, readMtlsMaterialsFromDisk as B, LocalInvokeBuildError as Bn, invokeAgentCore as Bt, getContainerNetworkIp as C, discoverWebSocketApisOrThrow as Cn, applyCorsResponseHeaders as Ct, createLocalInvokeAgentCoreCommand as D, resolveLambdaArnIntrinsic as Dn, matchPreflight as Dt, addInvokeAgentCoreSpecificOptions as E, pickRefLogicalId as En, isFunctionUrlOacFronted as Et, buildStageMap as F, resolveAgentCoreTarget as Fn, mcpInvokeOnce as Ft, buildMethodArn as G, computeCodeImageTag as Gt, resolveSelectionExpression as H, downloadAndExtractS3Bundle as Ht, availableApiIdentifiers as I, derivePseudoParametersFromRegion as In, parseSseForJsonRpc as It, invokeRequestAuthorizer as J, addInvokeSpecificOptions as Jt, computeRequestIdentityHash as K, renderCodeDockerfile as Kt, filterRoutesByApiIdentifier as L, formatStateRemedy as Ln, AGENTCORE_SIGV4_SERVICE as Lt, createAuthorizerCache as M, AGENTCORE_RUNTIME_TYPE as Mn, MCP_CONTAINER_PORT as Mt, createFileWatcher as N, AgentCoreResolutionError as Nn, MCP_PATH as Nt, addStartApiSpecificOptions as O, AGENTCORE_A2A_PROTOCOL as On, invokeAgentCoreWs as Ot, attachStageContext as P, pickAgentCoreCandidateStack as Pn, MCP_PROTOCOL_VERSION as Pt, applyAuthorizerOverlay as Q, resolveRuntimeCodeMountPath as Qt, filterRoutesByApiIdentifiers as R, substituteImagePlaceholders as Rn, signAgentCoreInvocation as Rt, CloudMapRegistry as S, discoverWebSocketApis as Sn, attachAuthorizers as St, createLocalRunTaskCommand as T, discoverRoutes as Tn, buildCorsConfigFromCloudFrontChain as Tt, resolveServiceIntegrationParameters as U, SUPPORTED_CODE_RUNTIMES as Ut, startApiServer as V, waitForAgentCorePing as Vt, defaultCredentialsLoader as W, buildAgentCoreCodeImage as Wt, matchRoute as X, architectureToPlatform as Xt, invokeTokenAuthorizer as Y, createLocalInvokeCommand as Yt, translateLambdaResponse as 
Z, buildContainerImage as Zt, parseMaxTasks as _, resolveSsmParameters as _n, buildJwksUrlFromIssuer as _t, albStrategy as a, substituteEnvVarsFromStateAsync as an, VtlEvaluationError as at, runEcsServiceEmulator as b, countTargets as bn, verifyJwtAuthorizer as bt, resolveAlbTarget as c, LocalStateSourceError as cn, bufferToBody as ct, addStartServiceSpecificOptions as d, rejectExplicitCfnStackWithMultipleStacks as dn, handleConnectionsRequest as dt, resolveRuntimeImage as en, buildRestV1Event as et, createLocalStartServiceCommand as f, resolveCfnFallbackRegion as fn, parseConnectionsPath as ft, buildEcsImageResolutionContext as g, collectSsmParameterRefs as gn, buildCognitoJwksUrl as gt, addCommonEcsServiceOptions as h, CfnLocalStateProvider as hn, buildMessageEvent as ht, addAlbSpecificOptions as i, substituteEnvVarsFromState as in, tryParseStatus as it, resolveApiTargetSubset as j, AGENTCORE_MCP_PROTOCOL as jn, a2aInvokeOnce as jt, createLocalStartApiCommand as k, AGENTCORE_AGUI_PROTOCOL as kn, A2A_CONTAINER_PORT as kt, isApplicationLoadBalancer as l, createLocalStateProvider as ln, ConnectionRegistry as lt, MAX_TASKS_SUBNET_RANGE_CAP as m, resolveCfnStackName as mn, buildDisconnectEvent as mt, createLocalListCommand as n, substituteAgainstState as nn, pickResponseTemplate as nt, createLocalStartAlbCommand as o, resolveEnvVars as on, HOST_GATEWAY_MIN_VERSION as ot, serviceStrategy as p, resolveCfnRegion as pn, buildConnectEvent as pt, evaluateCachedLambdaPolicy as q, toCmdArgv as qt, formatTargetListing as r, substituteAgainstStateAsync as rn, selectIntegrationResponse as rt, parseLbPortOverrides as s, materializeLayerFromArn as sn, probeHostGatewaySupport as st, addListSpecificOptions as t, EcsTaskResolutionError as tn, evaluateResponseParameters as tt, resolveAlbFrontDoor as u, isCfnFlagPresent as un, buildMgmtEndpointEnvUrl as ut, parseRestartPolicy as v, resolveWatchConfig as vn, createJwksCache as vt, addRunTaskSpecificOptions as w, 
parseSelectionExpressionPath as wn, buildCorsConfigByApiId as wt, buildCloudMapIndex as x, listTargets as xn, verifyJwtViaDiscovery as xt, resolveSharedSidecarCredentials as y, resolveSingleTarget as yn, verifyCognitoJwt as yt, groupRoutesByServer as z, tryResolveImageFnJoin as zn, AGENTCORE_SESSION_ID_HEADER as zt };
-//# sourceMappingURL=local-list-CyW86HDN.js.map
+//# sourceMappingURL=local-list-faPgnDlc.js.map