cdk-local 0.66.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20968,6 +20968,7 @@ async function startEcsService(service, options, runState) {
20968
20968
  for (let i = 0; i < replicaCount; i++) {
20969
20969
  const instance = {
20970
20970
  index: i,
20971
+ generation: 0,
20971
20972
  state: createEcsRunState(),
20972
20973
  restartCount: 0,
20973
20974
  shuttingDown: false,
@@ -21112,8 +21113,11 @@ function buildNetworkAliasesByContainer(service) {
21112
21113
  */
21113
21114
  async function bootReplica(service, options, instance) {
21114
21115
  const logger = getLogger().child("ecs-service");
21115
- const perReplicaCluster = `${options.taskOptions.cluster}-svc-${service.serviceLogicalId.toLowerCase()}-r${instance.index}`;
21116
- const ownerKeyPrefix = `${service.serviceLogicalId}:r${instance.index}`;
21116
+ const gen = instance.generation;
21117
+ const genSuffix = gen > 0 ? `-g${gen}` : "";
21118
+ const ownerKeyGenSuffix = gen > 0 ? `:g${gen}` : "";
21119
+ const perReplicaCluster = `${options.taskOptions.cluster}-svc-${service.serviceLogicalId.toLowerCase()}-r${instance.index}${genSuffix}`;
21120
+ const ownerKeyPrefix = `${service.serviceLogicalId}:r${instance.index}${ownerKeyGenSuffix}`;
21117
21121
  const addHostFlags = options.discovery?.registry ? options.discovery.registry.buildAddHostFlags(ownerKeyPrefix) : [];
21118
21122
  const sharedNetwork = options.discovery?.sharedNetwork;
21119
21123
  const networkAliasesByContainer = buildNetworkAliasesByContainer(service);
@@ -21264,6 +21268,266 @@ function unregisterReplicaFromFrontDoor(instance, frontDoor) {
21264
21268
  instance.frontDoorOwnerKey = void 0;
21265
21269
  }
21266
21270
  /**
21271
+ * Phase 2 of issue #214 — shadow-replica readiness probe budget. Tested
21272
+ * with busybox httpd's ~50ms listen window and a non-trivial Node
21273
+ * Express startup (~1-3s) — 10s caps the rare slow-start app without
21274
+ * blocking the roll for typo'd configurations that will never listen.
21275
+ *
21276
+ * Mutable so unit tests can shrink the timeout window without
21277
+ * standing up a real clock; production callers leave the defaults.
21278
+ * Exposed via {@link __setShadowReadyConfig} below.
21279
+ */
21280
+ let shadowReadyTimeoutMs = 1e4;
21281
+ let shadowReadyIntervalMs = 100;
/**
 * Phase 2 of issue #214 — per-replica rolling reload primitive used by
 * `cdkl start-service --watch`. Boots one fresh "shadow" replica under a
 * bumped generation, moves Cloud Map / front-door registrations off the
 * old replica, then stops and cleans up the old replica.
 *
 * Sequence:
 *   1. Look up the old replica at `oldReplicaIndex`. Throws
 *      `EcsServiceRunnerError` when the slot is empty; logs a warning and
 *      returns without rolling when the replica is already shutting down.
 *   2. Allocate a shadow instance with the same logical `index` and
 *      `generation = old.generation + 1`, and append it to
 *      `runState.replicas` so a teardown that walks that list mid-roll
 *      sees it too.
 *   3. When the new effective replica count is exactly 1, retire the old
 *      replica FIRST (registrations, then containers) and only then boot
 *      the shadow. Otherwise boot the shadow first and retire the old
 *      replica afterwards.
 *   4. Boot = `bootReplica` followed by `waitForReplicaTcpReady` using the
 *      module-level `shadowReadyTimeoutMs` / `shadowReadyIntervalMs`.
 *   5. Start the shadow's watcher via `watchReplica`.
 *
 * Failure modes:
 *   - Shadow boot fails: the shadow is removed from `runState.replicas`,
 *     any registrations it already published are released, its run state
 *     is cleaned up, and the original error is re-thrown. On the
 *     shadow-first path the old replica is left untouched.
 *   - Cleanup of the old replica fails: logged at warn; the roll continues.
 *
 * @internal — wired only by the emulator's reload pathway.
 * @param {object} args
 * @param {object} args.controller - Service controller; only `runState.replicas` is used here.
 * @param {number} args.oldReplicaIndex - Position of the old replica inside `controller.runState.replicas`.
 * @param {object} args.newService - Freshly resolved service descriptor.
 * @param {object} args.newOptions - Freshly resolved runner options (`maxTasks`, `taskOptions`, optional `discovery` / `frontDoor`).
 * @returns {Promise<void>}
 * @throws {EcsServiceRunnerError} When no replica exists at `oldReplicaIndex`.
 */
async function rollServiceReplica(args) {
	const { controller, oldReplicaIndex, newService, newOptions } = args;
	const logger = getLogger().child("ecs-service");
	const replicas = controller.runState.replicas;
	const oldInstance = replicas[oldReplicaIndex];
	if (!oldInstance) throw new EcsServiceRunnerError(`rollServiceReplica: no replica at index ${oldReplicaIndex} (replicas=${controller.runState.replicas.length}).`);
	if (oldInstance.shuttingDown) {
		logger.warn(`Rolling replica r${oldInstance.index} (gen ${oldInstance.generation}): retired by its own watcher mid-roll (essential container exited). Skipping this slot; save again to re-boot it.`);
		return;
	}
	const teardownOldFirst = computeReplicaCount(newService.desiredCount, newOptions.maxTasks) === 1;
	const shadow = {
		index: oldInstance.index,
		generation: oldInstance.generation + 1,
		state: createEcsRunState(),
		restartCount: 0,
		shuttingDown: false,
		inFlightBoot: void 0,
		cloudMapHandles: [],
		frontDoorOwnerKey: void 0
	};
	replicas.push(shadow);
	if (teardownOldFirst) {
		logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): single-replica + host-port publish — tearing old down before shadow boot to avoid host-port collision.`);
		retireReplicaRegistrations(oldInstance, newOptions);
		oldInstance.shuttingDown = true;
		try {
			await cleanupEcsRun(oldInstance.state, { keepRunning: newOptions.taskOptions.keepRunning });
		} catch (err) {
			logger.warn(`Rolling replica ${oldInstance.index}: cleanup of old (gen ${oldInstance.generation}) failed: ${err instanceof Error ? err.message : String(err)}. Attempting shadow boot anyway.`);
		}
		dropReplicaFromRunState(controller.runState, oldInstance);
	} else logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): booting shadow before retiring old.`);
	const bootPromise = (async () => {
		await bootReplica(newService, newOptions, shadow);
		await waitForReplicaTcpReady(newService, shadow, {
			timeoutMs: shadowReadyTimeoutMs,
			intervalMs: shadowReadyIntervalMs
		});
	})();
	shadow.inFlightBoot = bootPromise;
	try {
		await bootPromise;
	} catch (err) {
		dropReplicaFromRunState(controller.runState, shadow);
		// The boot may have published Cloud Map handles / a front-door entry
		// before it failed. Release them so nothing keeps routing to a
		// shadow whose containers are about to be removed.
		try {
			retireReplicaRegistrations(shadow, newOptions);
		} catch (retireErr) {
			logger.debug(`Rolling replica ${shadow.index}: releasing registrations of failed shadow (gen ${shadow.generation}) failed: ${retireErr instanceof Error ? retireErr.message : String(retireErr)}.`);
		}
		try {
			await cleanupEcsRun(shadow.state, { keepRunning: false });
		} catch {
			// Best-effort: the boot error re-thrown below is the one that matters.
		}
		if (teardownOldFirst) logger.error(`Rolling replica ${shadow.index}: shadow boot failed and the old replica was already torn down for the single-replica path. Save again with a clean boot to re-start the service.`);
		throw err;
	} finally {
		shadow.inFlightBoot = void 0;
	}
	if (teardownOldFirst) {
		watchReplica(newService, newOptions, shadow, controller.runState);
		logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): single-replica reload complete.`);
		return;
	}
	// Shadow-first path: the shadow has booted, so move traffic off the old
	// replica before stopping its containers.
	retireReplicaRegistrations(oldInstance, newOptions);
	await disconnectOldFromSharedNetwork(oldInstance).catch((err) => {
		logger.debug(`Rolling replica ${oldInstance.index}: shared-network disconnect of old (gen ${oldInstance.generation}) failed: ${err instanceof Error ? err.message : String(err)}. Proceeding with cleanup (the docker-rm step still tears it down).`);
	});
	oldInstance.shuttingDown = true;
	try {
		await cleanupEcsRun(oldInstance.state, { keepRunning: newOptions.taskOptions.keepRunning });
	} catch (err) {
		logger.warn(`Rolling replica ${oldInstance.index}: cleanup of old (gen ${oldInstance.generation}) failed: ${err instanceof Error ? err.message : String(err)}. The shadow is live; the stale container may need a manual \`docker rm\`.`);
	}
	dropReplicaFromRunState(controller.runState, oldInstance);
	watchReplica(newService, newOptions, shadow, controller.runState);
	logger.info(`Rolling replica ${shadow.index} (gen ${shadow.generation}): swap complete; old retired.`);
}
/**
 * Release a replica's discovery registrations: unregister every Cloud Map
 * handle (only when `options.discovery` is set; per-handle failures are
 * ignored) and drop its front-door entry.
 */
function retireReplicaRegistrations(instance, options) {
	if (options.discovery) {
		for (const handle of instance.cloudMapHandles) {
			try {
				options.discovery.registry.unregister(handle);
			} catch {
				// Best-effort: one failed unregister must not block the rest.
			}
		}
		instance.cloudMapHandles = [];
	}
	unregisterReplicaFromFrontDoor(instance, options.frontDoor);
}
/** Remove `instance` from `runState.replicas` when it is still listed there. */
function dropReplicaFromRunState(runState, instance) {
	const position = runState.replicas.indexOf(instance);
	if (position !== -1) runState.replicas.splice(position, 1);
}
/**
 * Phase 2 of issue #214 — disconnect every container of the dying
 * replica from its network BEFORE the caller runs `cleanupEcsRun`, so the
 * network membership is dropped ahead of the container stop / remove.
 *
 * Targets are the network's sidecar container (when present) followed by
 * every entry in `state.startedContainers`. Each disconnect is attempted
 * independently: a failure is logged at debug and the remaining targets
 * are still processed. The function itself never rejects because of a
 * failed disconnect.
 *
 * No-op when the replica has no network recorded or the network is not
 * flagged `ownedByCaller`.
 *
 * @param {object} oldInstance - Replica being retired; reads `state.network`, `state.startedContainers`, `index`, `generation`.
 * @returns {Promise<void>}
 */
async function disconnectOldFromSharedNetwork(oldInstance) {
	const network = oldInstance.state.network;
	if (!network || !network.ownedByCaller) return;
	const networkName = network.networkName;
	const targets = [];
	if (network.sidecarContainerId) targets.push(network.sidecarContainerId);
	for (const c of oldInstance.state.startedContainers) targets.push(c.id);
	for (const id of targets) {
		try {
			await dockerNetworkDisconnectImpl(networkName, id);
		} catch (err) {
			// Best-effort, but not silent: surface the reason so a stuck
			// disconnect can be diagnosed from debug logs.
			getLogger().child("ecs-service").debug(`Replica r${oldInstance.index} (gen ${oldInstance.generation}): network disconnect of ${id} from ${networkName} failed: ${err instanceof Error ? err.message : String(err)}.`);
		}
	}
}
/**
 * Production implementation of
 * `docker network disconnect --force <network> <id>`.
 *
 * Kept behind the reassignable `dockerNetworkDisconnectImpl` binding
 * declared right after it, so a replacement can be swapped in without
 * shelling out to docker.
 *
 * @param {string} networkName - Docker network to detach from.
 * @param {string} containerId - Container to detach.
 * @returns {Promise<void>} Rejects when the docker command fails.
 */
const defaultDockerNetworkDisconnectImpl = async (networkName, containerId) => {
	const childProcess = await import("node:child_process");
	const util = await import("node:util");
	const dockerCmdModule = await import("./docker-cmd-voNPrcRh.js");
	const { getDockerCmd } = dockerCmdModule.t;
	const execFileAsync = util.promisify(childProcess.execFile);
	const argv = ["network", "disconnect", "--force", networkName, containerId];
	await execFileAsync(getDockerCmd(), argv);
};
let dockerNetworkDisconnectImpl = defaultDockerNetworkDisconnectImpl;
/**
 * Phase 2 of issue #214 — shadow-replica TCP readiness probe used by
 * `rollServiceReplica` before it retires the old replica. Probes the
 * first port mapping of the essential container (falling back to the
 * task's first container) on the shadow's docker-network IP, retrying
 * until a connect succeeds or `opts.timeoutMs` elapses.
 *
 * Best-effort by design: a timeout logs a warning but does NOT throw, so
 * the caller proceeds with the swap either way. The probe is skipped
 * silently when there is nothing to probe (no network name, no port
 * mappings, container not started, or no IP resolved).
 *
 * The wait between attempts is clamped to the remaining budget so the
 * function never overshoots `timeoutMs` by a full `intervalMs`.
 *
 * The connect itself goes through the reassignable `tcpProbeImpl`
 * binding.
 *
 * @param {object} service - Resolved service; reads `task.containers`.
 * @param {object} shadow - Shadow replica; reads `state.network`, `state.startedContainers`, `index`, `generation`.
 * @param {{ timeoutMs: number, intervalMs: number }} opts - Probe budget and retry interval in milliseconds.
 * @returns {Promise<void>}
 */
async function waitForReplicaTcpReady(service, shadow, opts) {
	const logger = getLogger().child("ecs-service");
	const networkName = shadow.state.network?.networkName;
	if (!networkName) return;
	const essential = service.task.containers.find((c) => c.essential) ?? service.task.containers[0];
	if (!essential || essential.portMappings.length === 0) return;
	const started = shadow.state.startedContainers.find((c) => c.name === essential.name);
	if (!started) return;
	let ip;
	try {
		const resolved = await getContainerNetworkIp(started.id, networkName);
		if (!resolved) return;
		ip = resolved;
	} catch (err) {
		logger.warn(`Shadow replica r${shadow.index} (gen ${shadow.generation}): TCP-ready probe could not resolve docker IP: ${err instanceof Error ? err.message : String(err)}. Proceeding with swap.`);
		return;
	}
	const port = essential.portMappings[0].containerPort;
	const deadline = Date.now() + opts.timeoutMs;
	let lastErr;
	while (Date.now() < deadline) {
		try {
			await tcpProbeImpl(ip, port);
			logger.debug(`Shadow replica r${shadow.index} (gen ${shadow.generation}): TCP probe ${ip}:${port} accepted; proceeding with swap.`);
			return;
		} catch (err) {
			lastErr = err instanceof Error ? err.message : String(err);
		}
		// Never sleep past the deadline: wait the shorter of the retry
		// interval and whatever budget is left.
		const remainingMs = deadline - Date.now();
		if (remainingMs <= 0) break;
		await sleep(Math.min(opts.intervalMs, remainingMs));
	}
	logger.warn(`Shadow replica r${shadow.index} (gen ${shadow.generation}): TCP probe ${ip}:${port} did not accept within ${opts.timeoutMs}ms (last: ${lastErr ?? "n/a"}). Swapping anyway — the new image is the user intent. Initial requests after the swap may 502 until the app finishes binding.`);
}
/**
 * Default TCP-connect probe behind the reassignable `tcpProbeImpl`
 * binding declared right after it. Opens a socket to `host:port` and
 * resolves on `connect`; rejects on any socket error or when the connect
 * has not completed within `timeoutMs`. The socket is destroyed as soon
 * as the outcome is known — no bytes are sent and no connection is kept
 * open.
 *
 * The per-attempt timeout matters because the caller's retry loop can
 * only check its deadline between attempts: without it, a connect to an
 * address that silently drops packets would block on the operating
 * system's own connect timeout.
 *
 * @param {string} host - Address to connect to.
 * @param {number} port - TCP port to connect to.
 * @param {number} [timeoutMs=1000] - Upper bound for a single connect attempt, in milliseconds.
 * @returns {Promise<void>}
 */
const defaultTcpProbeImpl = async (host, port, timeoutMs = 1e3) => {
	const { createConnection } = await import("node:net");
	await new Promise((resolve, reject) => {
		const socket = createConnection({
			host,
			port
		});
		const fail = (err) => {
			socket.destroy();
			reject(err);
		};
		// The 'timeout' event does not close the socket by itself, so
		// destroy it explicitly via `fail`.
		socket.setTimeout(timeoutMs, () => {
			fail(new Error(`TCP connect to ${host}:${port} timed out after ${timeoutMs}ms`));
		});
		socket.once("connect", () => {
			socket.destroy();
			resolve();
		});
		socket.once("error", fail);
	});
};
let tcpProbeImpl = defaultTcpProbeImpl;
21530
+ /**
21267
21531
  * Long-running watcher loop for one replica. Polls the essential
21268
21532
  * container's exit code via `docker wait`; on exit, decides whether to
21269
21533
  * restart per `restartPolicy` + applies exponential backoff. The loop
@@ -23488,40 +23752,43 @@ async function runEcsServiceEmulator(targets, options, strategy, extraStateProvi
23488
23752
  }
23489
23753
  }
23490
23754
  /**
23491
- * Phase 1 of issue #214 — refuse a `--watch` run when the resolved service's
23492
- * effective replica count (`min(template DesiredCount, --max-tasks)`) is > 1.
23493
- * The Phase 1 reload pathway tears the single replica down before booting the
23494
- * new one; multi-replica services would therefore drop multiple connections at
23495
- * once and lose any in-memory state. Multi-replica rolling reload is Phase 2
23496
- * of issue #214. Exposed for the unit test that locks the gating logic
23497
- * (the integ test only covers the single-replica happy path).
23755
+ * Phase 2 of issue #214 — multi-replica rolling reload cycle for
23756
+ * `cdkl start-service --watch`. Mirrors start-api's `reloadAllServers`
23757
+ * shape but per-ECS-service, replacing Phase 1's "tear single replica
23758
+ * down, boot fresh" sequence with a per-replica rolling loop so the
23759
+ * service stays available end-to-end:
23498
23760
  *
23499
- * @internal
23500
- */
23501
- function assertSingleReplicaForWatch(service, options) {
23502
- if (options.watch !== true) return;
23503
- const effective = computeReplicaCount(service.desiredCount, options.maxTasks);
23504
- if (effective > 1) throw new LocalStartServiceError(`--watch is single-replica only in v1; service '${service.serviceName}' resolves to ${effective} replica(s) (template DesiredCount=${service.desiredCount}, --max-tasks=${options.maxTasks}). Lower --max-tasks to 1, drop the DesiredCount in your CDK code, or drop --watch. Multi-replica rolling reload is Phase 2 of issue #214.`);
23505
- }
23506
- /**
23507
- * Phase 1 of issue #214 — single-replica rebuild-on-change reload cycle.
23508
- * Mirrors start-api's `reloadAllServers` shape but per-ECS-service:
23509
- *
23510
- * 1. Re-runs `synthesizer.synthesize(synthOpts)` once (failure → warn +
23511
- * keep every replica serving).
23761
+ * 1. Re-runs `synthesizer.synthesize(synthOpts)` once (failure → warn
23762
+ * + keep every replica serving).
23512
23763
  * 2. Re-runs `strategy.resolveBoots(stacks, resolvedTargets)` so a
23513
- * target that disappears from the CDK code is detected (warn + keep
23514
- * previous).
23764
+ * target that disappears from the CDK code is detected (warn +
23765
+ * keep previous).
23515
23766
  * 3. Refreshes `cloudMapIndexByStack` from the new stacks so a peer
23516
23767
  * service's namespace / discovery-name rename is picked up by the
23517
- * next replica's Cloud Map publish.
23518
- * 4. Per-target: tear the existing controller down, boot a fresh one
23519
- * against the new stacks. A per-target boot failure logs warn and
23520
- * leaves that target dark until the next save with a clean boot.
23521
- *
23522
- * Phase 1 trade-off: each target's replica briefly stops (between old
23523
- * `controller.shutdown()` and new `bootOneTarget()` finishing). Phase 2 of
23524
- * #214 swaps that for a rolling deploy across multiple replicas.
23768
+ * next shadow replica's Cloud Map publish.
23769
+ * 4. Per-target:
23770
+ * a. Resolves the new (service, runnerOpts) pair against the new
23771
+ * stacks (cross-stack env / assume-task-role / `--env-vars`
23772
+ * all re-resolved fresh).
23773
+ * b. For each live (not shutting-down) existing replica:
23774
+ * {@link rollServiceReplica} boots a shadow replica with the
23775
+ * new image under a bumped generation suffix, atomically swaps
23776
+ * Cloud Map / front-door registrations, then stops + cleans up
23777
+ * the old replica. Sequential — only one replica is mid-swap
23778
+ * at a time, so peer services + the front-door pool always
23779
+ * have at least N-1 live endpoints during a roll.
23780
+ *
23781
+ * Phase 2 trade-off: when the effective replica count changes mid-roll
23782
+ * (the user bumped `DesiredCount` or `--max-tasks` flips a clamp),
23783
+ * the rolling pathway keeps the existing replicas on the new image
23784
+ * but does not scale up / down to match the new count. A warn surfaces
23785
+ * so the user can `^C` + re-launch to scale; a richer "scale + roll"
23786
+ * mode is left to a follow-up under #214.
23787
+ *
23788
+ * Per-replica boot failure: the shadow is torn down, the OLD replica
23789
+ * stays live (except on the single-replica path, which retires it
23790
+ * first), the remaining replicas are still rolled, and the failure
23791
+ * is logged so the user can fix the source + save again.
23525
23792
  */
23526
23793
  async function reloadAllServices(args) {
23527
23794
  const { perTarget, synthesizer, synthOpts, strategy, resolvedTargets, cloudMapIndexByStack, options, discovery, skipPull, extraStateProviders, profileCredsFile, frontDoorByService, logger } = args;
@@ -23544,29 +23811,82 @@ async function reloadAllServices(args) {
23544
23811
  for (const pt of perTarget) {
23545
23812
  const newBoot = newBootByTarget.get(pt.boot.target);
23546
23813
  if (!newBoot) {
23547
- logger.warn(`Reload: target '${pt.boot.target}' no longer resolves to a service in the synthesized app; keeping the previous replica serving.`);
23814
+ logger.warn(`Reload: target '${pt.boot.target}' no longer resolves to a service in the synthesized app; keeping the previous replica(s) serving.`);
23548
23815
  continue;
23549
23816
  }
23550
- const oldController = pt.controller;
23551
- try {
23552
- if (oldController) await oldController.shutdown();
23553
- } catch (err) {
23554
- logger.warn(`Reload: shutdown of previous '${pt.boot.target}' controller failed (${err instanceof Error ? err.message : String(err)}); attempting re-boot anyway.`);
23817
+ const controller = pt.controller;
23818
+ if (!controller) {
23819
+ logger.warn(`Reload: target '${pt.boot.target}' has no live controller (previous boot likely failed); skipping roll. \`^C\` and re-run start-service to recover.`);
23820
+ continue;
23555
23821
  }
23556
- const newRunState = createServiceRunState();
23557
- let newController;
23822
+ await rollOneTarget({
23823
+ controller,
23824
+ newBoot,
23825
+ stacks,
23826
+ options,
23827
+ discovery,
23828
+ skipPull,
23829
+ extraStateProviders,
23830
+ profileCredsFile,
23831
+ frontDoorPools: frontDoorByService.get(newBoot.target),
23832
+ suppressLoadBalancerWarning: strategy.suppressLoadBalancerWarning === true,
23833
+ logger
23834
+ });
23835
+ }
23836
+ logger.info("Reload complete.");
23837
+ }
/**
 * Phase 2 of issue #214 — roll every live replica of one target through
 * the freshly resolved task descriptor, one replica at a time.
 *
 * Steps:
 *   1. Create a state provider for the target's candidate stack; it is
 *      disposed in `finally`, even when the resolve or the roll throws.
 *   2. Resolve the new `(service, runnerOpts)` pair with `quiet: true`.
 *      A resolve failure is logged at error and the function returns,
 *      leaving the existing replicas untouched.
 *   3. Snapshot the replicas that are not shutting down. When there are
 *      none, warn and return.
 *   4. Warn when the EFFECTIVE replica count — `DesiredCount` clamped by
 *      `--max-tasks` via `computeReplicaCount` — differs from the number
 *      of live replicas. Only existing replicas are rolled; no scaling.
 *   5. Call `rollServiceReplica` for each snapshotted replica in order.
 *      A failure for one replica is logged at error and the loop moves
 *      on to the next replica.
 *
 * @param {object} args - `controller`, `newBoot`, `stacks`, `options`, `discovery`, `skipPull`,
 *   `extraStateProviders`, `profileCredsFile`, `frontDoorPools`, `suppressLoadBalancerWarning`, `logger`.
 * @returns {Promise<void>}
 */
async function rollOneTarget(args) {
	const { controller, newBoot, stacks, options, discovery, skipPull, extraStateProviders, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning, logger } = args;
	const candidate = pickCandidateStack(parseEcsTarget(newBoot.target).stackPattern, stacks);
	const stateProvider = createLocalStateProvider(options, candidate?.stackName ?? "", await resolveCfnFallbackRegion(options, candidate?.region), extraStateProviders);
	try {
		let resolved;
		try {
			resolved = await resolveServiceAndRunnerOpts(newBoot, stacks, options, discovery, skipPull, stateProvider, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning, { quiet: true });
		} catch (err) {
			const reason = err instanceof Error ? err.message : String(err);
			logger.error(`Reload of '${newBoot.target}' was rejected: ${reason}. Existing replica(s) keep serving.`);
			return;
		}
		const { service: newService, runnerOpts: newRunnerOpts } = resolved;
		const oldReplicas = controller.runState.replicas.filter((r) => !r.shuttingDown);
		if (oldReplicas.length === 0) {
			logger.warn(`Reload of '${newBoot.target}': no live replicas to roll (all shutting down). \`^C\` and re-run start-service to recover.`);
			return;
		}
		// Compare against the clamped count: with DesiredCount above
		// --max-tasks the raw DesiredCount never equals the live replica
		// count, which would raise this warning on every reload.
		const effectiveCount = computeReplicaCount(newService.desiredCount, newRunnerOpts.maxTasks);
		if (effectiveCount !== oldReplicas.length) logger.warn(`Reload of '${newBoot.target}': effective replica count ${effectiveCount} (DesiredCount=${newService.desiredCount}, --max-tasks=${newRunnerOpts.maxTasks}) does not match the ${oldReplicas.length} live replica(s); rolling existing replicas only — scale changes during --watch are not yet supported. \`^C\` and re-run start-service to apply the new replica count.`);
		logger.info(`Reload of '${newBoot.target}': rolling ${oldReplicas.length} replica(s) one at a time (start new shadow → swap registrations → stop old).`);
		for (const oldInstance of oldReplicas) {
			// Positions shift as earlier rolls splice the list, so look the
			// replica up again right before rolling it.
			const idx = controller.runState.replicas.indexOf(oldInstance);
			if (idx === -1) {
				logger.warn(`Reload of '${newBoot.target}': replica r${oldInstance.index} (gen ${oldInstance.generation}) vanished before its roll; skipping.`);
				continue;
			}
			try {
				await rollServiceReplica({
					controller,
					oldReplicaIndex: idx,
					newService,
					newOptions: newRunnerOpts
				});
			} catch (err) {
				const reason = err instanceof Error ? err.message : String(err);
				logger.error(`Reload of '${newBoot.target}' replica r${oldInstance.index}: ${reason}. The old replica keeps serving; remaining replicas will still be rolled.`);
			}
		}
	} finally {
		if (stateProvider) stateProvider.dispose();
	}
}
23571
23891
  async function bootOneTarget(boot, runState, stacks, options, discovery, skipPull, extraStateProviders, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning) {
23572
23892
  const candidate = pickCandidateStack(parseEcsTarget(boot.target).stackPattern, stacks);
@@ -23578,14 +23898,41 @@ async function bootOneTarget(boot, runState, stacks, options, discovery, skipPul
23578
23898
  }
23579
23899
  }
/**
 * Initial-boot entry point for one target: resolves the service and its
 * runner options via `resolveServiceAndRunnerOpts`, then hands both to
 * `startEcsService` together with the caller-supplied `runState`.
 *
 * @returns Whatever `startEcsService` returns for the resolved service.
 */
async function runOneTarget(boot, runState, stacks, options, discovery, skipPull, stateProvider, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning) {
	const resolved = await resolveServiceAndRunnerOpts(
		boot,
		stacks,
		options,
		discovery,
		skipPull,
		stateProvider,
		profileCredsFile,
		frontDoorPools,
		suppressLoadBalancerWarning
	);
	return startEcsService(resolved.service, resolved.runnerOpts, runState);
}
23904
+ /**
23905
+ * Resolve a {@link ServiceBoot} to its `(ResolvedEcsService, ServiceRunnerOptions)`
23906
+ * pair. Shared by the initial boot path (`runOneTarget`) and the
23907
+ * Phase 2 of issue #214 rolling-reload pathway (`reloadAllServices`).
23908
+ *
23909
+ * Walks the same steps the original `runOneTarget` body did:
23910
+ * 1. Build the per-target image-resolution context (resolves
23911
+ * asset / `Fn::Sub` / `--from-cfn-stack` overlays for image URIs).
23912
+ * 2. Resolve the ECS service target into a {@link ResolvedEcsService}.
23913
+ * 3. Apply the cross-stack env / secret resolver when the task
23914
+ * references `Fn::ImportValue` / `Fn::GetStackOutput` across
23915
+ * stacks.
23916
+ * 4. Resolve task-role credentials when `--assume-task-role` is set.
23917
+ * 5. Resolve `--env-vars` overrides.
23918
+ * 6. Compose {@link ServiceRunnerOptions} (including the shared
23919
+ * `discovery` + per-service `frontDoor` pools the rolling
23920
+ * reload depends on for atomic registry swaps).
23921
+ *
23922
+ * Side effects: logs the target descriptor + Service Connect /
23923
+ * ServiceRegistries banners ONLY on the initial boot. The reload
23924
+ * pathway calls this on every save; the banners would otherwise
23925
+ * spam the console once per save. Pass `quiet: true` to skip them.
23926
+ */
23927
+ async function resolveServiceAndRunnerOpts(boot, stacks, options, discovery, skipPull, stateProvider, profileCredsFile, frontDoorPools, suppressLoadBalancerWarning, opts = {}) {
23581
23928
  const logger = getLogger();
23582
23929
  const target = boot.target;
23930
+ const quiet = opts.quiet === true;
23583
23931
  const imageContext = await buildEcsImageResolutionContext(target, stacks, options, stateProvider);
23584
23932
  const service = resolveEcsServiceTarget(target, stacks, imageContext, { suppressLoadBalancerWarning });
23585
- logger.info(`Target: ${service.stack.stackName}/${service.serviceLogicalId} (service=${service.serviceName}, desiredCount=${service.desiredCount}, task=${service.task.taskDefinitionLogicalId})`);
23586
- assertSingleReplicaForWatch(service, options);
23587
- if (service.serviceConnect) logger.info(`Service Connect: namespace='${service.serviceConnect.namespaceName}', ${service.serviceConnect.services.length} service(s) registered for peer discovery.`);
23588
- if (service.serviceRegistries.length > 0) logger.info(`Cloud Map: ${service.serviceRegistries.length} ServiceRegistry binding(s).`);
23933
+ if (!quiet) logger.info(`Target: ${service.stack.stackName}/${service.serviceLogicalId} (service=${service.serviceName}, desiredCount=${service.desiredCount}, task=${service.task.taskDefinitionLogicalId})`);
23934
+ if (!quiet && service.serviceConnect) logger.info(`Service Connect: namespace='${service.serviceConnect.namespaceName}', ${service.serviceConnect.services.length} service(s) registered for peer discovery.`);
23935
+ if (!quiet && service.serviceRegistries.length > 0) logger.info(`Cloud Map: ${service.serviceRegistries.length} ServiceRegistry binding(s).`);
23589
23936
  const taskNeeds = detectEcsImageResolutionNeeds(stacks.find((s) => s.stackName === service.stack.stackName) ?? service.stack);
23590
23937
  if (stateProvider && taskNeeds.needsCrossStackResolver) {
23591
23938
  const consumerRegion = options.region ?? process.env["AWS_REGION"] ?? process.env["AWS_DEFAULT_REGION"] ?? service.stack.region ?? "us-east-1";
@@ -23634,13 +23981,16 @@ async function runOneTarget(boot, runState, stacks, options, discovery, skipPull
23634
23981
  containerPath: profileCredsFile.containerPath,
23635
23982
  profileName: profileCredsFile.profileName
23636
23983
  };
23637
- return startEcsService(service, {
23638
- maxTasks: options.maxTasks,
23639
- restartPolicy: options.restartPolicy,
23640
- taskOptions: taskOpts,
23641
- discovery,
23642
- ...frontDoorPools && frontDoorPools.length > 0 ? { frontDoor: { pools: frontDoorPools } } : {}
23643
- }, runState);
23984
+ return {
23985
+ service,
23986
+ runnerOpts: {
23987
+ maxTasks: options.maxTasks,
23988
+ restartPolicy: options.restartPolicy,
23989
+ taskOptions: taskOpts,
23990
+ discovery,
23991
+ ...frontDoorPools && frontDoorPools.length > 0 ? { frontDoor: { pools: frontDoorPools } } : {}
23992
+ }
23993
+ };
23644
23994
  }
23645
23995
  /**
23646
23996
  * Stand up one host-side reverse-proxy server PER LISTENER PORT from the
@@ -24048,9 +24398,11 @@ function addCommonEcsServiceOptions(cmd) {
24048
24398
  * a leaf compute runner, symmetric with `invoke` / `run-task`.
24049
24399
  *
24050
24400
  * `supportsWatch: true` opts this strategy into the emulator's `--watch`
24051
- * reload pathway (Phase 1 of issue #214 — single-replica rebuild-on-change).
24052
- * `start-alb`'s strategy intentionally does NOT set this so a `--watch` flag
24053
- * never leaks into the ALB-front-door path (Phase 3).
24401
+ * reload pathway (Phase 1 + Phase 2 of issue #214 — per-replica rolling
24402
+ * deploy: shadow boot under a bumped generation suffix, TCP-ready probe,
24403
+ * atomic Cloud Map / front-door swap, retire old). `start-alb`'s strategy
24404
+ * intentionally does NOT set this so a `--watch` flag never leaks into
24405
+ * the ALB-front-door path (Phase 3).
24054
24406
  */
24055
24407
  function serviceStrategy() {
24056
24408
  return {
@@ -24103,7 +24455,7 @@ function createLocalStartServiceCommand(opts = {}) {
24103
24455
  * advertise a flag one of its consumers does not honor.
24104
24456
  */
24105
24457
  function addStartServiceSpecificOptions(cmd) {
24106
- return cmd.addOption(new Option("--host-port <containerPort=hostPort...>", "Publish a container port on a specific host port (e.g. 80=8080); repeatable. Default: host port == container port. Use this on macOS to map a privileged container port (< 1024) to a non-privileged host port and avoid the Docker Desktop admin-password prompt. (Single-replica services only — multi-replica services do not publish host ports.)")).addOption(new Option("--watch", "Hot-reload: re-synth + re-resolve every booted service and replace its single replica when the CDK app's source changes (honors cdk.json watch.include/exclude; cdk.out, node_modules, .git are always excluded). Single-replica services only in v1 — a service with effective replica count > 1 errors out (multi-replica rolling deploy is Phase 2 of issue #214). Off by default; the previous replica keeps serving when synth fails mid-reload.").default(false));
24458
+ return cmd.addOption(new Option("--host-port <containerPort=hostPort...>", "Publish a container port on a specific host port (e.g. 80=8080); repeatable. Default: host port == container port. Use this on macOS to map a privileged container port (< 1024) to a non-privileged host port and avoid the Docker Desktop admin-password prompt. (Single-replica services only — multi-replica services do not publish host ports.)")).addOption(new Option("--watch", "Hot-reload: re-synth + per-replica rolling deploy when the CDK source changes (honors cdk.json watch.include/exclude; cdk.out, node_modules, .git are always excluded). Each replica is rolled one at a time: boot a shadow under a bumped generation suffix, wait for its container port to accept a TCP connection, atomically swap Service-Connect / Cloud Map registrations, then retire the old container — so peer services see zero connection refusals across the reload even on multi-replica services. Off by default; existing replica(s) keep serving when synth fails mid-reload.").default(false));
24107
24459
  }
24108
24460
 
24109
24461
  //#endregion
@@ -25052,4 +25404,4 @@ function addListSpecificOptions(cmd) {
25052
25404
 
25053
25405
  //#endregion
25054
25406
  export { buildHttpApiV2Event as $, resolveRuntimeFileExtension as $t, createWatchPredicates as A, AGENTCORE_HTTP_PROTOCOL as An, A2A_PATH as At, readMtlsMaterialsFromDisk as B, LocalInvokeBuildError as Bn, invokeAgentCore as Bt, getContainerNetworkIp as C, discoverWebSocketApisOrThrow as Cn, applyCorsResponseHeaders as Ct, createLocalInvokeAgentCoreCommand as D, resolveLambdaArnIntrinsic as Dn, matchPreflight as Dt, addInvokeAgentCoreSpecificOptions as E, pickRefLogicalId as En, isFunctionUrlOacFronted as Et, buildStageMap as F, resolveAgentCoreTarget as Fn, mcpInvokeOnce as Ft, buildMethodArn as G, computeCodeImageTag as Gt, resolveSelectionExpression as H, downloadAndExtractS3Bundle as Ht, availableApiIdentifiers as I, derivePseudoParametersFromRegion as In, parseSseForJsonRpc as It, invokeRequestAuthorizer as J, addInvokeSpecificOptions as Jt, computeRequestIdentityHash as K, renderCodeDockerfile as Kt, filterRoutesByApiIdentifier as L, formatStateRemedy as Ln, AGENTCORE_SIGV4_SERVICE as Lt, createAuthorizerCache as M, AGENTCORE_RUNTIME_TYPE as Mn, MCP_CONTAINER_PORT as Mt, createFileWatcher as N, AgentCoreResolutionError as Nn, MCP_PATH as Nt, addStartApiSpecificOptions as O, AGENTCORE_A2A_PROTOCOL as On, invokeAgentCoreWs as Ot, attachStageContext as P, pickAgentCoreCandidateStack as Pn, MCP_PROTOCOL_VERSION as Pt, applyAuthorizerOverlay as Q, resolveRuntimeCodeMountPath as Qt, filterRoutesByApiIdentifiers as R, substituteImagePlaceholders as Rn, signAgentCoreInvocation as Rt, CloudMapRegistry as S, discoverWebSocketApis as Sn, attachAuthorizers as St, createLocalRunTaskCommand as T, discoverRoutes as Tn, buildCorsConfigFromCloudFrontChain as Tt, resolveServiceIntegrationParameters as U, SUPPORTED_CODE_RUNTIMES as Ut, startApiServer as V, waitForAgentCorePing as Vt, defaultCredentialsLoader as W, buildAgentCoreCodeImage as Wt, matchRoute as X, architectureToPlatform as Xt, invokeTokenAuthorizer as Y, createLocalInvokeCommand as Yt, translateLambdaResponse as 
Z, buildContainerImage as Zt, parseMaxTasks as _, resolveSsmParameters as _n, buildJwksUrlFromIssuer as _t, albStrategy as a, substituteEnvVarsFromStateAsync as an, VtlEvaluationError as at, runEcsServiceEmulator as b, countTargets as bn, verifyJwtAuthorizer as bt, resolveAlbTarget as c, LocalStateSourceError as cn, bufferToBody as ct, addStartServiceSpecificOptions as d, rejectExplicitCfnStackWithMultipleStacks as dn, handleConnectionsRequest as dt, resolveRuntimeImage as en, buildRestV1Event as et, createLocalStartServiceCommand as f, resolveCfnFallbackRegion as fn, parseConnectionsPath as ft, buildEcsImageResolutionContext as g, collectSsmParameterRefs as gn, buildCognitoJwksUrl as gt, addCommonEcsServiceOptions as h, CfnLocalStateProvider as hn, buildMessageEvent as ht, addAlbSpecificOptions as i, substituteEnvVarsFromState as in, tryParseStatus as it, resolveApiTargetSubset as j, AGENTCORE_MCP_PROTOCOL as jn, a2aInvokeOnce as jt, createLocalStartApiCommand as k, AGENTCORE_AGUI_PROTOCOL as kn, A2A_CONTAINER_PORT as kt, isApplicationLoadBalancer as l, createLocalStateProvider as ln, ConnectionRegistry as lt, MAX_TASKS_SUBNET_RANGE_CAP as m, resolveCfnStackName as mn, buildDisconnectEvent as mt, createLocalListCommand as n, substituteAgainstState as nn, pickResponseTemplate as nt, createLocalStartAlbCommand as o, resolveEnvVars as on, HOST_GATEWAY_MIN_VERSION as ot, serviceStrategy as p, resolveCfnRegion as pn, buildConnectEvent as pt, evaluateCachedLambdaPolicy as q, toCmdArgv as qt, formatTargetListing as r, substituteAgainstStateAsync as rn, selectIntegrationResponse as rt, parseLbPortOverrides as s, materializeLayerFromArn as sn, probeHostGatewaySupport as st, addListSpecificOptions as t, EcsTaskResolutionError as tn, evaluateResponseParameters as tt, resolveAlbFrontDoor as u, isCfnFlagPresent as un, buildMgmtEndpointEnvUrl as ut, parseRestartPolicy as v, resolveWatchConfig as vn, createJwksCache as vt, addRunTaskSpecificOptions as w, 
parseSelectionExpressionPath as wn, buildCorsConfigByApiId as wt, buildCloudMapIndex as x, listTargets as xn, verifyJwtViaDiscovery as xt, resolveSharedSidecarCredentials as y, resolveSingleTarget as yn, verifyCognitoJwt as yt, groupRoutesByServer as z, tryResolveImageFnJoin as zn, AGENTCORE_SESSION_ID_HEADER as zt };
25055
- //# sourceMappingURL=local-list-CyW86HDN.js.map
25407
+ //# sourceMappingURL=local-list-faPgnDlc.js.map