@kody-ade/kody-engine 0.4.115 → 0.4.116

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/bin/kody.js +75 -33
  2. package/package.json +1 -1
package/dist/bin/kody.js CHANGED
@@ -880,7 +880,7 @@ var init_loadPriorArt = __esm({
880
880
  // package.json
881
881
  var package_default = {
882
882
  name: "@kody-ade/kody-engine",
883
- version: "0.4.115",
883
+ version: "0.4.116",
884
884
  description: "kody \u2014 autonomous development engine. Single-session Claude Code agent behind a generic executor + declarative executable profiles.",
885
885
  license: "MIT",
886
886
  type: "module",
@@ -4884,6 +4884,7 @@ function sleep2(ms) {
4884
4884
  }
4885
4885
 
4886
4886
  // src/pool/manager.ts
4887
+ var MAX_CLAIM_ATTEMPTS = 3;
4887
4888
  var PoolManager = class {
4888
4889
  constructor(deps) {
4889
4890
  this.deps = deps;
@@ -4923,43 +4924,84 @@ var PoolManager = class {
4923
4924
  await this.refill();
4924
4925
  }
4925
4926
  /**
4926
- * Claim a warm machine for a job. Returns ok:false (caller falls back to
4927
- * create-fresh) when the pool is empty or the woken machine fails to take
4928
- * the job. The pick is synchronous the atomic step.
4927
+ * Claim a warm machine for a job. Tries free machines in turn: if a woken
4928
+ * machine is stale/unhealthy/rejecting (e.g. it vanished out-of-band), it's
4929
+ * destroyed and the next free one is tried, up to MAX_CLAIM_ATTEMPTS. Only
4930
+ * when none work (or the pool is empty) does it return ok:false so the
4931
+ * caller falls back to create-fresh. The pick (shift) is synchronous — the
4932
+ * atomic step that prevents two concurrent claims grabbing the same machine.
4929
4933
  */
4930
4934
  async claim(job) {
4931
- const machine = this.free.shift();
4932
- if (!machine) {
4933
- this.log("claim: pool empty");
4934
- void this.refill();
4935
- return { ok: false, reason: "pool empty" };
4936
- }
4937
- this.claimsInFlight++;
4938
- try {
4939
- await this.deps.fly.start(machine.id);
4940
- const base = this.baseUrl(machine);
4941
- const healthy = await this.deps.fly.waitHealthy(base, { timeoutMs: this.deps.config.healthTimeoutMs });
4942
- if (!healthy) {
4943
- this.log(`claim: machine ${machine.id} unhealthy after wake \u2014 destroying`);
4944
- await this.safeDestroy(machine.id);
4945
- return { ok: false, reason: "woken machine unhealthy" };
4946
- }
4947
- const accepted = await this.postRun(machine, job, this.deps.config);
4948
- if (!accepted) {
4949
- this.log(`claim: machine ${machine.id} rejected job \u2014 destroying`);
4935
+ let lastReason = "pool empty";
4936
+ for (let attempt = 0; attempt < MAX_CLAIM_ATTEMPTS; attempt++) {
4937
+ const machine = this.free.shift();
4938
+ if (!machine) break;
4939
+ this.claimsInFlight++;
4940
+ try {
4941
+ await this.deps.fly.start(machine.id);
4942
+ const healthy = await this.deps.fly.waitHealthy(this.baseUrl(machine), {
4943
+ timeoutMs: this.deps.config.healthTimeoutMs
4944
+ });
4945
+ if (!healthy) {
4946
+ this.log(`claim: machine ${machine.id} unhealthy after wake \u2014 destroying, trying next`);
4947
+ await this.safeDestroy(machine.id);
4948
+ lastReason = "woken machine unhealthy";
4949
+ continue;
4950
+ }
4951
+ const accepted = await this.postRun(machine, job, this.deps.config);
4952
+ if (!accepted) {
4953
+ this.log(`claim: machine ${machine.id} rejected job \u2014 destroying, trying next`);
4954
+ await this.safeDestroy(machine.id);
4955
+ lastReason = "machine rejected job";
4956
+ continue;
4957
+ }
4958
+ this.log(`claim: machine ${machine.id} took job ${job.jobId}`);
4959
+ void this.refill();
4960
+ return { ok: true, machineId: machine.id };
4961
+ } catch (err) {
4962
+ this.log(`claim: error on ${machine.id}: ${errMsg2(err)} \u2014 destroying, trying next`);
4950
4963
  await this.safeDestroy(machine.id);
4951
- return { ok: false, reason: "machine rejected job" };
4964
+ lastReason = errMsg2(err);
4965
+ } finally {
4966
+ this.claimsInFlight--;
4952
4967
  }
4953
- this.log(`claim: machine ${machine.id} took job ${job.jobId}`);
4954
- return { ok: true, machineId: machine.id };
4968
+ }
4969
+ void this.refill();
4970
+ return { ok: false, reason: lastReason };
4971
+ }
4972
+ /**
4973
+ * Periodic self-heal: reconcile the in-memory free list against actual Fly
4974
+ * state. Prunes free entries whose machine vanished out-of-band (auto-destroy
4975
+ * after a job, manual ops) so a later claim never tries a dead machine, and
4976
+ * adopts any suspended machines we lost track of. Then tops up. Unlike
4977
+ * reconcile() this MERGES rather than rebuilds, so it won't drop a machine
4978
+ * that's momentarily not yet reflected as suspended by Fly's eventual
4979
+ * consistency.
4980
+ */
4981
+ async resync() {
4982
+ let machines;
4983
+ try {
4984
+ machines = await this.deps.fly.listPooled();
4955
4985
  } catch (err) {
4956
- this.log(`claim: error on ${machine.id}: ${errMsg2(err)} \u2014 destroying`);
4957
- await this.safeDestroy(machine.id);
4958
- return { ok: false, reason: errMsg2(err) };
4959
- } finally {
4960
- this.claimsInFlight--;
4961
- void this.refill();
4986
+ this.log(`resync: listPooled failed: ${errMsg2(err)}`);
4987
+ return;
4962
4988
  }
4989
+ const liveIds = new Set(machines.map((m) => m.id));
4990
+ const before = this.free.length;
4991
+ this.free = this.free.filter((f) => liveIds.has(f.id));
4992
+ const pruned = before - this.free.length;
4993
+ const tracked = new Set(this.free.map((f) => f.id));
4994
+ let adopted = 0;
4995
+ for (const m of machines) {
4996
+ if ((m.state === "suspended" || m.state === "suspending") && m.private_ip && !tracked.has(m.id)) {
4997
+ this.free.push({ id: m.id, privateIp: m.private_ip });
4998
+ adopted++;
4999
+ }
5000
+ }
5001
+ if (pruned > 0 || adopted > 0) {
5002
+ this.log(`resync: pruned ${pruned} stale, adopted ${adopted} (free=${this.free.length})`);
5003
+ }
5004
+ await this.refill();
4963
5005
  }
4964
5006
  /** Top up free machines to `min`. Serialized so it never overshoots. */
4965
5007
  async refill() {
@@ -5180,7 +5222,7 @@ var poolServe = async (ctx) => {
5180
5222
  manager.reconcile().catch((err) => log(`reconcile failed: ${err instanceof Error ? err.message : String(err)}`));
5181
5223
  const refillMs = envInt("POOL_REFILL_INTERVAL_MS", 6e4);
5182
5224
  const tick = setInterval(() => {
5183
- manager.refill().catch((err) => log(`refill tick failed: ${err instanceof Error ? err.message : String(err)}`));
5225
+ manager.resync().catch((err) => log(`resync tick failed: ${err instanceof Error ? err.message : String(err)}`));
5184
5226
  }, refillMs);
5185
5227
  const server = createServer3(async (req, res) => {
5186
5228
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kody-ade/kody-engine",
3
- "version": "0.4.115",
3
+ "version": "0.4.116",
4
4
  "description": "kody — autonomous development engine. Single-session Claude Code agent behind a generic executor + declarative executable profiles.",
5
5
  "license": "MIT",
6
6
  "type": "module",