@kognai/orchestrator-core 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -79,6 +79,13 @@ const MAX_HOURS = 6; // kill orchestrator if it runs longer than this
79
79
  // Rate limiter: minimum gap between sprint executions (prevents burning Claude 5h limit)
80
80
  // Default: 30 min. Override via SPRINT_COOLDOWN_MINUTES env var.
81
81
  const COOLDOWN_MINUTES = parseInt(process.env.SPRINT_COOLDOWN_MINUTES ?? '30', 10);
82
// TICKET-348 sprint-level backoff: a sprint whose run makes NO forward progress
// (no pending task reached a terminal/done state) this many times IN A ROW is
// auto-skipped as 'loop-stuck', so a permanently-failing sprint (e.g. one whose
// files keep truncating) stops monopolising the runner and the selector advances
// to other queued work. State lives in .swarm-state/sprint-backoff.json.
const SPRINT_BACKOFF = (0, path_1.join)(ROOT, '.swarm-state', 'sprint-backoff.json');
// Override via SPRINT_BACKOFF_THRESHOLD env var. Default: 2.
// A non-numeric or empty override parses to NaN, and every `>=` comparison
// against NaN is false — that would silently switch backoff off. Fall back to
// the default in that case, and clamp to 1 because "0 times in a row" (or a
// negative count) behaves exactly like 1 but produces misleading log output.
const SPRINT_BACKOFF_THRESHOLD = (() => {
    const requested = Number.parseInt(process.env.SPRINT_BACKOFF_THRESHOLD ?? '2', 10);
    if (Number.isNaN(requested)) {
        return 2;
    }
    return Math.max(1, requested);
})();
82
89
  // Daily cap: max sprints per calendar day. Default: 100.
83
90
  const DAILY_SPRINT_CAP = parseInt(process.env.DAILY_SPRINT_CAP ?? '100', 10);
84
91
  // Rolling window cap: max sprints within the last N hours. Default: 20 per 5h.
@@ -354,6 +361,63 @@ function extractSprintNumber(filename) {
354
361
  // (Multi-session safety: prevents another Claude session reverting a local file
355
362
  // from causing the runner to re-execute paused/done work.)
356
363
  const NOTION_OVERRIDE_STATUSES = new Set(['skipped', 'blocked', 'done', 'done-manual', 'loop-stuck', 'rejected']);
364
/**
 * Load the sprint-backoff ledger stored at SPRINT_BACKOFF.
 *
 * Best-effort: a missing, unreadable or malformed file yields an empty ledger
 * instead of an error.
 *
 * @returns {Object} Plain object that callers index by sprint id; `{}` when
 *   nothing usable is on disk.
 */
function readBackoff() {
    try {
        const parsed = JSON.parse((0, fs_1.readFileSync)(SPRINT_BACKOFF, 'utf8'));
        // Well-formed JSON is not necessarily an object: `null`, arrays and
        // primitives all parse successfully. Callers index the result by sprint
        // id, so anything that is not a plain object is treated as "no state"
        // (indexing `null` would throw a TypeError).
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
            return parsed;
        }
        return {};
    }
    catch {
        return {};
    }
}
372
/**
 * Persist the sprint-backoff ledger to SPRINT_BACKOFF.
 *
 * The JSON is written to a pid-suffixed temp file first and then renamed over
 * the target, so a reader never sees a half-written file. Backoff bookkeeping
 * is best-effort: every failure is logged and swallowed so it can never block
 * a run.
 *
 * @param {Object} data Ledger to serialise (pretty-printed with 2-space indent).
 * @returns {void}
 */
function writeBackoff(data) {
    const tmp = `${SPRINT_BACKOFF}.tmp.${process.pid}`;
    try {
        const dir = (0, path_1.join)(ROOT, '.swarm-state');
        if (!(0, fs_1.existsSync)(dir))
            (0, fs_1.mkdirSync)(dir, { recursive: true });
        (0, fs_1.writeFileSync)(tmp, JSON.stringify(data, null, 2));
        (0, fs_1.renameSync)(tmp, SPRINT_BACKOFF);
    }
    catch (err) {
        // If the rename (or a partial write) failed, the temp file may still be
        // on disk; remove it so failed attempts do not pile up in .swarm-state.
        try {
            (0, fs_1.unlinkSync)(tmp);
        }
        catch { /* temp file was never created or is already gone */ }
        // Still best-effort — but leave a trace, otherwise a sprint that can
        // never be backed off fails silently.
        try {
            log(`Backoff: could not persist ${SPRINT_BACKOFF}: ${err?.message ?? err}`);
        }
        catch { /* logging must not block a run either */ }
    }
}
383
/**
 * Number of tasks whose status is 'pending' in a sprint's MERGED view
 * (source + .swarm-state status), as returned by loadSprintMerged.
 *
 * @param sprintPath Path handed straight to loadSprintMerged.
 * @returns {number} Pending-task count; 0 when the sprint has no task list or
 *   when loading/counting throws.
 */
function countPendingTasks(sprintPath) {
    try {
        const { tasks } = (0, sprint_state_1.loadSprintMerged)(sprintPath);
        const taskList = tasks ?? [];
        return taskList.reduce((total, task) => (task.status === 'pending' ? total + 1 : total), 0);
    }
    catch {
        return 0;
    }
}
392
/**
 * True once a sprint has hit the no-progress threshold — the selector skips it.
 *
 * @param sprintId Key of the sprint in the backoff ledger.
 * @returns {boolean} true when the sprint's entry is flagged `loop_stuck` or its
 *   `no_progress` counter has reached SPRINT_BACKOFF_THRESHOLD; false when
 *   there is no entry (or no usable ledger).
 */
function isBackedOff(sprintId) {
    // `?.` tolerates a ledger read that yields null/undefined (e.g. a state
    // file containing just `null`); indexing it directly would throw.
    const entry = readBackoff()?.[sprintId];
    if (!entry) {
        return false;
    }
    return entry.loop_stuck === true || entry.no_progress >= SPRINT_BACKOFF_THRESHOLD;
}
397
/**
 * Post-run backoff bookkeeping for one sprint.
 *
 * Progress means the pending count dropped (pendingAfter < pendingBefore):
 * the sprint's entry, if any, is removed from the ledger. Otherwise the
 * consecutive no-progress counter is incremented and, once it reaches
 * SPRINT_BACKOFF_THRESHOLD, the entry is flagged `loop_stuck` so the selector
 * skips the sprint.
 *
 * @param sprintId Key of the sprint in the backoff ledger.
 * @param {number} pendingBefore Pending-task count snapshotted before the run.
 * @param {number} pendingAfter Pending-task count after the run.
 * @returns {void}
 */
function recordSprintProgress(sprintId, pendingBefore, pendingAfter) {
    const loaded = readBackoff();
    // Tolerate a ledger that is not a plain object; indexing `null` would
    // throw and abort the post-run path.
    const data = loaded !== null && typeof loaded === 'object' && !Array.isArray(loaded) ? loaded : {};
    if (pendingAfter < pendingBefore) {
        // Forward progress: clear the streak (only touch disk if there was one).
        if (data[sprintId]) {
            delete data[sprintId];
            writeBackoff(data);
        }
        return;
    }
    // A hand-edited or corrupted counter (e.g. the string "1") would turn
    // `prev + 1` into string concatenation; restart the streak at 0 instead.
    const recorded = data[sprintId]?.no_progress;
    const prev = Number.isInteger(recorded) && recorded > 0 ? recorded : 0;
    const next = prev + 1;
    const loop_stuck = next >= SPRINT_BACKOFF_THRESHOLD;
    data[sprintId] = { no_progress: next, last: new Date().toISOString(), loop_stuck };
    writeBackoff(data);
    if (loop_stuck) {
        log(`⛔ Backoff: ${sprintId} made no progress ${next}× in a row — marked loop-stuck (auto-skipped). Clear .swarm-state/sprint-backoff.json or fix the sprint to re-enable.`);
    }
    else {
        log(`Backoff: ${sprintId} no progress this run (${next}/${SPRINT_BACKOFF_THRESHOLD} before auto-skip).`);
    }
}
357
421
  async function findPendingSprint() {
358
422
  if (!(0, fs_1.existsSync)(SPRINTS))
359
423
  return null;
@@ -404,6 +468,12 @@ async function findPendingSprint() {
404
468
  log(`Skipped ${file}: Notion source-of-truth says '${notionStatus}' (overrides local pending)`);
405
469
  continue;
406
470
  }
471
+ // TICKET-348: local backoff — skip a sprint that has made no forward
472
+ // progress N runs in a row (loop-stuck), so it can't monopolise the runner.
473
+ if (isBackedOff(sprintId)) {
474
+ log(`Skipped ${file}: backoff — no forward progress ${SPRINT_BACKOFF_THRESHOLD}× in a row (loop-stuck, auto-skipped)`);
475
+ continue;
476
+ }
407
477
  // Dependency check: respect depends_on_sprint — if the upstream sprint
408
478
  // has any non-terminal task, skip this one. Founder directive 2026-05-26:
409
479
  // swarm must never deadlock on a sprint whose prereqs haven't shipped.
@@ -828,6 +898,10 @@ async function runSprintCycle(opts) {
828
898
  return;
829
899
  }
830
900
  log(`Found pending sprint: ${sprintPath}`);
901
+ // TICKET-348: snapshot pending count before the run so we can detect whether
902
+ // this run made any forward progress (and apply backoff if it didn't).
903
+ const backoffSprintId = (0, path_1.basename)(sprintPath).replace(/\.json$/, '');
904
+ const pendingBefore = countPendingTasks(sprintPath);
831
905
  // TICKET-210: build the ACTIVE sprint from the MERGED view (source definition
832
906
  // + .swarm-state status), NOT raw source. The source file holds every task at
833
907
  // its authored status (usually 'pending'); reading it directly meant a sprint
@@ -886,15 +960,52 @@ async function runSprintCycle(opts) {
886
960
  // overlapping orphans can't accumulate even if cron ever gets re-armed
887
961
  const orchestratorTimeoutMs = PER_RUN_HARD_TIMEOUT_MIN * 60 * 1000;
888
962
  log(`Spawning orchestrator with ${PER_RUN_HARD_TIMEOUT_MIN}-min hard timeout`);
889
- const result = (0, child_process_1.spawnSync)('npx', ['ts-node', orchestratorPath, activePath], {
890
- stdio: 'inherit',
891
- cwd: ROOT,
892
- env: { ...process.env },
893
- timeout: orchestratorTimeoutMs,
894
- killSignal: 'SIGKILL', // SIGTERM ignored when blocked on subprocess I/O
963
+ // TICKET-347: spawnSync's `timeout` only SIGKILLs the DIRECT child (`npx`).
964
+ // Its `ts-node` child + the orchestrator grandchildren (and everything THEY
965
+ // spawn) get orphaned and keep running — reparented to init. A live incident
966
+ // (2026-06-13) had a "timed-out" run keep executing for ~2h after the runner
967
+ // declared it failed, burning ~1.3M tokens + attempting wallet settlements.
968
+ // Fix: spawn DETACHED so the child leads its own process group, then SIGKILL
969
+ // the WHOLE group (negative pid) on timeout — taking the entire subtree down.
970
+ const result = await new Promise((resolveRun) => {
971
+ const child = (0, child_process_1.spawn)('npx', ['ts-node', orchestratorPath, activePath], {
972
+ stdio: 'inherit',
973
+ cwd: ROOT,
974
+ env: { ...process.env },
975
+ detached: true, // new process group (setsid) → killable as a unit
976
+ });
977
+ let timedOut = false;
978
+ let settled = false;
979
+ const finish = (status) => {
980
+ if (settled)
981
+ return;
982
+ settled = true;
983
+ clearTimeout(timer);
984
+ resolveRun({ status, timedOut });
985
+ };
986
+ const timer = setTimeout(() => {
987
+ timedOut = true;
988
+ // Negative pid = signal the entire process group (the detached subtree).
989
+ try {
990
+ if (child.pid)
991
+ process.kill(-child.pid, 'SIGKILL');
992
+ }
993
+ catch { /* group already gone */ }
994
+ // Belt-and-braces: also target the direct child in case the group call missed.
995
+ try {
996
+ child.kill('SIGKILL');
997
+ }
998
+ catch { /* already dead */ }
999
+ }, orchestratorTimeoutMs);
1000
+ child.on('error', (err) => { log(`Orchestrator spawn error: ${err.message}`); finish(1); });
1001
+ child.on('exit', (code) => finish(code));
895
1002
  });
896
1003
  const elapsed = Math.round((Date.now() - start) / 60000);
897
- const status = result.status === 0 ? '✅ Completed' : `❌ Failed (exit ${result.status})`;
1004
+ const status = result.status === 0
1005
+ ? '✅ Completed'
1006
+ : result.timedOut
1007
+ ? `⏱️ Killed — ${PER_RUN_HARD_TIMEOUT_MIN}-min hard timeout (process group SIGKILLed)`
1008
+ : `❌ Failed (exit ${result.status})`;
898
1009
  log(`Orchestrator finished: ${status} (${elapsed} min)`);
899
1010
  // Founder directive 2026-05-26: on non-zero exit, write an incident record
900
1011
  // and emit an event for CTO + CEO to investigate autonomously. The swarm
@@ -1030,6 +1141,10 @@ async function runSprintCycle(opts) {
1030
1141
  catch {
1031
1142
  // non-fatal
1032
1143
  }
1144
+ // TICKET-348: update sprint backoff. State is fully synced by now (ACTIVE→
1145
+ // .swarm-state + forensic git-log recovery), so countPendingTasks reflects the
1146
+ // post-run truth. No drop in pending = no progress = step toward loop-stuck.
1147
+ recordSprintProgress(backoffSprintId, pendingBefore, countPendingTasks(sprintPath));
1033
1148
  // TICKET-201: post-sprint hook (e.g. dispatch-approved-proposals).
1034
1149
  // Supplied by the product entry so core stays product-agnostic.
1035
1150
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kognai/orchestrator-core",
3
- "version": "0.2.4",
3
+ "version": "0.2.6",
4
4
  "description": "Kognai sovereign orchestrator — core engine (template-agnostic). Shared by all products (Kognai/coding, Voxight/market-intel, Invoica/fin-compliance); each supplies only its template. Replaces per-repo forks of orchestrate-agents-v2 / sprint-runner / lib.",
5
5
  "license": "MIT",
6
6
  "author": "SkinGem",