@exaudeus/workrail 3.44.0 → 3.45.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/console-ui/assets/{index-Bi38ITiQ.js → index-BpanIvmi.js} +1 -1
- package/dist/console-ui/index.html +1 -1
- package/dist/daemon/workflow-runner.d.ts +12 -2
- package/dist/daemon/workflow-runner.js +96 -13
- package/dist/manifest.json +17 -17
- package/dist/trigger/notification-service.d.ts +1 -1
- package/dist/trigger/notification-service.js +4 -0
- package/dist/trigger/trigger-router.js +8 -0
- package/dist/trigger/types.d.ts +2 -0
- package/dist/v2/usecases/console-routes.js +3 -0
- package/docs/design/design-candidates-stuck-escalation.md +183 -0
- package/docs/design/design-review-findings-stuck-escalation.md +93 -0
- package/docs/design/implementation-plan-stuck-escalation.md +172 -0
- package/package.json +1 -1
|
@@ -1081,6 +1081,16 @@ function makeSpawnAgentTool(sessionId, ctx, apiKey, thisWorkrailSessionId, curre
|
|
|
1081
1081
|
notes: childResult.message,
|
|
1082
1082
|
};
|
|
1083
1083
|
}
|
|
1084
|
+
else if (childResult._tag === 'stuck') {
|
|
1085
|
+
resultObj = {
|
|
1086
|
+
childSessionId,
|
|
1087
|
+
outcome: 'stuck',
|
|
1088
|
+
notes: childResult.message,
|
|
1089
|
+
...(childResult.issueSummaries !== undefined
|
|
1090
|
+
? { issueSummaries: childResult.issueSummaries }
|
|
1091
|
+
: {}),
|
|
1092
|
+
};
|
|
1093
|
+
}
|
|
1084
1094
|
else {
|
|
1085
1095
|
(0, assert_never_js_1.assertNever)(childResult);
|
|
1086
1096
|
}
|
|
@@ -1093,6 +1103,31 @@ function makeSpawnAgentTool(sessionId, ctx, apiKey, thisWorkrailSessionId, curre
|
|
|
1093
1103
|
},
|
|
1094
1104
|
};
|
|
1095
1105
|
}
|
|
1106
|
+
async function writeStuckOutboxEntry(opts) {
|
|
1107
|
+
try {
|
|
1108
|
+
const outboxPath = path.join(os.homedir(), '.workrail', 'outbox.jsonl');
|
|
1109
|
+
await fs.mkdir(path.dirname(outboxPath), { recursive: true });
|
|
1110
|
+
const entry = JSON.stringify({
|
|
1111
|
+
id: (0, node_crypto_1.randomUUID)(),
|
|
1112
|
+
kind: 'stuck',
|
|
1113
|
+
message: `Session stuck (${opts.reason}): workflowId=${opts.workflowId}` +
|
|
1114
|
+
(opts.issueSummaries && opts.issueSummaries.length > 0
|
|
1115
|
+
? ` -- issues: ${opts.issueSummaries.join('; ')}`
|
|
1116
|
+
: ''),
|
|
1117
|
+
timestamp: new Date().toISOString(),
|
|
1118
|
+
workflowId: opts.workflowId,
|
|
1119
|
+
reason: opts.reason,
|
|
1120
|
+
...(opts.issueSummaries && opts.issueSummaries.length > 0
|
|
1121
|
+
? { issueSummaries: opts.issueSummaries }
|
|
1122
|
+
: {}),
|
|
1123
|
+
});
|
|
1124
|
+
await fs.appendFile(outboxPath, entry + '\n');
|
|
1125
|
+
}
|
|
1126
|
+
catch (err) {
|
|
1127
|
+
console.warn(`[WorkflowRunner] Could not write stuck outbox entry: ` +
|
|
1128
|
+
`${err instanceof Error ? err.message : String(err)}`);
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1096
1131
|
async function appendIssueAsync(issuesDir, sessionId, record) {
|
|
1097
1132
|
await fs.mkdir(issuesDir, { recursive: true });
|
|
1098
1133
|
const filePath = path.join(issuesDir, `${sessionId}.jsonl`);
|
|
@@ -1449,19 +1484,6 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
|
|
|
1449
1484
|
if (startContinueToken) {
|
|
1450
1485
|
await persistTokens(sessionId, startContinueToken, startCheckpointToken);
|
|
1451
1486
|
}
|
|
1452
|
-
if (trigger.botIdentity) {
|
|
1453
|
-
try {
|
|
1454
|
-
await execFileAsync('git', ['-C', trigger.workspacePath, 'config', 'user.name', trigger.botIdentity.name]);
|
|
1455
|
-
await execFileAsync('git', ['-C', trigger.workspacePath, 'config', 'user.email', trigger.botIdentity.email]);
|
|
1456
|
-
console.log(`[WorkflowRunner] Bot identity set: sessionId=${sessionId} ` +
|
|
1457
|
-
`name=${trigger.botIdentity.name} email=${trigger.botIdentity.email}`);
|
|
1458
|
-
}
|
|
1459
|
-
catch (identityErr) {
|
|
1460
|
-
console.warn(`[WorkflowRunner] WARNING: Failed to set bot identity for sessionId=${sessionId}: ` +
|
|
1461
|
-
`${identityErr instanceof Error ? identityErr.message : String(identityErr)}. ` +
|
|
1462
|
-
`Commits will use default git config.`);
|
|
1463
|
-
}
|
|
1464
|
-
}
|
|
1465
1487
|
let sessionWorkspacePath = trigger.workspacePath;
|
|
1466
1488
|
let sessionWorktreePath;
|
|
1467
1489
|
if (trigger.branchStrategy === 'worktree') {
|
|
@@ -1497,6 +1519,19 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
|
|
|
1497
1519
|
};
|
|
1498
1520
|
}
|
|
1499
1521
|
}
|
|
1522
|
+
if (trigger.botIdentity) {
|
|
1523
|
+
try {
|
|
1524
|
+
await execFileAsync('git', ['-C', sessionWorkspacePath, 'config', 'user.name', trigger.botIdentity.name]);
|
|
1525
|
+
await execFileAsync('git', ['-C', sessionWorkspacePath, 'config', 'user.email', trigger.botIdentity.email]);
|
|
1526
|
+
console.log(`[WorkflowRunner] Bot identity set: sessionId=${sessionId} ` +
|
|
1527
|
+
`name=${trigger.botIdentity.name} email=${trigger.botIdentity.email}`);
|
|
1528
|
+
}
|
|
1529
|
+
catch (identityErr) {
|
|
1530
|
+
console.warn(`[WorkflowRunner] WARNING: Failed to set bot identity for sessionId=${sessionId}: ` +
|
|
1531
|
+
`${identityErr instanceof Error ? identityErr.message : String(identityErr)}. ` +
|
|
1532
|
+
`Commits will use default git config.`);
|
|
1533
|
+
}
|
|
1534
|
+
}
|
|
1500
1535
|
if (firstStep.isComplete) {
|
|
1501
1536
|
await fs.unlink(path.join(exports.DAEMON_SESSIONS_DIR, `${sessionId}.json`)).catch(() => { });
|
|
1502
1537
|
emitter?.emit({ kind: 'session_completed', sessionId, workflowId: trigger.workflowId, outcome: 'success', detail: 'stop', ...withWorkrailSession(workrailSessionId) });
|
|
@@ -1588,7 +1623,10 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
|
|
|
1588
1623
|
});
|
|
1589
1624
|
const sessionTimeoutMs = (trigger.agentConfig?.maxSessionMinutes ?? DEFAULT_SESSION_TIMEOUT_MINUTES) * 60 * 1000;
|
|
1590
1625
|
const maxTurns = trigger.agentConfig?.maxTurns ?? DEFAULT_MAX_TURNS;
|
|
1626
|
+
const sessionStartMs = Date.now();
|
|
1627
|
+
void sessionStartMs;
|
|
1591
1628
|
let timeoutReason = null;
|
|
1629
|
+
let stuckReason = null;
|
|
1592
1630
|
let turnCount = 0;
|
|
1593
1631
|
const unsubscribe = agent.subscribe(async (event) => {
|
|
1594
1632
|
if (event.type !== 'turn_end')
|
|
@@ -1623,6 +1661,17 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
|
|
|
1623
1661
|
argsSummary: lastNToolCalls[0]?.argsSummary,
|
|
1624
1662
|
...withWorkrailSession(workrailSessionId),
|
|
1625
1663
|
});
|
|
1664
|
+
void writeStuckOutboxEntry({
|
|
1665
|
+
workflowId: trigger.workflowId,
|
|
1666
|
+
reason: 'repeated_tool_call',
|
|
1667
|
+
...(issueSummaries.length > 0 ? { issueSummaries: [...issueSummaries] } : {}),
|
|
1668
|
+
});
|
|
1669
|
+
const stuckPolicy = trigger.agentConfig?.stuckAbortPolicy ?? 'abort';
|
|
1670
|
+
if (stuckPolicy !== 'notify_only' && stuckReason === null && timeoutReason === null) {
|
|
1671
|
+
stuckReason = 'repeated_tool_call';
|
|
1672
|
+
agent.abort();
|
|
1673
|
+
return;
|
|
1674
|
+
}
|
|
1626
1675
|
}
|
|
1627
1676
|
if (maxTurns > 0 &&
|
|
1628
1677
|
turnCount >= Math.floor(maxTurns * 0.8) &&
|
|
@@ -1634,6 +1683,20 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
|
|
|
1634
1683
|
detail: `${turnCount} turns used, 0 step advances (${maxTurns} turn limit)`,
|
|
1635
1684
|
...withWorkrailSession(workrailSessionId),
|
|
1636
1685
|
});
|
|
1686
|
+
const noProgressAbortEnabled = trigger.agentConfig?.noProgressAbortEnabled ?? false;
|
|
1687
|
+
if (noProgressAbortEnabled) {
|
|
1688
|
+
void writeStuckOutboxEntry({
|
|
1689
|
+
workflowId: trigger.workflowId,
|
|
1690
|
+
reason: 'no_progress',
|
|
1691
|
+
...(issueSummaries.length > 0 ? { issueSummaries: [...issueSummaries] } : {}),
|
|
1692
|
+
});
|
|
1693
|
+
const noProgressPolicy = trigger.agentConfig?.stuckAbortPolicy ?? 'abort';
|
|
1694
|
+
if (noProgressPolicy !== 'notify_only' && stuckReason === null && timeoutReason === null) {
|
|
1695
|
+
stuckReason = 'no_progress';
|
|
1696
|
+
agent.abort();
|
|
1697
|
+
return;
|
|
1698
|
+
}
|
|
1699
|
+
}
|
|
1637
1700
|
}
|
|
1638
1701
|
if (timeoutReason !== null) {
|
|
1639
1702
|
emitter?.emit({
|
|
@@ -1693,6 +1756,26 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
|
|
|
1693
1756
|
}
|
|
1694
1757
|
console.log(`[WorkflowRunner] Agent loop ended: sessionId=${sessionId} stopReason=${stopReason}${errorMessage ? ` error=${errorMessage.slice(0, 120)}` : ''}`);
|
|
1695
1758
|
}
|
|
1759
|
+
if (stuckReason !== null) {
|
|
1760
|
+
emitter?.emit({
|
|
1761
|
+
kind: 'session_completed',
|
|
1762
|
+
sessionId,
|
|
1763
|
+
workflowId: trigger.workflowId,
|
|
1764
|
+
outcome: 'timeout',
|
|
1765
|
+
detail: stuckReason,
|
|
1766
|
+
...withWorkrailSession(workrailSessionId),
|
|
1767
|
+
});
|
|
1768
|
+
if (workrailSessionId !== null)
|
|
1769
|
+
daemonRegistry?.unregister(workrailSessionId, 'failed');
|
|
1770
|
+
return {
|
|
1771
|
+
_tag: 'stuck',
|
|
1772
|
+
workflowId: trigger.workflowId,
|
|
1773
|
+
reason: stuckReason,
|
|
1774
|
+
message: `Session aborted: stuck heuristic fired (${stuckReason})`,
|
|
1775
|
+
stopReason: 'aborted',
|
|
1776
|
+
...(issueSummaries.length > 0 ? { issueSummaries: [...issueSummaries] } : {}),
|
|
1777
|
+
};
|
|
1778
|
+
}
|
|
1696
1779
|
if (timeoutReason !== null) {
|
|
1697
1780
|
emitter?.emit({ kind: 'session_completed', sessionId, workflowId: trigger.workflowId, outcome: 'timeout', detail: timeoutReason, ...withWorkrailSession(workrailSessionId) });
|
|
1698
1781
|
if (workrailSessionId !== null)
|
package/dist/manifest.json
CHANGED
|
@@ -449,8 +449,8 @@
|
|
|
449
449
|
"sha256": "5fe866e54f796975dec5d8ba9983aefd86074db212d3fccd64eed04bc9f0b3da",
|
|
450
450
|
"bytes": 8011
|
|
451
451
|
},
|
|
452
|
-
"console-ui/assets/index-Bi38ITiQ.js": {
|
|
453
|
-
"sha256": "
|
|
452
|
+
"console-ui/assets/index-BpanIvmi.js": {
|
|
453
|
+
"sha256": "e5c3e897dbda3f810ce737422d3f84d06b8e146c7923041fbd96b276538435c6",
|
|
454
454
|
"bytes": 760528
|
|
455
455
|
},
|
|
456
456
|
"console-ui/assets/index-DGj8EsFR.css": {
|
|
@@ -458,7 +458,7 @@
|
|
|
458
458
|
"bytes": 60631
|
|
459
459
|
},
|
|
460
460
|
"console-ui/index.html": {
|
|
461
|
-
"sha256": "
|
|
461
|
+
"sha256": "587aa7591502ca4dadbea5c7f5c56136d19f18d179c735abe693e9ee2c290e1e",
|
|
462
462
|
"bytes": 417
|
|
463
463
|
},
|
|
464
464
|
"console/standalone-console.d.ts": {
|
|
@@ -550,12 +550,12 @@
|
|
|
550
550
|
"bytes": 1512
|
|
551
551
|
},
|
|
552
552
|
"daemon/workflow-runner.d.ts": {
|
|
553
|
-
"sha256": "
|
|
554
|
-
"bytes":
|
|
553
|
+
"sha256": "4c67cc7a44c934469c190f11a71bd18bf0dfc31f59ab0c315b98315b96d59cce",
|
|
554
|
+
"bytes": 7048
|
|
555
555
|
},
|
|
556
556
|
"daemon/workflow-runner.js": {
|
|
557
|
-
"sha256": "
|
|
558
|
-
"bytes":
|
|
557
|
+
"sha256": "a5d74ec723ff0dce45d2335b811959ff0c6e6f8851edf399a70853f6bf127893",
|
|
558
|
+
"bytes": 93222
|
|
559
559
|
},
|
|
560
560
|
"di/container.d.ts": {
|
|
561
561
|
"sha256": "003bb7fb7478d627524b9b1e76bd0a963a243794a687ff233b96dc0e33a06d9f",
|
|
@@ -1606,12 +1606,12 @@
|
|
|
1606
1606
|
"bytes": 1222
|
|
1607
1607
|
},
|
|
1608
1608
|
"trigger/notification-service.d.ts": {
|
|
1609
|
-
"sha256": "
|
|
1610
|
-
"bytes":
|
|
1609
|
+
"sha256": "25509f290a11ac9a8aa03e5d7e44011950c841436e9458eb855b94d15d036d68",
|
|
1610
|
+
"bytes": 1582
|
|
1611
1611
|
},
|
|
1612
1612
|
"trigger/notification-service.js": {
|
|
1613
|
-
"sha256": "
|
|
1614
|
-
"bytes":
|
|
1613
|
+
"sha256": "9d9a5951229f2c6ffaa413a7421efa6611d9ca0ed456edf5c297a4506e84a80e",
|
|
1614
|
+
"bytes": 6521
|
|
1615
1615
|
},
|
|
1616
1616
|
"trigger/polled-event-store.d.ts": {
|
|
1617
1617
|
"sha256": "2952a25804177b2389d4273bfc41192477d100bc26100683861dedf28520dec1",
|
|
@@ -1642,8 +1642,8 @@
|
|
|
1642
1642
|
"bytes": 2123
|
|
1643
1643
|
},
|
|
1644
1644
|
"trigger/trigger-router.js": {
|
|
1645
|
-
"sha256": "
|
|
1646
|
-
"bytes":
|
|
1645
|
+
"sha256": "8a4a4699df4210b5631211e7370ed6b70d972e82321cf8c66dcc9f60661e5d2c",
|
|
1646
|
+
"bytes": 17750
|
|
1647
1647
|
},
|
|
1648
1648
|
"trigger/trigger-store.d.ts": {
|
|
1649
1649
|
"sha256": "7afb05127d55bc3757a550dd15d4b797766b3fff29d1bfe76b303764b93322e7",
|
|
@@ -1654,8 +1654,8 @@
|
|
|
1654
1654
|
"bytes": 38148
|
|
1655
1655
|
},
|
|
1656
1656
|
"trigger/types.d.ts": {
|
|
1657
|
-
"sha256": "
|
|
1658
|
-
"bytes":
|
|
1657
|
+
"sha256": "a1336ad769dbe4760e7acd3b2a92961251492aa29245b5d906ad89011ea934fa",
|
|
1658
|
+
"bytes": 3587
|
|
1659
1659
|
},
|
|
1660
1660
|
"trigger/types.js": {
|
|
1661
1661
|
"sha256": "45b4e4f23a6d1a2b07350196871b0c53840e5d8142b47f7acedd2f40ae7a6b73",
|
|
@@ -2970,8 +2970,8 @@
|
|
|
2970
2970
|
"bytes": 798
|
|
2971
2971
|
},
|
|
2972
2972
|
"v2/usecases/console-routes.js": {
|
|
2973
|
-
"sha256": "
|
|
2974
|
-
"bytes":
|
|
2973
|
+
"sha256": "3e8bd3adfdc66926d91044506d2c77253b784b9e4356a8610c585e6a27153d4b",
|
|
2974
|
+
"bytes": 29776
|
|
2975
2975
|
},
|
|
2976
2976
|
"v2/usecases/console-service.d.ts": {
|
|
2977
2977
|
"sha256": "fc8fe65427fa9f4f3535344b385b36f66ca06b7e3bfaea708931817a3edcad2b",
|
|
@@ -21,7 +21,7 @@ export interface NotificationConfig {
|
|
|
21
21
|
export interface NotificationPayload {
|
|
22
22
|
readonly event: 'session_completed';
|
|
23
23
|
readonly workflowId: string;
|
|
24
|
-
readonly outcome: 'success' | 'error' | 'timeout' | 'delivery_failed';
|
|
24
|
+
readonly outcome: 'success' | 'error' | 'timeout' | 'stuck' | 'delivery_failed';
|
|
25
25
|
readonly detail: string;
|
|
26
26
|
readonly goal: string;
|
|
27
27
|
readonly timestamp: string;
|
|
@@ -48,6 +48,8 @@ function buildNotificationBody(result, goal) {
|
|
|
48
48
|
return `Session failed: ${truncated}`;
|
|
49
49
|
case 'timeout':
|
|
50
50
|
return `Session timed out: ${truncated}`;
|
|
51
|
+
case 'stuck':
|
|
52
|
+
return `Session stuck (${result.reason}): ${truncated}`;
|
|
51
53
|
case 'delivery_failed':
|
|
52
54
|
return `Session completed but result delivery failed: ${truncated}`;
|
|
53
55
|
}
|
|
@@ -63,6 +65,8 @@ function buildDetail(result) {
|
|
|
63
65
|
return result.message;
|
|
64
66
|
case 'timeout':
|
|
65
67
|
return result.message;
|
|
68
|
+
case 'stuck':
|
|
69
|
+
return result.message;
|
|
66
70
|
case 'delivery_failed':
|
|
67
71
|
return `stopReason: ${result.stopReason}; deliveryError: ${result.deliveryError}`;
|
|
68
72
|
}
|
|
@@ -327,6 +327,10 @@ class TriggerRouter {
|
|
|
327
327
|
console.log(`[TriggerRouter] Workflow failed: triggerId=${trigger.id} ` +
|
|
328
328
|
`workflowId=${trigger.workflowId} error=${result.message} stopReason=${result.stopReason}`);
|
|
329
329
|
}
|
|
330
|
+
else if (result._tag === 'stuck') {
|
|
331
|
+
console.log(`[TriggerRouter] Workflow stuck: triggerId=${trigger.id} ` +
|
|
332
|
+
`workflowId=${trigger.workflowId} reason=${result.reason} message=${result.message}`);
|
|
333
|
+
}
|
|
330
334
|
else {
|
|
331
335
|
(0, assert_never_js_1.assertNever)(result);
|
|
332
336
|
}
|
|
@@ -366,6 +370,10 @@ class TriggerRouter {
|
|
|
366
370
|
console.log(`[TriggerRouter] Dispatch failed: workflowId=${workflowTrigger.workflowId} ` +
|
|
367
371
|
`error=${result.message} stopReason=${result.stopReason}`);
|
|
368
372
|
}
|
|
373
|
+
else if (result._tag === 'stuck') {
|
|
374
|
+
console.log(`[TriggerRouter] Dispatch stuck: workflowId=${workflowTrigger.workflowId} ` +
|
|
375
|
+
`reason=${result.reason} message=${result.message}`);
|
|
376
|
+
}
|
|
369
377
|
else {
|
|
370
378
|
(0, assert_never_js_1.assertNever)(result);
|
|
371
379
|
}
|
package/dist/trigger/types.d.ts
CHANGED
|
@@ -71,6 +71,8 @@ export interface TriggerDefinition {
|
|
|
71
71
|
readonly model?: string;
|
|
72
72
|
readonly maxSessionMinutes?: number;
|
|
73
73
|
readonly maxTurns?: number;
|
|
74
|
+
readonly stuckAbortPolicy?: 'abort' | 'notify_only';
|
|
75
|
+
readonly noProgressAbortEnabled?: boolean;
|
|
74
76
|
};
|
|
75
77
|
readonly concurrencyMode: 'serial' | 'parallel';
|
|
76
78
|
readonly callbackUrl?: string;
|
|
@@ -604,6 +604,9 @@ function mountConsoleRoutes(app, consoleService, workflowService, timingRingBuff
|
|
|
604
604
|
else if (result._tag === 'error') {
|
|
605
605
|
console.log(`[ConsoleRoutes] Auto dispatch failed: workflowId=${workflowId} error=${result.message}`);
|
|
606
606
|
}
|
|
607
|
+
else if (result._tag === 'stuck') {
|
|
608
|
+
console.log(`[ConsoleRoutes] Auto dispatch stuck: workflowId=${workflowId} reason=${result.reason} message=${result.message}`);
|
|
609
|
+
}
|
|
607
610
|
else {
|
|
608
611
|
(0, assert_never_js_1.assertNever)(result);
|
|
609
612
|
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# Design Candidates: WorkTrain Stuck-Escalation
|
|
2
|
+
|
|
3
|
+
*Generated: 2026-04-19 | Pitch: .workrail/current-pitch.md*
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Problem Understanding
|
|
8
|
+
|
|
9
|
+
### Core Tensions
|
|
10
|
+
|
|
11
|
+
1. **Stuck vs timeout conflation**: When `repeated_tool_call` fires, the session
|
|
12
|
+
currently runs until wall-clock or max-turns timeout. The result is
|
|
13
|
+
`_tag: 'timeout'`, which is indistinguishable from a legitimate slow session.
|
|
14
|
+
Automated routing requires a distinct discriminant.
|
|
15
|
+
|
|
16
|
+
2. **Abort vs notify-only independence**: Outbox notification and `agent.abort()`
|
|
17
|
+
are two separate effects. `notify_only` policy suppresses the abort but must
|
|
18
|
+
not suppress the outbox write. These effects must not be coupled.
|
|
19
|
+
|
|
20
|
+
3. **ChildWorkflowRunResult atomic update**: The `as ChildWorkflowRunResult` cast
|
|
21
|
+
at line 2172 in `makeSpawnAgentTool` suppresses any compile-time error from a
|
|
22
|
+
missing union update. Only the `assertNever(childResult)` at line 2212 catches
|
|
23
|
+
the omission -- at runtime, crashing the parent session.
|
|
24
|
+
|
|
25
|
+
4. **no_progress false-positive risk**: The no_progress heuristic fires on
|
|
26
|
+
legitimate research workflows that spend many turns reading before advancing.
|
|
27
|
+
It must be opt-in (default: false) to avoid breaking existing sessions.
|
|
28
|
+
|
|
29
|
+
### Likely Seam
|
|
30
|
+
|
|
31
|
+
The `turn_end` subscriber in `runWorkflow()` is the correct location. All
|
|
32
|
+
required state (lastNToolCalls, stepAdvanceCount, timeoutReason, issueSummaries)
|
|
33
|
+
is available there as closure variables. Detection fires at the right moment
|
|
34
|
+
(after each turn, synchronously before next step injection).
|
|
35
|
+
|
|
36
|
+
### What Makes This Hard
|
|
37
|
+
|
|
38
|
+
- The `as ChildWorkflowRunResult` cast is a type-safety trap: it silences
|
|
39
|
+
TypeScript while leaving a runtime crash. Only careful reading of the pitch
|
|
40
|
+
reveals the issue.
|
|
41
|
+
- `buildOutcome()` in notification-service.ts has return type
|
|
42
|
+
`NotificationPayload['outcome']`. Adding 'stuck' to WorkflowRunResult causes
|
|
43
|
+
a compile error there unless the outcome union is also widened.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Philosophy Constraints
|
|
48
|
+
|
|
49
|
+
From CLAUDE.md:
|
|
50
|
+
|
|
51
|
+
- **Make illegal states unrepresentable**: the stuck discriminant prevents
|
|
52
|
+
conflating stuck with timeout at the type level.
|
|
53
|
+
- **Exhaustiveness everywhere**: assertNever guards in trigger-router and
|
|
54
|
+
makeSpawnAgentTool enforce this -- adding a `stuck` arm is required.
|
|
55
|
+
- **Errors are data**: WorkflowRunResult is a Result type; WorkflowRunStuck is
|
|
56
|
+
a new variant, not an exception.
|
|
57
|
+
- **Type safety as first line of defense**: ChildWorkflowRunResult update in
|
|
58
|
+
the same commit restores the compile-time invariant that the cast broke.
|
|
59
|
+
- **Fire-and-forget for side effects**: outbox write uses void + catch, same
|
|
60
|
+
as DaemonEventEmitter and issue recording.
|
|
61
|
+
|
|
62
|
+
No conflicts between stated philosophy and repo patterns.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Impact Surface
|
|
67
|
+
|
|
68
|
+
Paths that must stay consistent when WorkflowRunResult gains a new variant:
|
|
69
|
+
|
|
70
|
+
1. `makeSpawnAgentTool` -- `assertNever(childResult)` at line 2212; requires
|
|
71
|
+
ChildWorkflowRunResult update and a new `stuck` arm in the result mapping.
|
|
72
|
+
2. `trigger-router.ts` `route()` -- exhaustive if-else chain ending in
|
|
73
|
+
`assertNever(result)` at line ~689.
|
|
74
|
+
3. `trigger-router.ts` `dispatch()` -- same exhaustive chain at line ~770.
|
|
75
|
+
4. `notification-service.ts` `buildNotificationBody()` -- exhaustive switch.
|
|
76
|
+
5. `notification-service.ts` `buildDetail()` -- exhaustive switch.
|
|
77
|
+
6. `notification-service.ts` `buildOutcome()` -- return type
|
|
78
|
+
`NotificationPayload['outcome']`; 'stuck' must be added to that union.
|
|
79
|
+
7. `NotificationPayload.outcome` union -- currently
|
|
80
|
+
`'success' | 'error' | 'timeout' | 'delivery_failed'`; must add `'stuck'`.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Candidates
|
|
85
|
+
|
|
86
|
+
### Candidate A: New `_tag: 'stuck'` discriminated union variant (SELECTED)
|
|
87
|
+
|
|
88
|
+
**Summary**: Add `WorkflowRunStuck` interface with `_tag: 'stuck'`, wire abort
|
|
89
|
+
in turn_end subscriber after Signal 1 and Signal 2 emitter calls, return stuck
|
|
90
|
+
result before timeout check, update both `WorkflowRunResult` and
|
|
91
|
+
`ChildWorkflowRunResult` unions atomically, add `writeStuckOutboxEntry` helper.
|
|
92
|
+
|
|
93
|
+
**Tensions resolved**:
|
|
94
|
+
- Stuck/timeout conflation: separate discriminant, separate return path.
|
|
95
|
+
- Abort/notify independence: outbox write fires before the abort gate check.
|
|
96
|
+
- ChildWorkflowRunResult crash: atomic update with assertNever arm added.
|
|
97
|
+
- no_progress false-positive: gated by `noProgressAbortEnabled: false` default.
|
|
98
|
+
|
|
99
|
+
**Boundary solved at**: `turn_end` subscriber (detection + abort), result
|
|
100
|
+
construction (return), 4 files for propagation to callers.
|
|
101
|
+
|
|
102
|
+
**Why best-fit boundary**: The turn_end subscriber is the only location with
|
|
103
|
+
access to all required state. The result construction is the canonical output
|
|
104
|
+
boundary for runWorkflow(). Propagation to callers follows the existing
|
|
105
|
+
WorkflowRunResult variant fan-out pattern.
|
|
106
|
+
|
|
107
|
+
**Failure mode**: Forgetting to update `NotificationPayload.outcome` union --
|
|
108
|
+
caught by `npm run build` (TypeScript compile error in `buildOutcome()`).
|
|
109
|
+
|
|
110
|
+
**Repo-pattern relationship**: Mirrors `timeoutReason` flag pattern exactly.
|
|
111
|
+
Mirrors `WorkflowRunTimeout` interface field shape. Follows assertNever guard
|
|
112
|
+
pattern already established in trigger-router and makeSpawnAgentTool.
|
|
113
|
+
|
|
114
|
+
**Gains**: Distinct routing for stuck sessions, type-safe callers, clean
|
|
115
|
+
separation of abort and notification effects.
|
|
116
|
+
|
|
117
|
+
**Losses**: One more variant in the union (minor cognitive load increase).
|
|
118
|
+
|
|
119
|
+
**Scope judgment**: Best-fit. 4 files, mechanical wiring, all design resolved.
|
|
120
|
+
|
|
121
|
+
**Philosophy fit**: Honors all relevant CLAUDE.md principles. No conflicts.
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
### Candidate B: Extend `WorkflowRunTimeout.reason` with stuck sub-values
|
|
126
|
+
|
|
127
|
+
**Summary**: Add `'stuck_repeated_tool_call' | 'stuck_no_progress'` to
|
|
128
|
+
`WorkflowRunTimeout.reason` -- reuse the timeout discriminant.
|
|
129
|
+
|
|
130
|
+
**Tensions resolved**: None of the core ones. Stuck and timeout still share
|
|
131
|
+
`_tag: 'timeout'`, requiring callers to inspect reason to distinguish them.
|
|
132
|
+
|
|
133
|
+
**Failure mode**: Violates make-illegal-states-unrepresentable. Callers using
|
|
134
|
+
`result._tag === 'timeout'` would silently handle stuck sessions as timeouts.
|
|
135
|
+
|
|
136
|
+
**Repo-pattern relationship**: Departs from the exhaustiveness-everywhere
|
|
137
|
+
pattern. The assertNever guard pattern exists precisely to avoid this.
|
|
138
|
+
|
|
139
|
+
**Scope judgment**: Too narrow -- preserves the routing problem this pitch
|
|
140
|
+
exists to solve.
|
|
141
|
+
|
|
142
|
+
**Rejected because**: Violates philosophy, does not resolve the core tension,
|
|
143
|
+
and the pitch explicitly rejects conflating stuck with timeout.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Comparison and Recommendation
|
|
148
|
+
|
|
149
|
+
Candidate A is the only viable candidate. All analysis converges.
|
|
150
|
+
|
|
151
|
+
The core recommendation is to implement Candidate A exactly as specified in
|
|
152
|
+
`.workrail/current-pitch.md`, with one addition not noted in the pitch:
|
|
153
|
+
update `NotificationPayload.outcome` union to include `'stuck'` (required for
|
|
154
|
+
`buildOutcome()` to compile).
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Self-Critique
|
|
159
|
+
|
|
160
|
+
**Strongest counter-argument**: Adding a 5th variant to WorkflowRunResult
|
|
161
|
+
increases cognitive load for callers. Counter: assertNever guards make missing
|
|
162
|
+
cases compile errors, which is the correct safeguard. The complexity cost is
|
|
163
|
+
paid once (at implementation) and enforced automatically.
|
|
164
|
+
|
|
165
|
+
**Narrower option that lost**: Update only WorkflowRunResult, skip
|
|
166
|
+
ChildWorkflowRunResult. Lost because: runtime crash in makeSpawnAgentTool
|
|
167
|
+
when a child hits stuck-abort. The cast at line 2172 provides no protection.
|
|
168
|
+
|
|
169
|
+
**Broader option not justified**: Adding `onStuck:` hook to TriggerDefinition.
|
|
170
|
+
Explicitly deferred per pitch No-Gos. Would require trigger-store.ts parser
|
|
171
|
+
changes -- outside the 4-file scope.
|
|
172
|
+
|
|
173
|
+
**Pivot condition**: If `assertNever(childResult)` were removed in favor of a
|
|
174
|
+
logged fallback, ChildWorkflowRunResult update would be less critical. It is
|
|
175
|
+
not removed, so the atomic update is required.
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Open Questions for the Main Agent
|
|
180
|
+
|
|
181
|
+
None. All design decisions are resolved in the pitch. The only implementation
|
|
182
|
+
detail requiring attention is the `NotificationPayload.outcome` union widening
|
|
183
|
+
(add 'stuck') -- verify this compiles before finalizing.
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# Design Review Findings: WorkTrain Stuck-Escalation
|
|
2
|
+
|
|
3
|
+
*Generated: 2026-04-19 | Pitch: .workrail/current-pitch.md*
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Tradeoff Review
|
|
8
|
+
|
|
9
|
+
| Tradeoff | Status | Condition for Failure |
|
|
10
|
+
|----------|--------|-----------------------|
|
|
11
|
+
| One more union variant in WorkflowRunResult | Acceptable | All callers use assertNever guards -- compile error enforces handling |
|
|
12
|
+
| ChildWorkflowRunResult atomic update relies on discipline | Managed | Fails only if commit is split; mitigated by single-PR implementation and compile-time test |
|
|
13
|
+
| NotificationPayload.outcome union widening (gap, not tradeoff) | Resolved | Add 'stuck' to outcome union; caught by npm run build |
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Failure Mode Review
|
|
18
|
+
|
|
19
|
+
| Failure Mode | Severity | Design Handling | Missing Mitigation |
|
|
20
|
+
|--------------|----------|-----------------|--------------------|
|
|
21
|
+
| ChildWorkflowRunResult not updated | High | Atomic commit, compile-time assignability test | None beyond discipline |
|
|
22
|
+
| stuckReason / timeoutReason race | Low | First-writer-wins guard; max_turns early return prevents race | None needed |
|
|
23
|
+
| writeStuckOutboxEntry fails | Low | Fire-and-forget, console.warn on error | None -- intentional |
|
|
24
|
+
| no_progress fires on research workflow | Low | noProgressAbortEnabled defaults to false | None needed |
|
|
25
|
+
| NotificationPayload.outcome compile error | Medium | Add 'stuck' to union | None -- caught at build |
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Runner-Up / Simpler Alternative Review
|
|
30
|
+
|
|
31
|
+
- **Candidate B** (extend WorkflowRunTimeout.reason): No elements worth borrowing.
|
|
32
|
+
Does not resolve the core routing tension.
|
|
33
|
+
- **Skip ChildWorkflowRunResult**: Not acceptable -- runtime crash in parent session.
|
|
34
|
+
- **Skip sessionStartMs**: Not recommended -- pitch explicitly adds it for Signal 5 follow-up
|
|
35
|
+
to avoid future restructuring.
|
|
36
|
+
- **Inline outbox write**: Works but reduces turn_end subscriber readability. Not worth it.
|
|
37
|
+
|
|
38
|
+
No hybrid opportunities identified.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Philosophy Alignment
|
|
43
|
+
|
|
44
|
+
| Principle | Status |
|
|
45
|
+
|-----------|--------|
|
|
46
|
+
| Make illegal states unrepresentable | Satisfied |
|
|
47
|
+
| Exhaustiveness everywhere | Satisfied |
|
|
48
|
+
| Errors are data | Satisfied |
|
|
49
|
+
| Immutability by default | Satisfied |
|
|
50
|
+
| Type safety as first line of defense | Under tension (pre-existing cast; improved but not fully resolved) |
|
|
51
|
+
| Fire-and-forget for side effects | Satisfied |
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Findings
|
|
56
|
+
|
|
57
|
+
### Yellow: NotificationPayload.outcome union widening not specified in pitch
|
|
58
|
+
|
|
59
|
+
The pitch states 'buildOutcome() returns result._tag directly -- no change needed'.
|
|
60
|
+
However, the return type annotation `NotificationPayload['outcome']` will cause a
|
|
61
|
+
TypeScript compile error when 'stuck' is added to WorkflowRunResult but not to the
|
|
62
|
+
outcome union. **Resolution**: add `'stuck'` to `NotificationPayload.outcome` union
|
|
63
|
+
in notification-service.ts. This is a mechanical fix, not a design change.
|
|
64
|
+
|
|
65
|
+
### Yellow: Pre-existing `as ChildWorkflowRunResult` cast at line 2172
|
|
66
|
+
|
|
67
|
+
The cast suppresses TypeScript's compile-time check that would otherwise catch a
|
|
68
|
+
missing ChildWorkflowRunResult update. This PR updates the union and adds a
|
|
69
|
+
compile-time assignability test to partially compensate. Removing the cast is
|
|
70
|
+
out of scope. **Residual concern**: future union additions must be caught by the
|
|
71
|
+
test rather than the compiler.
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Recommended Revisions
|
|
76
|
+
|
|
77
|
+
1. Add `'stuck'` to `NotificationPayload.outcome` union (not in pitch, required for compile).
|
|
78
|
+
2. Add compile-time assignability test for `ChildWorkflowRunResult` in the test file.
|
|
79
|
+
3. Document the `as ChildWorkflowRunResult` cast issue in a code comment at line 2172
|
|
80
|
+
(or verify existing comment is sufficient).
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Residual Concerns
|
|
85
|
+
|
|
86
|
+
- The `as ChildWorkflowRunResult` cast remains. Future contributors adding a new
|
|
87
|
+
WorkflowRunResult variant may forget to update ChildWorkflowRunResult. The
|
|
88
|
+
compile-time test in the stuck-escalation test file partially mitigates this,
|
|
89
|
+
but only for the stuck variant. A broader structural fix (removing the cast)
|
|
90
|
+
is a follow-up.
|
|
91
|
+
- Webhook consumers reading `outcome: 'stuck'` must handle the new value.
|
|
92
|
+
This is a new feature, not a breaking change, but operators consuming the
|
|
93
|
+
webhook should be aware.
|