@exaudeus/workrail 3.44.0 → 3.45.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1081,6 +1081,16 @@ function makeSpawnAgentTool(sessionId, ctx, apiKey, thisWorkrailSessionId, curre
1081
1081
  notes: childResult.message,
1082
1082
  };
1083
1083
  }
1084
+ else if (childResult._tag === 'stuck') {
1085
+ resultObj = {
1086
+ childSessionId,
1087
+ outcome: 'stuck',
1088
+ notes: childResult.message,
1089
+ ...(childResult.issueSummaries !== undefined
1090
+ ? { issueSummaries: childResult.issueSummaries }
1091
+ : {}),
1092
+ };
1093
+ }
1084
1094
  else {
1085
1095
  (0, assert_never_js_1.assertNever)(childResult);
1086
1096
  }
@@ -1093,6 +1103,31 @@ function makeSpawnAgentTool(sessionId, ctx, apiKey, thisWorkrailSessionId, curre
1093
1103
  },
1094
1104
  };
1095
1105
  }
1106
+ async function writeStuckOutboxEntry(opts) {
1107
+ try {
1108
+ const outboxPath = path.join(os.homedir(), '.workrail', 'outbox.jsonl');
1109
+ await fs.mkdir(path.dirname(outboxPath), { recursive: true });
1110
+ const entry = JSON.stringify({
1111
+ id: (0, node_crypto_1.randomUUID)(),
1112
+ kind: 'stuck',
1113
+ message: `Session stuck (${opts.reason}): workflowId=${opts.workflowId}` +
1114
+ (opts.issueSummaries && opts.issueSummaries.length > 0
1115
+ ? ` -- issues: ${opts.issueSummaries.join('; ')}`
1116
+ : ''),
1117
+ timestamp: new Date().toISOString(),
1118
+ workflowId: opts.workflowId,
1119
+ reason: opts.reason,
1120
+ ...(opts.issueSummaries && opts.issueSummaries.length > 0
1121
+ ? { issueSummaries: opts.issueSummaries }
1122
+ : {}),
1123
+ });
1124
+ await fs.appendFile(outboxPath, entry + '\n');
1125
+ }
1126
+ catch (err) {
1127
+ console.warn(`[WorkflowRunner] Could not write stuck outbox entry: ` +
1128
+ `${err instanceof Error ? err.message : String(err)}`);
1129
+ }
1130
+ }
1096
1131
  async function appendIssueAsync(issuesDir, sessionId, record) {
1097
1132
  await fs.mkdir(issuesDir, { recursive: true });
1098
1133
  const filePath = path.join(issuesDir, `${sessionId}.jsonl`);
@@ -1449,19 +1484,6 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
1449
1484
  if (startContinueToken) {
1450
1485
  await persistTokens(sessionId, startContinueToken, startCheckpointToken);
1451
1486
  }
1452
- if (trigger.botIdentity) {
1453
- try {
1454
- await execFileAsync('git', ['-C', trigger.workspacePath, 'config', 'user.name', trigger.botIdentity.name]);
1455
- await execFileAsync('git', ['-C', trigger.workspacePath, 'config', 'user.email', trigger.botIdentity.email]);
1456
- console.log(`[WorkflowRunner] Bot identity set: sessionId=${sessionId} ` +
1457
- `name=${trigger.botIdentity.name} email=${trigger.botIdentity.email}`);
1458
- }
1459
- catch (identityErr) {
1460
- console.warn(`[WorkflowRunner] WARNING: Failed to set bot identity for sessionId=${sessionId}: ` +
1461
- `${identityErr instanceof Error ? identityErr.message : String(identityErr)}. ` +
1462
- `Commits will use default git config.`);
1463
- }
1464
- }
1465
1487
  let sessionWorkspacePath = trigger.workspacePath;
1466
1488
  let sessionWorktreePath;
1467
1489
  if (trigger.branchStrategy === 'worktree') {
@@ -1497,6 +1519,19 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
1497
1519
  };
1498
1520
  }
1499
1521
  }
1522
+ if (trigger.botIdentity) {
1523
+ try {
1524
+ await execFileAsync('git', ['-C', sessionWorkspacePath, 'config', 'user.name', trigger.botIdentity.name]);
1525
+ await execFileAsync('git', ['-C', sessionWorkspacePath, 'config', 'user.email', trigger.botIdentity.email]);
1526
+ console.log(`[WorkflowRunner] Bot identity set: sessionId=${sessionId} ` +
1527
+ `name=${trigger.botIdentity.name} email=${trigger.botIdentity.email}`);
1528
+ }
1529
+ catch (identityErr) {
1530
+ console.warn(`[WorkflowRunner] WARNING: Failed to set bot identity for sessionId=${sessionId}: ` +
1531
+ `${identityErr instanceof Error ? identityErr.message : String(identityErr)}. ` +
1532
+ `Commits will use default git config.`);
1533
+ }
1534
+ }
1500
1535
  if (firstStep.isComplete) {
1501
1536
  await fs.unlink(path.join(exports.DAEMON_SESSIONS_DIR, `${sessionId}.json`)).catch(() => { });
1502
1537
  emitter?.emit({ kind: 'session_completed', sessionId, workflowId: trigger.workflowId, outcome: 'success', detail: 'stop', ...withWorkrailSession(workrailSessionId) });
@@ -1588,7 +1623,10 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
1588
1623
  });
1589
1624
  const sessionTimeoutMs = (trigger.agentConfig?.maxSessionMinutes ?? DEFAULT_SESSION_TIMEOUT_MINUTES) * 60 * 1000;
1590
1625
  const maxTurns = trigger.agentConfig?.maxTurns ?? DEFAULT_MAX_TURNS;
1626
+ const sessionStartMs = Date.now();
1627
+ void sessionStartMs;
1591
1628
  let timeoutReason = null;
1629
+ let stuckReason = null;
1592
1630
  let turnCount = 0;
1593
1631
  const unsubscribe = agent.subscribe(async (event) => {
1594
1632
  if (event.type !== 'turn_end')
@@ -1623,6 +1661,17 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
1623
1661
  argsSummary: lastNToolCalls[0]?.argsSummary,
1624
1662
  ...withWorkrailSession(workrailSessionId),
1625
1663
  });
1664
+ void writeStuckOutboxEntry({
1665
+ workflowId: trigger.workflowId,
1666
+ reason: 'repeated_tool_call',
1667
+ ...(issueSummaries.length > 0 ? { issueSummaries: [...issueSummaries] } : {}),
1668
+ });
1669
+ const stuckPolicy = trigger.agentConfig?.stuckAbortPolicy ?? 'abort';
1670
+ if (stuckPolicy !== 'notify_only' && stuckReason === null && timeoutReason === null) {
1671
+ stuckReason = 'repeated_tool_call';
1672
+ agent.abort();
1673
+ return;
1674
+ }
1626
1675
  }
1627
1676
  if (maxTurns > 0 &&
1628
1677
  turnCount >= Math.floor(maxTurns * 0.8) &&
@@ -1634,6 +1683,20 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
1634
1683
  detail: `${turnCount} turns used, 0 step advances (${maxTurns} turn limit)`,
1635
1684
  ...withWorkrailSession(workrailSessionId),
1636
1685
  });
1686
+ const noProgressAbortEnabled = trigger.agentConfig?.noProgressAbortEnabled ?? false;
1687
+ if (noProgressAbortEnabled) {
1688
+ void writeStuckOutboxEntry({
1689
+ workflowId: trigger.workflowId,
1690
+ reason: 'no_progress',
1691
+ ...(issueSummaries.length > 0 ? { issueSummaries: [...issueSummaries] } : {}),
1692
+ });
1693
+ const noProgressPolicy = trigger.agentConfig?.stuckAbortPolicy ?? 'abort';
1694
+ if (noProgressPolicy !== 'notify_only' && stuckReason === null && timeoutReason === null) {
1695
+ stuckReason = 'no_progress';
1696
+ agent.abort();
1697
+ return;
1698
+ }
1699
+ }
1637
1700
  }
1638
1701
  if (timeoutReason !== null) {
1639
1702
  emitter?.emit({
@@ -1693,6 +1756,26 @@ async function runWorkflow(trigger, ctx, apiKey, daemonRegistry, emitter, steerR
1693
1756
  }
1694
1757
  console.log(`[WorkflowRunner] Agent loop ended: sessionId=${sessionId} stopReason=${stopReason}${errorMessage ? ` error=${errorMessage.slice(0, 120)}` : ''}`);
1695
1758
  }
1759
+ if (stuckReason !== null) {
1760
+ emitter?.emit({
1761
+ kind: 'session_completed',
1762
+ sessionId,
1763
+ workflowId: trigger.workflowId,
1764
+ outcome: 'timeout',
1765
+ detail: stuckReason,
1766
+ ...withWorkrailSession(workrailSessionId),
1767
+ });
1768
+ if (workrailSessionId !== null)
1769
+ daemonRegistry?.unregister(workrailSessionId, 'failed');
1770
+ return {
1771
+ _tag: 'stuck',
1772
+ workflowId: trigger.workflowId,
1773
+ reason: stuckReason,
1774
+ message: `Session aborted: stuck heuristic fired (${stuckReason})`,
1775
+ stopReason: 'aborted',
1776
+ ...(issueSummaries.length > 0 ? { issueSummaries: [...issueSummaries] } : {}),
1777
+ };
1778
+ }
1696
1779
  if (timeoutReason !== null) {
1697
1780
  emitter?.emit({ kind: 'session_completed', sessionId, workflowId: trigger.workflowId, outcome: 'timeout', detail: timeoutReason, ...withWorkrailSession(workrailSessionId) });
1698
1781
  if (workrailSessionId !== null)
@@ -449,8 +449,8 @@
449
449
  "sha256": "5fe866e54f796975dec5d8ba9983aefd86074db212d3fccd64eed04bc9f0b3da",
450
450
  "bytes": 8011
451
451
  },
452
- "console-ui/assets/index-Bi38ITiQ.js": {
453
- "sha256": "e3229116ac20f315184c69ef9eaa33267457e5e7aac1e22015ed830cb098e39a",
452
+ "console-ui/assets/index-BpanIvmi.js": {
453
+ "sha256": "e5c3e897dbda3f810ce737422d3f84d06b8e146c7923041fbd96b276538435c6",
454
454
  "bytes": 760528
455
455
  },
456
456
  "console-ui/assets/index-DGj8EsFR.css": {
@@ -458,7 +458,7 @@
458
458
  "bytes": 60631
459
459
  },
460
460
  "console-ui/index.html": {
461
- "sha256": "fb9efa806376b8f8f18d3c5d7853d04b9dfc9abc305b0ed22782379bc13a2c86",
461
+ "sha256": "587aa7591502ca4dadbea5c7f5c56136d19f18d179c735abe693e9ee2c290e1e",
462
462
  "bytes": 417
463
463
  },
464
464
  "console/standalone-console.d.ts": {
@@ -550,12 +550,12 @@
550
550
  "bytes": 1512
551
551
  },
552
552
  "daemon/workflow-runner.d.ts": {
553
- "sha256": "0406654be8c6eb147706e81e0ba666ce372db140f4720246258e0f001653181e",
554
- "bytes": 6628
553
+ "sha256": "4c67cc7a44c934469c190f11a71bd18bf0dfc31f59ab0c315b98315b96d59cce",
554
+ "bytes": 7048
555
555
  },
556
556
  "daemon/workflow-runner.js": {
557
- "sha256": "f40f265284aa1e32168d8a0cf28c08007186eafac6d55182b7ec6ddb53bed5a8",
558
- "bytes": 89592
557
+ "sha256": "a5d74ec723ff0dce45d2335b811959ff0c6e6f8851edf399a70853f6bf127893",
558
+ "bytes": 93222
559
559
  },
560
560
  "di/container.d.ts": {
561
561
  "sha256": "003bb7fb7478d627524b9b1e76bd0a963a243794a687ff233b96dc0e33a06d9f",
@@ -1606,12 +1606,12 @@
1606
1606
  "bytes": 1222
1607
1607
  },
1608
1608
  "trigger/notification-service.d.ts": {
1609
- "sha256": "c78406d3748953548f7879df8ac60cecd5e42f2f3b283f777343168ce2470b8d",
1610
- "bytes": 1572
1609
+ "sha256": "25509f290a11ac9a8aa03e5d7e44011950c841436e9458eb855b94d15d036d68",
1610
+ "bytes": 1582
1611
1611
  },
1612
1612
  "trigger/notification-service.js": {
1613
- "sha256": "693f617adc30b3a4fcebeca6a78b0da1c58819001660c017a4d0901652d675b8",
1614
- "bytes": 6373
1613
+ "sha256": "9d9a5951229f2c6ffaa413a7421efa6611d9ca0ed456edf5c297a4506e84a80e",
1614
+ "bytes": 6521
1615
1615
  },
1616
1616
  "trigger/polled-event-store.d.ts": {
1617
1617
  "sha256": "2952a25804177b2389d4273bfc41192477d100bc26100683861dedf28520dec1",
@@ -1642,8 +1642,8 @@
1642
1642
  "bytes": 2123
1643
1643
  },
1644
1644
  "trigger/trigger-router.js": {
1645
- "sha256": "605cdce397bd19e5b991fe7378faf17b4f25b4421749e1b5349413a208a4f3dd",
1646
- "bytes": 17250
1645
+ "sha256": "8a4a4699df4210b5631211e7370ed6b70d972e82321cf8c66dcc9f60661e5d2c",
1646
+ "bytes": 17750
1647
1647
  },
1648
1648
  "trigger/trigger-store.d.ts": {
1649
1649
  "sha256": "7afb05127d55bc3757a550dd15d4b797766b3fff29d1bfe76b303764b93322e7",
@@ -1654,8 +1654,8 @@
1654
1654
  "bytes": 38148
1655
1655
  },
1656
1656
  "trigger/types.d.ts": {
1657
- "sha256": "4ccedde5b927f17edbb96203083e8ffd2d578e2cc007ff2427511112ae262e30",
1658
- "bytes": 3475
1657
+ "sha256": "a1336ad769dbe4760e7acd3b2a92961251492aa29245b5d906ad89011ea934fa",
1658
+ "bytes": 3587
1659
1659
  },
1660
1660
  "trigger/types.js": {
1661
1661
  "sha256": "45b4e4f23a6d1a2b07350196871b0c53840e5d8142b47f7acedd2f40ae7a6b73",
@@ -2970,8 +2970,8 @@
2970
2970
  "bytes": 798
2971
2971
  },
2972
2972
  "v2/usecases/console-routes.js": {
2973
- "sha256": "89749c3462728cd2a821af7e4cad61d2d42b8f580765f644c9f4e9d1e9187bd1",
2974
- "bytes": 29558
2973
+ "sha256": "3e8bd3adfdc66926d91044506d2c77253b784b9e4356a8610c585e6a27153d4b",
2974
+ "bytes": 29776
2975
2975
  },
2976
2976
  "v2/usecases/console-service.d.ts": {
2977
2977
  "sha256": "fc8fe65427fa9f4f3535344b385b36f66ca06b7e3bfaea708931817a3edcad2b",
@@ -21,7 +21,7 @@ export interface NotificationConfig {
21
21
  export interface NotificationPayload {
22
22
  readonly event: 'session_completed';
23
23
  readonly workflowId: string;
24
- readonly outcome: 'success' | 'error' | 'timeout' | 'delivery_failed';
24
+ readonly outcome: 'success' | 'error' | 'timeout' | 'stuck' | 'delivery_failed';
25
25
  readonly detail: string;
26
26
  readonly goal: string;
27
27
  readonly timestamp: string;
@@ -48,6 +48,8 @@ function buildNotificationBody(result, goal) {
48
48
  return `Session failed: ${truncated}`;
49
49
  case 'timeout':
50
50
  return `Session timed out: ${truncated}`;
51
+ case 'stuck':
52
+ return `Session stuck (${result.reason}): ${truncated}`;
51
53
  case 'delivery_failed':
52
54
  return `Session completed but result delivery failed: ${truncated}`;
53
55
  }
@@ -63,6 +65,8 @@ function buildDetail(result) {
63
65
  return result.message;
64
66
  case 'timeout':
65
67
  return result.message;
68
+ case 'stuck':
69
+ return result.message;
66
70
  case 'delivery_failed':
67
71
  return `stopReason: ${result.stopReason}; deliveryError: ${result.deliveryError}`;
68
72
  }
@@ -327,6 +327,10 @@ class TriggerRouter {
327
327
  console.log(`[TriggerRouter] Workflow failed: triggerId=${trigger.id} ` +
328
328
  `workflowId=${trigger.workflowId} error=${result.message} stopReason=${result.stopReason}`);
329
329
  }
330
+ else if (result._tag === 'stuck') {
331
+ console.log(`[TriggerRouter] Workflow stuck: triggerId=${trigger.id} ` +
332
+ `workflowId=${trigger.workflowId} reason=${result.reason} message=${result.message}`);
333
+ }
330
334
  else {
331
335
  (0, assert_never_js_1.assertNever)(result);
332
336
  }
@@ -366,6 +370,10 @@ class TriggerRouter {
366
370
  console.log(`[TriggerRouter] Dispatch failed: workflowId=${workflowTrigger.workflowId} ` +
367
371
  `error=${result.message} stopReason=${result.stopReason}`);
368
372
  }
373
+ else if (result._tag === 'stuck') {
374
+ console.log(`[TriggerRouter] Dispatch stuck: workflowId=${workflowTrigger.workflowId} ` +
375
+ `reason=${result.reason} message=${result.message}`);
376
+ }
369
377
  else {
370
378
  (0, assert_never_js_1.assertNever)(result);
371
379
  }
@@ -71,6 +71,8 @@ export interface TriggerDefinition {
71
71
  readonly model?: string;
72
72
  readonly maxSessionMinutes?: number;
73
73
  readonly maxTurns?: number;
74
+ readonly stuckAbortPolicy?: 'abort' | 'notify_only';
75
+ readonly noProgressAbortEnabled?: boolean;
74
76
  };
75
77
  readonly concurrencyMode: 'serial' | 'parallel';
76
78
  readonly callbackUrl?: string;
@@ -604,6 +604,9 @@ function mountConsoleRoutes(app, consoleService, workflowService, timingRingBuff
604
604
  else if (result._tag === 'error') {
605
605
  console.log(`[ConsoleRoutes] Auto dispatch failed: workflowId=${workflowId} error=${result.message}`);
606
606
  }
607
+ else if (result._tag === 'stuck') {
608
+ console.log(`[ConsoleRoutes] Auto dispatch stuck: workflowId=${workflowId} reason=${result.reason} message=${result.message}`);
609
+ }
607
610
  else {
608
611
  (0, assert_never_js_1.assertNever)(result);
609
612
  }
@@ -0,0 +1,183 @@
1
+ # Design Candidates: WorkTrain Stuck-Escalation
2
+
3
+ *Generated: 2026-04-19 | Pitch: .workrail/current-pitch.md*
4
+
5
+ ---
6
+
7
+ ## Problem Understanding
8
+
9
+ ### Core Tensions
10
+
11
+ 1. **Stuck vs timeout conflation**: When `repeated_tool_call` fires, the session
12
+ currently runs until wall-clock or max-turns timeout. The result is
13
+ `_tag: 'timeout'`, which is indistinguishable from a legitimate slow session.
14
+ Automated routing requires a distinct discriminant.
15
+
16
+ 2. **Abort vs notify-only independence**: Outbox notification and `agent.abort()`
17
+ are two separate effects. `notify_only` policy suppresses the abort but must
18
+ not suppress the outbox write. These effects must not be coupled.
19
+
20
+ 3. **ChildWorkflowRunResult atomic update**: The `as ChildWorkflowRunResult` cast
21
+ at line 2172 in `makeSpawnAgentTool` suppresses any compile-time error from a
22
+ missing union update. Only the `assertNever(childResult)` at line 2212 catches
23
+ the omission -- at runtime, crashing the parent session.
24
+
25
+ 4. **no_progress false-positive risk**: The no_progress heuristic can fire on
26
+ legitimate research workflows that spend many turns reading before advancing.
27
+ It must be opt-in (default: false) to avoid breaking existing sessions.
28
+
29
+ ### Likely Seam
30
+
31
+ The `turn_end` subscriber in `runWorkflow()` is the correct location for stuck detection and abort. All
32
+ required state (lastNToolCalls, stepAdvanceCount, timeoutReason, issueSummaries)
33
+ is available there as closure variables. Detection fires at the right moment
34
+ (after each turn, synchronously before next step injection).
35
+
36
+ ### What Makes This Hard
37
+
38
+ - The `as ChildWorkflowRunResult` cast is a type-safety trap: it silences
39
+ TypeScript while leaving a runtime crash. Only careful reading of the pitch
40
+ reveals the issue.
41
+ - `buildOutcome()` in notification-service.ts has return type
42
+ `NotificationPayload['outcome']`. Adding 'stuck' to WorkflowRunResult causes
43
+ a compile error there unless the outcome union is also widened.
44
+
45
+ ---
46
+
47
+ ## Philosophy Constraints
48
+
49
+ From CLAUDE.md:
50
+
51
+ - **Make illegal states unrepresentable**: the stuck discriminant prevents
52
+ conflating stuck with timeout at the type level.
53
+ - **Exhaustiveness everywhere**: assertNever guards in trigger-router and
54
+ makeSpawnAgentTool enforce this -- adding a stuck arm is required.
55
+ - **Errors are data**: WorkflowRunResult is a Result type; WorkflowRunStuck is
56
+ a new variant, not an exception.
57
+ - **Type safety as first line of defense**: ChildWorkflowRunResult update in
58
+ same commit restores the compile-time invariant that the cast broke.
59
+ - **Fire-and-forget for side effects**: outbox write uses void + catch, same
60
+ as DaemonEventEmitter and issue recording.
61
+
62
+ No conflicts between stated philosophy and repo patterns.
63
+
64
+ ---
65
+
66
+ ## Impact Surface
67
+
68
+ Paths that must stay consistent when WorkflowRunResult gains a new variant:
69
+
70
+ 1. `makeSpawnAgentTool` -- `assertNever(childResult)` at line 2212; requires
71
+ ChildWorkflowRunResult update and a new `stuck` arm in the result mapping.
72
+ 2. `trigger-router.ts` `route()` -- exhaustive if-else chain ending in
73
+ `assertNever(result)` at line ~689.
74
+ 3. `trigger-router.ts` `dispatch()` -- same exhaustive chain at line ~770.
75
+ 4. `notification-service.ts` `buildNotificationBody()` -- exhaustive switch.
76
+ 5. `notification-service.ts` `buildDetail()` -- exhaustive switch.
77
+ 6. `notification-service.ts` `buildOutcome()` -- return type
78
+ `NotificationPayload['outcome']`; 'stuck' must be added to that union.
79
+ 7. `NotificationPayload.outcome` union -- currently
80
+ `'success' | 'error' | 'timeout' | 'delivery_failed'`; must add `'stuck'`.
81
+
82
+ ---
83
+
84
+ ## Candidates
85
+
86
+ ### Candidate A: New `_tag: 'stuck'` discriminated union variant (SELECTED)
87
+
88
+ **Summary**: Add `WorkflowRunStuck` interface with `_tag: 'stuck'`, wire abort
89
+ in turn_end subscriber after Signal 1 and Signal 2 emitter calls, return stuck
90
+ result before timeout check, update both `WorkflowRunResult` and
91
+ `ChildWorkflowRunResult` unions atomically, add `writeStuckOutboxEntry` helper.
92
+
93
+ **Tensions resolved**:
94
+ - Stuck/timeout conflation: separate discriminant, separate return path.
95
+ - Abort/notify independence: outbox write fires before the abort gate check.
96
+ - ChildWorkflowRunResult crash: atomic update with assertNever arm added.
97
+ - no_progress false-positive: gated by `noProgressAbortEnabled: false` default.
98
+
99
+ **Boundary solved at**: `turn_end` subscriber (detection + abort), result
100
+ construction (return), 4 files for propagation to callers.
101
+
102
+ **Why best-fit boundary**: The turn_end subscriber is the only location with
103
+ access to all required state. The result construction is the canonical output
104
+ boundary for runWorkflow(). Propagation to callers follows the existing
105
+ WorkflowRunResult variant fan-out pattern.
106
+
107
+ **Failure mode**: Forgetting to update the `NotificationPayload.outcome` union --
108
+ caught by `npm run build` (TypeScript compile error in `buildOutcome()`).
109
+
110
+ **Repo-pattern relationship**: Mirrors `timeoutReason` flag pattern exactly.
111
+ Mirrors `WorkflowRunTimeout` interface field shape. Follows assertNever guard
112
+ pattern already established in trigger-router and makeSpawnAgentTool.
113
+
114
+ **Gains**: Distinct routing for stuck sessions, type-safe callers, clean
115
+ separation of abort and notification effects.
116
+
117
+ **Losses**: One more variant in the union (minor cognitive load increase).
118
+
119
+ **Scope judgment**: Best-fit. 4 files, mechanical wiring, all design resolved.
120
+
121
+ **Philosophy fit**: Honors all relevant CLAUDE.md principles. No conflicts.
122
+
123
+ ---
124
+
125
+ ### Candidate B: Extend `WorkflowRunTimeout.reason` with stuck sub-values
126
+
127
+ **Summary**: Add `'stuck_repeated_tool_call' | 'stuck_no_progress'` to
128
+ `WorkflowRunTimeout.reason` -- reuse the timeout discriminant.
129
+
130
+ **Tensions resolved**: None of the core ones. Stuck and timeout still share
131
+ `_tag: 'timeout'`, requiring callers to inspect reason to distinguish them.
132
+
133
+ **Failure mode**: Violates make-illegal-states-unrepresentable. Callers using
134
+ `result._tag === 'timeout'` would silently handle stuck sessions as timeouts.
135
+
136
+ **Repo-pattern relationship**: Departs from the exhaustiveness-everywhere
137
+ pattern. The assertNever guard pattern exists precisely to avoid this.
138
+
139
+ **Scope judgment**: Too narrow -- preserves the routing problem this pitch
140
+ exists to solve.
141
+
142
+ **Rejected because**: Violates philosophy, does not resolve the core tension,
143
+ and the pitch explicitly rejects conflating stuck with timeout.
144
+
145
+ ---
146
+
147
+ ## Comparison and Recommendation
148
+
149
+ Candidate A is the only viable candidate. All analysis converges.
150
+
151
+ The core recommendation is to implement Candidate A exactly as specified in
152
+ `.workrail/current-pitch.md`, with one addition not noted in the pitch:
153
+ update the `NotificationPayload.outcome` union to include `'stuck'` (required for
154
+ `buildOutcome()` to compile).
155
+
156
+ ---
157
+
158
+ ## Self-Critique
159
+
160
+ **Strongest counter-argument**: Adding a 5th variant to WorkflowRunResult
161
+ increases cognitive load for callers. Counter: assertNever guards make missing
162
+ cases compile errors, which is the correct safeguard. The complexity cost is
163
+ paid once (at implementation) and enforced automatically.
164
+
165
+ **Narrower option that lost**: Update only WorkflowRunResult, skip
166
+ ChildWorkflowRunResult. Lost because: runtime crash in makeSpawnAgentTool
167
+ when a child hits stuck-abort. The cast at line 2172 provides no protection.
168
+
169
+ **Broader option not justified**: Adding `onStuck:` hook to TriggerDefinition.
170
+ Explicitly deferred per pitch No-Gos. Would require trigger-store.ts parser
171
+ changes -- outside the 4-file scope.
172
+
173
+ **Pivot condition**: If `assertNever(childResult)` were removed in favor of a
174
+ logged fallback, ChildWorkflowRunResult update would be less critical. It is
175
+ not removed, so the atomic update is required.
176
+
177
+ ---
178
+
179
+ ## Open Questions for the Main Agent
180
+
181
+ None. All design decisions are resolved in the pitch. The only implementation
182
+ detail requiring attention is the `NotificationPayload.outcome` union widening
183
+ (add 'stuck') -- verify this compiles before finalizing.
@@ -0,0 +1,93 @@
1
+ # Design Review Findings: WorkTrain Stuck-Escalation
2
+
3
+ *Generated: 2026-04-19 | Pitch: .workrail/current-pitch.md*
4
+
5
+ ---
6
+
7
+ ## Tradeoff Review
8
+
9
+ | Tradeoff | Status | Condition for Failure |
10
+ |----------|--------|-----------------------|
11
+ | One more union variant in WorkflowRunResult | Acceptable | All callers use assertNever guards -- compile error enforces handling |
12
+ | ChildWorkflowRunResult atomic update relies on discipline | Managed | Fails only if commit is split; mitigated by single-PR implementation and compile-time test |
13
+ | NotificationPayload.outcome union widening (gap, not tradeoff) | Resolved | Add 'stuck' to outcome union; caught by npm run build |
14
+
15
+ ---
16
+
17
+ ## Failure Mode Review
18
+
19
+ | Failure Mode | Severity | Design Handling | Missing Mitigation |
20
+ |--------------|----------|-----------------|--------------------|
21
+ | ChildWorkflowRunResult not updated | High | Atomic commit, compile-time assignability test | None beyond discipline |
22
+ | stuckReason / timeoutReason race | Low | First-writer-wins guard; max_turns early return prevents race | None needed |
23
+ | writeStuckOutboxEntry fails | Low | Fire-and-forget, console.warn on error | None -- intentional |
24
+ | no_progress fires on research workflow | Low | noProgressAbortEnabled defaults to false | None needed |
25
+ | NotificationPayload.outcome compile error | Medium | Add 'stuck' to union | None -- caught at build |
26
+
27
+ ---
28
+
29
+ ## Runner-Up / Simpler Alternative Review
30
+
31
+ - **Candidate B** (extend WorkflowRunTimeout.reason): No elements worth borrowing.
32
+ Does not resolve the core routing tension.
33
+ - **Skip ChildWorkflowRunResult**: Not acceptable -- runtime crash in parent session.
34
+ - **Skip sessionStartMs**: Not recommended -- the pitch explicitly adds it for the Signal 5 follow-up
35
+ to avoid future restructuring.
36
+ - **Inline outbox write**: Works but reduces turn_end subscriber readability. Not worth it.
37
+
38
+ No hybrid opportunities identified.
39
+
40
+ ---
41
+
42
+ ## Philosophy Alignment
43
+
44
+ | Principle | Status |
45
+ |-----------|--------|
46
+ | Make illegal states unrepresentable | Satisfied |
47
+ | Exhaustiveness everywhere | Satisfied |
48
+ | Errors are data | Satisfied |
49
+ | Immutability by default | Satisfied |
50
+ | Type safety as first line of defense | Under tension (pre-existing cast; improved but not fully resolved) |
51
+ | Fire-and-forget for side effects | Satisfied |
52
+
53
+ ---
54
+
55
+ ## Findings
56
+
57
+ ### Yellow: NotificationPayload.outcome union widening not specified in pitch
58
+
59
+ The pitch states 'buildOutcome() returns result._tag directly -- no change needed'.
60
+ However, the return type annotation `NotificationPayload['outcome']` will cause a
61
+ TypeScript compile error when 'stuck' is added to WorkflowRunResult but not to the
62
+ outcome union. **Resolution**: add `'stuck'` to `NotificationPayload.outcome` union
63
+ in notification-service.ts. This is a mechanical fix, not a design change.
64
+
65
+ ### Yellow: Pre-existing `as ChildWorkflowRunResult` cast at line 2172
66
+
67
+ The cast suppresses TypeScript's compile-time check that would otherwise catch a
68
+ missing ChildWorkflowRunResult update. This PR updates the union and adds a
69
+ compile-time assignability test to partially compensate. Removing the cast is
70
+ out of scope. **Residual concern**: future union additions must be caught by the
71
+ test rather than the compiler.
72
+
73
+ ---
74
+
75
+ ## Recommended Revisions
76
+
77
+ 1. Add `'stuck'` to `NotificationPayload.outcome` union (not in pitch, required for compile).
78
+ 2. Add compile-time assignability test for `ChildWorkflowRunResult` in the test file.
79
+ 3. Document the `as ChildWorkflowRunResult` cast issue in a code comment at line 2172
80
+ (or verify that the existing comment is sufficient).
81
+
82
+ ---
83
+
84
+ ## Residual Concerns
85
+
86
+ - The `as ChildWorkflowRunResult` cast remains. Future contributors adding a new
87
+ WorkflowRunResult variant may forget to update ChildWorkflowRunResult. The
88
+ compile-time test in the stuck-escalation test file partially mitigates this,
89
+ but only for the stuck variant. A broader structural fix (removing the cast)
90
+ is a follow-up.
91
+ - Webhook consumers reading `outcome: 'stuck'` must handle the new value.
92
+ This is a new feature, not a breaking change, but operators consuming the
93
+ webhook should be aware.