@exaudeus/workrail 3.73.2 → 3.74.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-worktrain.js +126 -1
- package/dist/console-ui/assets/{index-CfI4I3OX.js → index-CfU3va8H.js} +1 -1
- package/dist/console-ui/index.html +1 -1
- package/dist/coordinators/pr-review.d.ts +11 -1
- package/dist/coordinators/types.d.ts +15 -0
- package/dist/coordinators/types.js +2 -0
- package/dist/manifest.json +17 -9
- package/dist/trigger/coordinator-deps.js +203 -36
- package/docs/authoring.md +23 -0
- package/docs/ideas/backlog.md +100 -65
- package/package.json +1 -1
- package/spec/authoring-spec.json +36 -1
package/dist/manifest.json
CHANGED
```diff
@@ -238,8 +238,8 @@
       "bytes": 31
     },
     "cli-worktrain.js": {
-      "sha256": "
-      "bytes":
+      "sha256": "fe38aca6a553491f3c9e0bd916cedbcd6a58d5cce230b1f47a52066f0b622851",
+      "bytes": 67617
     },
     "cli.d.ts": {
       "sha256": "43e818adf60173644896298637f47b01d5819b17eda46eaa32d0c7d64724d012",
@@ -473,8 +473,8 @@
       "sha256": "5fe866e54f796975dec5d8ba9983aefd86074db212d3fccd64eed04bc9f0b3da",
       "bytes": 8011
     },
-    "console-ui/assets/index-
-      "sha256": "
+    "console-ui/assets/index-CfU3va8H.js": {
+      "sha256": "7f7a29c1c312c2966f68131e1ade49ab0daf94ffd425c5c2d0b21add58c0028f",
       "bytes": 768234
     },
     "console-ui/assets/index-DHrKiMCf.css": {
@@ -482,7 +482,7 @@
       "bytes": 60673
     },
     "console-ui/index.html": {
-      "sha256": "
+      "sha256": "3f08f8bf69e386e4ceca01a42720523c8e667635e7c5fbf2f3038540f29681e1",
       "bytes": 417
     },
     "console/standalone-console.d.ts": {
@@ -574,8 +574,8 @@
       "bytes": 1198
     },
     "coordinators/pr-review.d.ts": {
-      "sha256": "
-      "bytes":
+      "sha256": "0dba830dd29cd82c58300ca9fdfb4c29d0acd0b257740ce3e65f2360239a106b",
+      "bytes": 4501
     },
     "coordinators/pr-review.js": {
       "sha256": "385baa9e6252dbd84060bb423ce219884d519752f4a6e9f8f04e5f503fa38b67",
@@ -589,6 +589,14 @@
       "sha256": "195953d6c0e28d749407e5e3fb29b963cf14629c03508792a44bf0866f7c9d33",
       "bytes": 1815
     },
+    "coordinators/types.d.ts": {
+      "sha256": "913320eccf4737884fe73446e82ab0becbeeec55fa704c0613f19afe31a120aa",
+      "bytes": 413
+    },
+    "coordinators/types.js": {
+      "sha256": "d43aa81f5bc89faa359e0f97c814ba25155591ff078fbb9bfd40f8c7c9683230",
+      "bytes": 77
+    },
     "core/error-handler.d.ts": {
       "sha256": "80451f12ac8e185133ec3dc4c57285491a785f27525ed21e729db1da3f61010d",
       "bytes": 1368
@@ -1786,8 +1794,8 @@
       "bytes": 854
     },
     "trigger/coordinator-deps.js": {
-      "sha256": "
-      "bytes":
+      "sha256": "1b400abbd6158e900d4559c74b9f3069a46b60fd02d046f36e30cc40a58acb51",
+      "bytes": 22626
     },
     "trigger/delivery-action.d.ts": {
       "sha256": "bba98a08e35653304b604cd3ec126374cb731620db27ee2c8d6782d5b5b31207",
```
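The manifest maps each dist-relative file path to a `sha256` digest and a byte count. As a rough illustration of how those two fields could be checked against an installed copy of the package, here is a hypothetical helper (not part of WorkRail; the dist-root path in the usage comment is an assumption):

```ts
// Hypothetical integrity check against the manifest shape shown above:
// each entry maps a dist-relative path to { sha256, bytes }.
import { createHash } from 'node:crypto';
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';

interface ManifestEntry {
  sha256: string;
  bytes: number;
}

async function verifyManifestEntry(
  distRoot: string,
  relPath: string,
  entry: ManifestEntry,
): Promise<boolean> {
  const buf = await readFile(join(distRoot, relPath));
  const digest = createHash('sha256').update(buf).digest('hex');
  // Both the digest and the on-disk size must match the manifest entry.
  return digest === entry.sha256 && buf.byteLength === entry.bytes;
}

// Example (paths and install location assumed):
// await verifyManifestEntry('node_modules/@exaudeus/workrail/dist', 'cli-worktrain.js',
//   { sha256: 'fe38aca6a553491f3c9e0bd916cedbcd6a58d5cce230b1f47a52066f0b622851', bytes: 67617 });
```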
package/dist/trigger/coordinator-deps.js
CHANGED
```diff
@@ -45,6 +45,102 @@ const infra_js_1 = require("../context-assembly/infra.js");
 function createCoordinatorDeps(deps) {
     const { ctx, execFileAsync, consoleService } = deps;
     let dispatch = null;
+    async function fetchAgentResult(sessionHandle) {
+        const emptyResult = { recapMarkdown: null, artifacts: [] };
+        if (consoleService === null) {
+            return emptyResult;
+        }
+        try {
+            const detailResult = await consoleService.getSessionDetail(sessionHandle);
+            if (detailResult.isErr())
+                return emptyResult;
+            const run = detailResult.value.runs[0];
+            if (!run)
+                return emptyResult;
+            const tipNodeId = run.preferredTipNodeId;
+            if (!tipNodeId)
+                return emptyResult;
+            const allNodeIds = run.nodes
+                .map((n) => n.nodeId)
+                .filter((id) => typeof id === 'string' && id !== '');
+            const nodeIdsToFetch = allNodeIds.length > 0 ? allNodeIds : [tipNodeId];
+            let recap = null;
+            const collectedArtifacts = [];
+            for (const nodeId of nodeIdsToFetch) {
+                try {
+                    const nodeResult = await consoleService.getNodeDetail(sessionHandle, nodeId);
+                    if (nodeResult.isErr())
+                        continue;
+                    if (nodeId === tipNodeId)
+                        recap = nodeResult.value.recapMarkdown;
+                    if (nodeResult.value.artifacts.length > 0)
+                        collectedArtifacts.push(...nodeResult.value.artifacts);
+                }
+                catch {
+                    continue;
+                }
+            }
+            return { recapMarkdown: recap, artifacts: collectedArtifacts };
+        }
+        catch (e) {
+            const msg = e instanceof Error ? e.message : String(e);
+            process.stderr.write(`[WARN coord:reason=exception handle=${sessionHandle.slice(0, 16)}] fetchAgentResult: ${msg}\n`);
+            return emptyResult;
+        }
+    }
+    async function fetchChildSessionResult(handle, coordinatorSessionId) {
+        if (consoleService === null) {
+            process.stderr.write(`[WARN coord:reason=await_degraded handle=${handle.slice(0, 16)}${coordinatorSessionId ? ' parent=' + coordinatorSessionId.slice(0, 16) : ''}] fetchChildSessionResult: ConsoleService unavailable\n`);
+            return {
+                kind: 'await_degraded',
+                message: 'ConsoleService unavailable -- cannot read child session outcome',
+            };
+        }
+        let runStatus = null;
+        try {
+            const detailResult = await consoleService.getSessionDetail(handle);
+            if (detailResult.isErr()) {
+                process.stderr.write(`[WARN coord:reason=getSessionDetail_failed handle=${handle.slice(0, 16)}] fetchChildSessionResult: ${String(detailResult.error)}\n`);
+                return {
+                    kind: 'failed',
+                    reason: 'error',
+                    message: `Could not read session detail: ${String(detailResult.error)}`,
+                };
+            }
+            const run = detailResult.value.runs[0];
+            runStatus = run?.status ?? null;
+        }
+        catch (e) {
+            const msg = e instanceof Error ? e.message : String(e);
+            process.stderr.write(`[WARN coord:reason=exception handle=${handle.slice(0, 16)}] fetchChildSessionResult getSessionDetail: ${msg}\n`);
+            return { kind: 'failed', reason: 'error', message: `Exception reading session detail: ${msg}` };
+        }
+        if (runStatus === 'complete' || runStatus === 'complete_with_gaps') {
+            const agentResult = await fetchAgentResult(handle);
+            return {
+                kind: 'success',
+                notes: agentResult.recapMarkdown,
+                artifacts: agentResult.artifacts,
+            };
+        }
+        if (runStatus === 'blocked') {
+            return {
+                kind: 'failed',
+                reason: 'stuck',
+                message: `Child session ${handle.slice(0, 16)} reached blocked state`,
+            };
+        }
+        if (runStatus === null) {
+            return {
+                kind: 'timed_out',
+                message: `Child session ${handle.slice(0, 16)} has no terminal run status (likely timed out)`,
+            };
+        }
+        return {
+            kind: 'timed_out',
+            message: `Child session ${handle.slice(0, 16)} is still in state '${runStatus}' -- awaitSessions may not have been called`,
+        };
+    }
     return {
         setDispatch(fn) {
             if (dispatch !== null) {
@@ -53,11 +149,16 @@ function createCoordinatorDeps(deps) {
             }
             dispatch = fn;
         },
-        spawnSession: async (workflowId, goal, workspace, context, agentConfig) => {
+        spawnSession: async (workflowId, goal, workspace, context, agentConfig, parentSessionId) => {
             if (dispatch === null) {
                 return { kind: 'err', error: 'in-process router not initialized -- coordinator deps not ready' };
             }
-            const startResult = await (0, start_js_1.executeStartWorkflow)({ workflowId, workspacePath: workspace, goal }, ctx, {
+            const startResult = await (0, start_js_1.executeStartWorkflow)({ workflowId, workspacePath: workspace, goal }, ctx, {
+                is_autonomous: 'true',
+                workspacePath: workspace,
+                triggerSource: 'daemon',
+                ...(parentSessionId !== undefined ? { parentSessionId } : {}),
+            });
             if (startResult.isErr()) {
                 const detail = `${startResult.error.kind}${'message' in startResult.error ? ': ' + startResult.error.message : ''}`;
                 return { kind: 'err', error: `Session creation failed: ${detail}` };
@@ -173,47 +274,113 @@ function createCoordinatorDeps(deps) {
             };
         },
         getAgentResult: async (sessionHandle) => {
-
-
-
+            return fetchAgentResult(sessionHandle);
+        },
+        getChildSessionResult: async (handle, coordinatorSessionId) => {
+            return fetchChildSessionResult(handle, coordinatorSessionId);
+        },
+        spawnAndAwait: async (workflowId, goal, workspace, opts) => {
+            const DEFAULT_TIMEOUT_MS = 15 * 60 * 1000;
+            const timeoutMs = opts?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+            const coordinatorSessionId = opts?.coordinatorSessionId;
+            const agentConfig = opts?.agentConfig;
+            if (dispatch === null) {
+                return {
+                    kind: 'failed',
+                    reason: 'error',
+                    message: 'spawnAndAwait: in-process router not initialized (setDispatch not called)',
+                };
             }
-
-
-
-
-
-
-
-            const
-
-
-
-
-
-
-
-
-
-
-
-
+            const startResult = await (0, start_js_1.executeStartWorkflow)({ workflowId, workspacePath: workspace, goal }, ctx, {
+                is_autonomous: 'true',
+                workspacePath: workspace,
+                triggerSource: 'daemon',
+                ...(coordinatorSessionId !== undefined ? { parentSessionId: coordinatorSessionId } : {}),
+            });
+            if (startResult.isErr()) {
+                const detail = `${startResult.error.kind}${'message' in startResult.error ? ': ' + startResult.error.message : ''}`;
+                return { kind: 'failed', reason: 'error', message: `Session creation failed: ${detail}` };
+            }
+            const startContinueToken = startResult.value.response.continueToken;
+            let handle;
+            if (!startContinueToken) {
+                handle = workflowId;
+            }
+            else {
+                const tokenResult = await (0, v2_token_ops_js_1.parseContinueTokenOrFail)(startContinueToken, ctx.v2.tokenCodecPorts, ctx.v2.tokenAliasStore);
+                if (tokenResult.isErr()) {
+                    return {
+                        kind: 'failed',
+                        reason: 'error',
+                        message: `Internal error: could not extract session handle from new session: ${tokenResult.error.message}`,
+                    };
+                }
+                handle = tokenResult.value.sessionId;
+                const trigger = {
+                    workflowId,
+                    goal,
+                    workspacePath: workspace,
+                    ...(agentConfig !== undefined ? { agentConfig } : {}),
+                };
+                const r = startResult.value.response;
+                const allocatedSession = {
+                    continueToken: r.continueToken ?? '',
+                    checkpointToken: r.checkpointToken,
+                    firstStepPrompt: r.pending?.prompt ?? '',
+                    isComplete: r.isComplete,
+                    triggerSource: 'daemon',
+                };
+                const source = {
+                    kind: 'pre_allocated',
+                    trigger,
+                    session: allocatedSession,
+                };
+                dispatch(trigger, source);
+            }
+            const awaitResult = await (async () => {
+                const POLL_INTERVAL_MS = 3000;
+                if (consoleService === null) {
+                    return null;
+                }
+                const startMs = Date.now();
+                const pending = new Set([handle]);
+                while (pending.size > 0) {
+                    const elapsed = Date.now() - startMs;
+                    if (elapsed >= timeoutMs)
+                        break;
+                    for (const h of [...pending]) {
+                        try {
+                            const detail = await consoleService.getSessionDetail(h);
+                            if (detail.isErr())
+                                continue;
+                            const run = detail.value.runs[0];
+                            if (!run)
+                                continue;
+                            const status = run.status;
+                            if (status === 'complete' || status === 'complete_with_gaps') {
+                                pending.delete(h);
+                            }
+                            else if (status === 'blocked') {
+                                pending.delete(h);
+                            }
                         }
-
-
+                        catch {
+                            pending.delete(h);
                         }
                     }
-
-
+                    if (pending.size > 0) {
+                        await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
                     }
                 }
-            return
-            }
-
-
-
-
+                return handle;
+            })();
+            if (awaitResult === null) {
+                return {
+                    kind: 'await_degraded',
+                    message: 'ConsoleService unavailable -- cannot await child session outcome',
+                };
             }
+            return fetchChildSessionResult(handle, coordinatorSessionId);
         },
         listOpenPRs: async (workspace) => {
             try {
```
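The new `spawnAndAwait` / `getChildSessionResult` surface returns a discriminated result whose `kind` is one of `success`, `failed`, `timed_out`, or `await_degraded` (the typed union lives in `coordinators/types.d.ts`, added in this release but not shown in this diff). A minimal consumer sketch follows; the interface, field types, workflow id, and workspace path are assumptions read off the compiled code above rather than taken from the published typings:

```ts
// Hypothetical shapes inferred from dist/trigger/coordinator-deps.js above.
type ChildSessionResult =
  | { kind: 'success'; notes: string | null; artifacts: unknown[] }
  | { kind: 'failed'; reason: 'error' | 'stuck'; message: string }
  | { kind: 'timed_out'; message: string }
  | { kind: 'await_degraded'; message: string };

interface CoordinatorDepsLike {
  spawnAndAwait(
    workflowId: string,
    goal: string,
    workspace: string,
    opts?: { timeoutMs?: number; coordinatorSessionId?: string; agentConfig?: unknown },
  ): Promise<ChildSessionResult>;
}

// Spawn a child coding session and branch on the discriminated result.
async function runChildTask(deps: CoordinatorDepsLike, coordinatorSessionId: string): Promise<void> {
  const result = await deps.spawnAndAwait(
    'wr.coding-task',
    'Fix the forEach loop exit condition',
    '/path/to/workspace', // hypothetical workspace
    { timeoutMs: 15 * 60 * 1000, coordinatorSessionId },
  );

  switch (result.kind) {
    case 'success':
      // notes carries the child session's recap markdown; artifacts are collected from all nodes.
      console.log('child finished:', result.notes ?? '(no recap)');
      break;
    case 'failed':
      // reason is 'stuck' when the child run ended blocked, 'error' otherwise.
      console.error(`child failed (${result.reason}): ${result.message}`);
      break;
    case 'timed_out':
    case 'await_degraded':
      console.warn(result.message);
      break;
  }
}
```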
package/docs/authoring.md
CHANGED
```diff
@@ -761,6 +761,28 @@ Canonical current rules for authoring good WorkRail workflows. workflow.schema.j
 
 
 ## Artifacts and planning surfaces
+### coordinator-result-artifact-schema
+- **Level**: required
+- **Status**: active
+- **Scope**: artifact.coordinator-result
+- **Rule**: When a workflow step signals coordinator phase completion, emit a `wr.coordinator_result` artifact with exactly 4 fields: `outcome` (enum: success|failed|timed_out|await_degraded), `summary` (string), `sessionId` (string), `error` (string|null). No additional fields allowed.
+- **Why**: Coordinators read this artifact to determine whether to proceed, retry, or escalate. Extra fields pollute the schema boundary and break forward compatibility. The 4-field constraint is a hard limit, not a guideline.
+- **Enforced by**: advisory
+
+**Checks**
+- Exactly 4 fields present: outcome, summary, sessionId, error.
+- outcome is one of: success, failed, timed_out, await_degraded.
+- error is string|null -- null when outcome is success, non-null string when outcome is failed.
+- No workflow-specific fields (prUrl, branchName, commitSha, etc.) in wr.coordinator_result. Those belong in workflow-specific artifacts.
+
+**Anti-patterns**
+- Adding prUrl, branchName, or commitSha to wr.coordinator_result
+- Using a free-form notes string instead of the typed outcome enum
+- Omitting sessionId (required for coordinator tracing and console parent-child display)
+
+**Source refs**
+- `src/coordinators/types.ts` (runtime) — ChildSessionResult discriminated union -- the runtime type that wr.coordinator_result maps to.
+
 ### artifact-canonicality
 - **Level**: recommended
 - **Status**: active
@@ -913,6 +935,7 @@ Canonical current rules for authoring good WorkRail workflows. workflow.schema.j
 - `artifact.plan`: Implementation-planning artifacts
 - `artifact.spec`: Behavior/specification artifacts
 - `artifact.verification`: Verification or handoff artifacts
+- `artifact.coordinator-result`: wr.coordinator_result artifact emitted by coordinator-phase workflows to signal phase completion to the coordinator
 - `delegation.context-packet`: Structured context passed to subagents
 - `delegation.result-envelope`: Structured result shape returned by subagents
 - `legacy.patterns`: Older authoring patterns that should now be discouraged or avoided
```
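For illustration of the coordinator-result rule added above, a conforming payload carries exactly the four named fields and nothing else; the concrete values in this sketch are invented:

```ts
// Hypothetical wr.coordinator_result payload; only the four fields named by the
// rule are present. Values are illustrative, not taken from a real session.
const coordinatorResult: {
  outcome: 'success' | 'failed' | 'timed_out' | 'await_degraded';
  summary: string;
  sessionId: string;
  error: string | null;
} = {
  outcome: 'success',
  summary: 'PR review phase completed; no blocking findings.',
  sessionId: 'sess_0123456789abcdef', // required for coordinator tracing and console parent-child display
  error: null, // a non-null string when outcome is 'failed'
};
```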
package/docs/ideas/backlog.md
CHANGED
```diff
@@ -24,76 +24,23 @@ See the scoring rubric in the "Agent-assisted backlog prioritization" entry (Wor
 
 **Score: 13** | Cor:3 Cap:1 Eff:2 Lev:2 Con:3 | Blocked: no
 
-The `wr.coding-task` workflow's implementation loop (up to 20 passes) does not exit when
+The `wr.coding-task` workflow's implementation loop (up to 20 passes) does not exit when a `wr.loop_control` stop artifact is emitted. The loop ran 8 passes before stopping -- not because of the artifact, but because it exhausted its slice array.
 
-
+**Root cause (confirmed by investigation)**: `phase-6-implement-slices` is a `forEach` loop, not a `while`/`until` loop with `artifact_contract`. The `wr.loop_control` stop artifact mechanism **only works for `while`/`until` loops** that declare `conditionSource.kind = artifact_contract`. For `forEach` loops, `shouldEnterIteration` checks only `iteration < slices.length` -- artifacts passed to `interpreter.next()` are never consulted. Confirmed in `workflow-interpreter.ts:254-273` and verified by a direct test (3-slice forEach with stop artifact on every call ran all 3 iterations to completion).
 
-**
+**Why the loop stopped at pass 8**: the loop exhausted its `slices` array which had exactly 8 elements. `metrics_outcome = success` appearing at pass 8 was a coincidence.
 
-
-- Is the bug in the workflow JSON (slices not wired to currentSlice tracking), in the engine (loop_control artifact evaluation), or in the way context variables are threaded between passes?
-- Does the issue affect all loops with `wr.loop_control`, or only the implementation loop in `wr.coding-task` specifically?
-- Is there a workaround agents can use today (e.g. setting a specific context variable that the loop decision gate does check)?
-- Should the loop decision gate fire after every pass regardless of `currentSlice.name` state, or only when the slice tracking is valid?
-
----
-
-### Intent gap: agent builds what it understood, not what the user meant (Apr 30, 2026)
-
-**Status: idea** | Priority: high
-
-**Score: 13** | Cor:3 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
-
-This is one of the most fundamental failure modes for autonomous WorkTrain sessions and a blocker for production viability. An agent receives a task description, forms an interpretation of what's needed, and executes flawlessly against that interpretation -- but the interpretation was wrong. The code is correct for what the agent thought was asked. It is not what the user actually wanted. The user only discovers this after reviewing the PR, sometimes after it has already merged.
-
-This is categorically different from bugs (the agent implemented the right thing incorrectly) and scope creep (the agent did extra things). This is the agent solving the wrong problem well.
-
-**Why it's hard:** the agent's interpretation feels reasonable from the task description. The user's description was ambiguous, underspecified, or relied on context the agent didn't have. Neither party made an obvious mistake -- the gap is structural.
-
-**Known manifestations:**
-- Agent fixes the symptom instead of the root cause because the task description named the symptom
-- Agent implements feature X when the user wanted feature Y that happens to use X
-- Agent interprets "add support for Z" as extending the existing system when the user wanted a new abstraction
-- Agent makes a local fix when the user wanted an architectural change
-- Agent's implementation is technically correct but violates unstated invariants the user assumed were obvious
-
-**Things to hash out:**
-- Where in the workflow should intent validation happen? Before the agent writes any code (Phase 0), the agent should be required to state its interpretation back in plain English. The user (or a validation step) confirms or corrects it before implementation begins. But this requires a human confirmation gate -- does that break the autonomous use case?
-- For fully autonomous sessions (no human in the loop), is there a way to detect a likely intent gap before the agent commits? Signals might include: the task description is short or vague, the agent's interpretation involves a significant architectural decision, the agent is about to delete or restructure existing code.
-- What is the right escalation path when the agent detects ambiguity itself? Currently `report_issue` handles task obstacles; there is no structured way for the agent to surface "I am not sure I understood this correctly" before acting.
-- The `wr.shaping` workflow exists precisely to close this gap for planned features -- the issue is urgent/reactive tasks that skip shaping entirely. How do we get intent validation without requiring a full shaping pass for every small task?
-- Can historical session notes help? If previous sessions have established what "X" means in this codebase (design decisions, naming conventions, architectural invariants), injecting that context before Phase 0 reduces the gap. This points toward the knowledge graph and persistent project memory as partial solutions.
-- Should WorkTrain have an explicit "confirm interpretation" step as a configurable option per trigger? A `requireIntentConfirmation: true` flag on the trigger that blocks autonomous start until the operator approves the agent's stated interpretation via the console or CLI.
-
----
-
-### Scope rationalization: agent silently accepts collateral damage (Apr 30, 2026)
+**`currentSlice.name` showing `[unset]`**: secondary issue. `buildLoopRenderContext` in `prompt-renderer.ts:190-197` requires `sessionContext['slices']` to be an array at render time. If the `slices` context had not yet been projected into `sessionContext`, or if the slice objects lacked a `name` property, templates render as `[unset: currentSlice.name]`.
 
-**
-
-
-
-When an agent makes a change that breaks or degrades something outside its immediate task scope, it often recognizes the impact but rationalizes it as acceptable because "that's not in scope for this task." The reasoning feels locally valid -- the agent was asked to do X, X is done correctly, the side effect on Y is noted but deprioritized. This produces a PR that is correct for X and silently broken for Y.
-
-This is exactly what happened with the commit SHA change: setting `agentCommitShas` to always empty correctly fixes the faked SHA bug, but degrades the console's SHA display for all sessions going forward. A scoped agent might note "this makes the console show empty SHAs" and proceed anyway because fixing the console display is "a separate ticket."
-
-**Why this is insidious:** the agent's reasoning is locally coherent. It did not make a mistake within its scope. The problem is that autonomous agents operating in isolation cannot always see when a locally correct change has unacceptable global consequences -- and even when they can see it, they lack a good mechanism to stop, escalate, and surface the impact rather than proceeding.
-
-**Known manifestations:**
-- Agent correctly fixes a bug but the fix changes a public API contract, breaking callers it didn't check
-- Agent refactors a module for clarity but silently changes behavior in an edge case it considered minor
-- Agent adds a feature but disables or degrades an existing feature as a side effect, judging the tradeoff acceptable on its own
-- Agent's change passes all tests but the tests don't cover the degraded behavior
-- Agent notes a downstream impact in session notes but does not block, escalate, or file a follow-up ticket
-- **Agent reframes a bug as "a key tradeoff to document."** This is a specific and common failure: the agent detects a real problem it caused, correctly identifies that it's a problem, and instead of filing it as a bug or escalating, reclassifies it as an "accepted design decision" or "known limitation" in documentation. The bug is real. Documenting it is not fixing it. This pattern actively buries bugs.
+**Three fix directions:**
+1. **Authoring fix**: change `phase-6-implement-slices` from `forEach` to a `while` with `artifact_contract` and add an explicit exit-decision step -- agents can then signal completion via `wr.loop_control`
+2. **Engine feature**: add early-exit support to `forEach` loops when a `wr.loop_control` stop artifact is emitted
+3. **Prompt fix**: if forEach-exhausts-all-slices is the intent, remove the instruction that tells the agent to emit `wr.loop_control` artifacts
 
 **Things to hash out:**
--
--
--
-- Test coverage is the obvious mitigation -- if Y has tests, the agent's change would fail them. But not everything has tests, and agents can rationalize skipping test runs for "unrelated" paths.
-- Is there a way to detect likely collateral damage statically before the agent acts? A pre-commit check that measures what changed beyond the declared `filesChanged` list, for example, could surface unexpected side effects automatically.
-- The knowledge graph and architectural invariant rules (pattern and architecture validation) are partial solutions -- they can flag when a change violates a declared constraint. But they only work for constraints that have been explicitly codified.
+- Which fix direction is correct depends on the intended behavior: should the agent be able to stop the loop early (fix 1 or 2), or should it always run all slices (fix 3)?
+- If fix 2 (engine feature), does early-exit from forEach affect the `currentSlice` render context in a way that could cause confusion?
+- Does fix 1 require re-authoring the workflow through `wr.workflow-for-workflows`, or is it a targeted JSON edit?
 
 ---
 
```
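The forEach-versus-artifact-contract distinction described in the loop-control entry above can be summarized in a small sketch. This is illustrative only: it is not the code in `workflow-interpreter.ts`, and the exact shape of a `wr.loop_control` stop artifact is an assumption here; only the behavioral contrast (counter-driven forEach versus artifact-consulting while/until) comes from the entry itself.

```ts
// Illustrative sketch, not the actual engine implementation.
type LoopArtifact = { type: string; payload?: { stop?: boolean } }; // artifact shape is assumed

// forEach loops: only the slice counter matters; artifacts are never consulted.
function shouldEnterForEachIteration(iteration: number, slices: unknown[]): boolean {
  return iteration < slices.length;
}

// while/until loops with conditionSource.kind = 'artifact_contract': a
// wr.loop_control stop artifact ends the loop.
function shouldEnterArtifactContractIteration(artifacts: LoopArtifact[]): boolean {
  const stopRequested = artifacts.some(
    (a) => a.type === 'wr.loop_control' && a.payload?.stop === true,
  );
  return !stopRequested;
}
```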
```diff
@@ -177,9 +124,97 @@ The delivery pipeline was extracted into `delivery-pipeline.ts` with explicit st
 
 ## WorkTrain Daemon
 
+### Intent gap: agent builds what it understood, not what the user meant (Apr 30, 2026)
+
+**Status: idea** | Priority: medium
+
+**Score: 13** | Cor:3 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
+
+This is one of the most fundamental failure modes for autonomous WorkTrain sessions and a blocker for production viability. An agent receives a task description, forms an interpretation of what's needed, and executes flawlessly against that interpretation -- but the interpretation was wrong. The code is correct for what the agent thought was asked. It is not what the user actually wanted. The user only discovers this after reviewing the PR, sometimes after it has already merged.
+
+This is categorically different from bugs (the agent implemented the right thing incorrectly) and scope creep (the agent did extra things). This is the agent solving the wrong problem well.
+
+**Why it's hard:** the agent's interpretation feels reasonable from the task description. The user's description was ambiguous, underspecified, or relied on context the agent didn't have. Neither party made an obvious mistake -- the gap is structural.
+
+**Known manifestations:**
+- Agent fixes the symptom instead of the root cause because the task description named the symptom
+- Agent implements feature X when the user wanted feature Y that happens to use X
+- Agent interprets "add support for Z" as extending the existing system when the user wanted a new abstraction
+- Agent makes a local fix when the user wanted an architectural change
+- Agent's implementation is technically correct but violates unstated invariants the user assumed were obvious
+
+**Things to hash out:**
+- Where in the workflow should intent validation happen? Before the agent writes any code (Phase 0), the agent should be required to state its interpretation back in plain English. The user (or a validation step) confirms or corrects it before implementation begins. But this requires a human confirmation gate -- does that break the autonomous use case?
+- For fully autonomous sessions (no human in the loop), is there a way to detect a likely intent gap before the agent commits? Signals might include: the task description is short or vague, the agent's interpretation involves a significant architectural decision, the agent is about to delete or restructure existing code.
+- What is the right escalation path when the agent detects ambiguity itself? Currently `report_issue` handles task obstacles; there is no structured way for the agent to surface "I am not sure I understood this correctly" before acting.
+- The `wr.shaping` workflow exists precisely to close this gap for planned features -- the issue is urgent/reactive tasks that skip shaping entirely. How do we get intent validation without requiring a full shaping pass for every small task?
+- Can historical session notes help? If previous sessions have established what "X" means in this codebase (design decisions, naming conventions, architectural invariants), injecting that context before Phase 0 reduces the gap. This points toward the knowledge graph and persistent project memory as partial solutions.
+- Should WorkTrain have an explicit "confirm interpretation" step as a configurable option per trigger? A `requireIntentConfirmation: true` flag on the trigger that blocks autonomous start until the operator approves the agent's stated interpretation via the console or CLI.
+
+---
+
+### Scope rationalization: agent silently accepts collateral damage (Apr 30, 2026)
+
+**Status: idea** | Priority: medium
+
+**Score: 13** | Cor:3 Cap:3 Eff:2 Lev:3 Con:2 | Blocked: no
+
+When an agent makes a change that breaks or degrades something outside its immediate task scope, it often recognizes the impact but rationalizes it as acceptable because "that's not in scope for this task." The reasoning feels locally valid -- the agent was asked to do X, X is done correctly, the side effect on Y is noted but deprioritized. This produces a PR that is correct for X and silently broken for Y.
+
+This is exactly what happened with the commit SHA change: setting `agentCommitShas` to always empty correctly fixes the faked SHA bug, but degrades the console's SHA display for all sessions going forward. A scoped agent might note "this makes the console show empty SHAs" and proceed anyway because fixing the console display is "a separate ticket."
+
+**Why this is insidious:** the agent's reasoning is locally coherent. It did not make a mistake within its scope. The problem is that autonomous agents operating in isolation cannot always see when a locally correct change has unacceptable global consequences -- and even when they can see it, they lack a good mechanism to stop, escalate, and surface the impact rather than proceeding.
+
+**Known manifestations:**
+- Agent correctly fixes a bug but the fix changes a public API contract, breaking callers it didn't check
+- Agent refactors a module for clarity but silently changes behavior in an edge case it considered minor
+- Agent adds a feature but disables or degrades an existing feature as a side effect, judging the tradeoff acceptable on its own
+- Agent's change passes all tests but the tests don't cover the degraded behavior
+- Agent notes a downstream impact in session notes but does not block, escalate, or file a follow-up ticket
+- **Agent reframes a bug as "a key tradeoff to document."** This is a specific and common failure: the agent detects a real problem it caused, correctly identifies that it's a problem, and instead of filing it as a bug or escalating, reclassifies it as an "accepted design decision" or "known limitation" in documentation. The bug is real. Documenting it is not fixing it. This pattern actively buries bugs.
+
+**Things to hash out:**
+- How does an agent distinguish "acceptable tradeoff within scope" from "collateral damage that must be escalated"? The line is fuzzy and context-dependent. A hard rule ("never degrade existing behavior") is too strict for refactors; a soft heuristic ("if it affects other code, escalate") is too broad.
+- Should the agent be required to enumerate side effects as part of the verification phase, and should the coordinator review that list before merging? This is the proof record concept applied to impact assessment rather than just correctness.
+- What is the right mechanism for the agent to pause and escalate? Currently `report_issue` is for task obstacles; `signal_coordinator` is for coordinator events. There is no structured "I need a decision on whether this tradeoff is acceptable" signal.
+- Test coverage is the obvious mitigation -- if Y has tests, the agent's change would fail them. But not everything has tests, and agents can rationalize skipping test runs for "unrelated" paths.
+- Is there a way to detect likely collateral damage statically before the agent acts? A pre-commit check that measures what changed beyond the declared `filesChanged` list, for example, could surface unexpected side effects automatically.
+- The knowledge graph and architectural invariant rules (pattern and architecture validation) are partial solutions -- they can flag when a change violates a declared constraint. But they only work for constraints that have been explicitly codified.
+
+---
+
 The autonomous workflow runner (`worktrain daemon`). Completely separate from the MCP server -- calls the engine directly in-process.
 
 
+### Subagent context package: project vision and task goal baked into spawning (Apr 30, 2026)
+
+**Status: idea** | Priority: high
+
+**Score: 12** | Cor:2 Cap:3 Eff:2 Lev:3 Con:3 | Blocked: no
+
+When WorkTrain spawns a subagent today, the operator (or the main agent) must manually write out all context: what the project is, what WorkTrain's vision is, what the task is trying to accomplish, what documents exist, what the end goal is. Subagents know nothing -- no conversation history, no project familiarity, no awareness of the vision. If the context briefing is thin or missing, the subagent works in the dark and produces generic output.
+
+Two things need to be baked into the spawning infrastructure:
+
+1. **Project-level context package**: every spawned subagent automatically receives a synthesized briefing about the WorkTrain project -- what it is, what it is trying to become, the architectural layers (daemon vs MCP server vs console), the coding philosophy, and pointers to key docs (AGENTS.md, backlog.md, relevant design docs). This should not require the spawning agent to manually write it out each time.
+
+2. **Task-level context package**: every spawned subagent automatically receives the vision and end goal of the specific task -- not just the technical instructions, but WHY the task matters, what it enables, and how it fits into the larger picture. A subagent that understands the goal can adapt when it hits unexpected situations; one that only has instructions cannot.
+
+This is related to the "Coordinator context injection standard" and "Context budget per spawned agent" backlog entries, but is broader -- it applies to all subagent spawning, not just coordinator-spawned child sessions.
+
+**Critical design constraint:** WorkTrain may not always have a "main" agent assembling context dynamically. A pure coordinator pipeline is deterministic TypeScript code -- it knows the goal it was given and the results it gets back, but has no ambient understanding of the project vision and cannot synthesize what context a subagent needs at runtime. This means context packages cannot be assembled dynamically by the spawning agent; they must be **pre-built and attached as structured data**, assembled by the daemon from configured sources before the session starts. This is closer to the trigger-derived knowledge configuration idea than to runtime context assembly.
+
+**Things to hash out:**
+- Where does the project-level context package live and how is it kept current? A static template in `~/.workrail/daemon-soul.md` covers behavioral rules but not project vision -- these are different concerns.
+- In a pure coordinator pipeline (no main agent), who decides what goes in the context package for each session type? Must be declared configuration, not runtime synthesis.
+- Should context profiles be declared per workflow, per trigger type, or per session role (coding vs review vs discovery)?
+- What is the right size for an auto-injected context package? Too small loses signal; too large crowds out the actual task prompt.
+- Should the package be structured (JSON/YAML) for programmatic injection, or prose for human readability?
+- How does this interact with the existing workspace context injection (CLAUDE.md, AGENTS.md, daemon-soul.md)?
+- Whether a "main" orchestrating agent is needed at all, or whether pure coordinator scripts plus well-configured context packages are sufficient -- this is an open question that requires real pipeline testing to answer.
+
+---
+
 ### Agent-assisted backlog and issue enrichment (Apr 28, 2026)
 
 **Status: idea** | Priority: medium
@@ -1356,7 +1391,7 @@ Routing by `finding.category` from `wr.review_verdict`:
 
 ### Workflow execution time tracking and prediction
 
-**Status:
+**Status: partial** | Tracking shipped; prediction/calibration layer not yet built
 
 **Score: 11** | Cor:1 Cap:2 Eff:3 Lev:2 Con:3 | Blocked: no
 
```