@zhixuan92/multi-model-agent-core 4.2.1 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/config/schema.d.ts +3 -0
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +7 -4
- package/dist/config/schema.js.map +1 -1
- package/dist/error-codes.d.ts +1 -0
- package/dist/error-codes.d.ts.map +1 -1
- package/dist/error-codes.js +2 -0
- package/dist/error-codes.js.map +1 -1
- package/dist/events/running-headline-sink.d.ts.map +1 -1
- package/dist/events/running-headline-sink.js +47 -7
- package/dist/events/running-headline-sink.js.map +1 -1
- package/dist/events/telemetry-types.d.ts +24 -20
- package/dist/events/telemetry-types.d.ts.map +1 -1
- package/dist/identity/auth-token-store.d.ts +36 -0
- package/dist/identity/auth-token-store.d.ts.map +1 -1
- package/dist/identity/auth-token-store.js +71 -2
- package/dist/identity/auth-token-store.js.map +1 -1
- package/dist/identity/cwd-validator.d.ts.map +1 -1
- package/dist/identity/cwd-validator.js +15 -3
- package/dist/identity/cwd-validator.js.map +1 -1
- package/dist/identity/main-model-resolver.d.ts +14 -0
- package/dist/identity/main-model-resolver.d.ts.map +1 -0
- package/dist/identity/main-model-resolver.js +83 -0
- package/dist/identity/main-model-resolver.js.map +1 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -2
- package/dist/index.js.map +1 -1
- package/dist/intake/brief-compiler-slots/delegate.d.ts +10 -11
- package/dist/intake/brief-compiler-slots/delegate.d.ts.map +1 -1
- package/dist/intake/brief-compiler-slots/delegate.js +12 -14
- package/dist/intake/brief-compiler-slots/delegate.js.map +1 -1
- package/dist/intake/brief-compiler-slots/execute-plan.js +3 -1
- package/dist/intake/brief-compiler-slots/execute-plan.js.map +1 -1
- package/dist/intake/context-overflow-estimator.d.ts +33 -0
- package/dist/intake/context-overflow-estimator.d.ts.map +1 -0
- package/dist/intake/context-overflow-estimator.js +36 -0
- package/dist/intake/context-overflow-estimator.js.map +1 -0
- package/dist/intake/pipeline.d.ts.map +1 -1
- package/dist/intake/pipeline.js +46 -0
- package/dist/intake/pipeline.js.map +1 -1
- package/dist/intake/plan-extractor.d.ts.map +1 -1
- package/dist/intake/plan-extractor.js +10 -1
- package/dist/intake/plan-extractor.js.map +1 -1
- package/dist/intake/types.d.ts +1 -0
- package/dist/intake/types.d.ts.map +1 -1
- package/dist/lifecycle/diff-tracker.d.ts +17 -1
- package/dist/lifecycle/diff-tracker.d.ts.map +1 -1
- package/dist/lifecycle/diff-tracker.js +115 -2
- package/dist/lifecycle/diff-tracker.js.map +1 -1
- package/dist/lifecycle/handlers/annotate-completion-handler.d.ts +9 -0
- package/dist/lifecycle/handlers/annotate-completion-handler.d.ts.map +1 -0
- package/dist/lifecycle/handlers/annotate-completion-handler.js +171 -0
- package/dist/lifecycle/handlers/annotate-completion-handler.js.map +1 -0
- package/dist/lifecycle/handlers/annotate-criteria-handler.d.ts +3 -0
- package/dist/lifecycle/handlers/annotate-criteria-handler.d.ts.map +1 -0
- package/dist/lifecycle/handlers/annotate-criteria-handler.js +67 -0
- package/dist/lifecycle/handlers/annotate-criteria-handler.js.map +1 -0
- package/dist/lifecycle/handlers/baseline-handlers.d.ts.map +1 -1
- package/dist/lifecycle/handlers/baseline-handlers.js +152 -65
- package/dist/lifecycle/handlers/baseline-handlers.js.map +1 -1
- package/dist/lifecycle/handlers/files-written-cross-check.d.ts +21 -0
- package/dist/lifecycle/handlers/files-written-cross-check.d.ts.map +1 -0
- package/dist/lifecycle/handlers/files-written-cross-check.js +85 -0
- package/dist/lifecycle/handlers/files-written-cross-check.js.map +1 -0
- package/dist/lifecycle/handlers/review-handler.d.ts +3 -0
- package/dist/lifecycle/handlers/review-handler.d.ts.map +1 -0
- package/dist/lifecycle/handlers/review-handler.js +141 -0
- package/dist/lifecycle/handlers/review-handler.js.map +1 -0
- package/dist/lifecycle/handlers/rework-handler.d.ts +3 -0
- package/dist/lifecycle/handlers/rework-handler.d.ts.map +1 -0
- package/dist/lifecycle/handlers/rework-handler.js +77 -0
- package/dist/lifecycle/handlers/rework-handler.js.map +1 -0
- package/dist/lifecycle/handlers/terminal-handlers.d.ts.map +1 -1
- package/dist/lifecycle/handlers/terminal-handlers.js +16 -3
- package/dist/lifecycle/handlers/terminal-handlers.js.map +1 -1
- package/dist/lifecycle/lifecycle-context.d.ts +4 -0
- package/dist/lifecycle/lifecycle-context.d.ts.map +1 -1
- package/dist/lifecycle/lifecycle-driver.d.ts.map +1 -1
- package/dist/lifecycle/lifecycle-driver.js +12 -7
- package/dist/lifecycle/lifecycle-driver.js.map +1 -1
- package/dist/lifecycle/parallel-criteria-routes.d.ts +1 -1
- package/dist/lifecycle/parallel-criteria-routes.d.ts.map +1 -1
- package/dist/lifecycle/parallel-criteria-routes.js +21 -1
- package/dist/lifecycle/parallel-criteria-routes.js.map +1 -1
- package/dist/lifecycle/shared-compute.d.ts +9 -0
- package/dist/lifecycle/shared-compute.d.ts.map +1 -1
- package/dist/lifecycle/shared-compute.js +35 -3
- package/dist/lifecycle/shared-compute.js.map +1 -1
- package/dist/lifecycle/stage-plan-builder.d.ts.map +1 -1
- package/dist/lifecycle/stage-plan-builder.js +65 -85
- package/dist/lifecycle/stage-plan-builder.js.map +1 -1
- package/dist/lifecycle/stage-plan-types.d.ts +48 -0
- package/dist/lifecycle/stage-plan-types.d.ts.map +1 -1
- package/dist/lifecycle/stage-progression.d.ts.map +1 -1
- package/dist/lifecycle/stage-progression.js +17 -24
- package/dist/lifecycle/stage-progression.js.map +1 -1
- package/dist/lifecycle/task-runner.d.ts.map +1 -1
- package/dist/lifecycle/task-runner.js +12 -1
- package/dist/lifecycle/task-runner.js.map +1 -1
- package/dist/model-profiles.json +192 -53
- package/dist/providers/anthropic-messages-adapter.d.ts +8 -0
- package/dist/providers/anthropic-messages-adapter.d.ts.map +1 -1
- package/dist/providers/anthropic-messages-adapter.js +16 -1
- package/dist/providers/anthropic-messages-adapter.js.map +1 -1
- package/dist/providers/file-tracker.d.ts +33 -0
- package/dist/providers/file-tracker.d.ts.map +1 -1
- package/dist/providers/file-tracker.js +54 -0
- package/dist/providers/file-tracker.js.map +1 -1
- package/dist/providers/provider-factory.d.ts.map +1 -1
- package/dist/providers/provider-factory.js +27 -2
- package/dist/providers/provider-factory.js.map +1 -1
- package/dist/providers/runner-shell-types.d.ts +14 -0
- package/dist/providers/runner-shell-types.d.ts.map +1 -1
- package/dist/providers/runner-shell.d.ts.map +1 -1
- package/dist/providers/runner-shell.js +103 -26
- package/dist/providers/runner-shell.js.map +1 -1
- package/dist/providers/tool-implementations.d.ts +12 -0
- package/dist/providers/tool-implementations.d.ts.map +1 -1
- package/dist/providers/tool-implementations.js +33 -0
- package/dist/providers/tool-implementations.js.map +1 -1
- package/dist/reporting/annotate-completion-parser.d.ts +39 -0
- package/dist/reporting/annotate-completion-parser.d.ts.map +1 -0
- package/dist/reporting/annotate-completion-parser.js +43 -0
- package/dist/reporting/annotate-completion-parser.js.map +1 -0
- package/dist/reporting/compose-running-headline.d.ts +15 -1
- package/dist/reporting/compose-running-headline.d.ts.map +1 -1
- package/dist/reporting/compose-running-headline.js +76 -1
- package/dist/reporting/compose-running-headline.js.map +1 -1
- package/dist/reporting/report-parser-slots/research-report.d.ts +1 -1
- package/dist/review/default-engines.d.ts.map +1 -1
- package/dist/review/default-engines.js +8 -4
- package/dist/review/default-engines.js.map +1 -1
- package/dist/review/parse-review-report.d.ts +6 -0
- package/dist/review/parse-review-report.d.ts.map +1 -0
- package/dist/review/parse-review-report.js +40 -0
- package/dist/review/parse-review-report.js.map +1 -0
- package/dist/review/reviewer-engine.d.ts +12 -3
- package/dist/review/reviewer-engine.d.ts.map +1 -1
- package/dist/review/reviewer-engine.js +4 -3
- package/dist/review/reviewer-engine.js.map +1 -1
- package/dist/review/templates/annotate-completion.d.ts +12 -0
- package/dist/review/templates/annotate-completion.d.ts.map +1 -0
- package/dist/review/templates/annotate-completion.js +72 -0
- package/dist/review/templates/annotate-completion.js.map +1 -0
- package/dist/review/templates/quality-review.d.ts +3 -0
- package/dist/review/templates/quality-review.d.ts.map +1 -0
- package/dist/review/templates/quality-review.js +40 -0
- package/dist/review/templates/quality-review.js.map +1 -0
- package/dist/review/templates/rework.d.ts +3 -0
- package/dist/review/templates/rework.d.ts.map +1 -0
- package/dist/review/templates/rework.js +42 -0
- package/dist/review/templates/rework.js.map +1 -0
- package/dist/review/templates/shared.d.ts +32 -0
- package/dist/review/templates/shared.d.ts.map +1 -1
- package/dist/review/templates/spec-review.d.ts +1 -16
- package/dist/review/templates/spec-review.d.ts.map +1 -1
- package/dist/review/templates/spec-review.js +23 -31
- package/dist/review/templates/spec-review.js.map +1 -1
- package/dist/stores/context-block-project-cap.d.ts +14 -0
- package/dist/stores/context-block-project-cap.d.ts.map +1 -0
- package/dist/stores/context-block-project-cap.js +68 -0
- package/dist/stores/context-block-project-cap.js.map +1 -0
- package/dist/stores/context-block-tool.d.ts +2 -0
- package/dist/stores/context-block-tool.d.ts.map +1 -1
- package/dist/stores/context-block-tool.js +3 -2
- package/dist/stores/context-block-tool.js.map +1 -1
- package/dist/stores/file-backed-context-block-store.d.ts +8 -1
- package/dist/stores/file-backed-context-block-store.d.ts.map +1 -1
- package/dist/stores/file-backed-context-block-store.js +118 -6
- package/dist/stores/file-backed-context-block-store.js.map +1 -1
- package/dist/tools/audit/plan-audit-criteria.d.ts +35 -0
- package/dist/tools/audit/plan-audit-criteria.d.ts.map +1 -0
- package/dist/tools/audit/plan-audit-criteria.js +136 -0
- package/dist/tools/audit/plan-audit-criteria.js.map +1 -0
- package/dist/tools/audit/plan-audit-verdict.d.ts +15 -0
- package/dist/tools/audit/plan-audit-verdict.d.ts.map +1 -0
- package/dist/tools/audit/plan-audit-verdict.js +44 -0
- package/dist/tools/audit/plan-audit-verdict.js.map +1 -0
- package/dist/tools/audit/schema.d.ts +1 -0
- package/dist/tools/audit/schema.d.ts.map +1 -1
- package/dist/tools/audit/schema.js +6 -3
- package/dist/tools/audit/schema.js.map +1 -1
- package/dist/tools/audit/tool-config.d.ts +3 -0
- package/dist/tools/audit/tool-config.d.ts.map +1 -1
- package/dist/tools/audit/tool-config.js +8 -0
- package/dist/tools/audit/tool-config.js.map +1 -1
- package/dist/tools/delegate/implementer-criteria.d.ts +31 -47
- package/dist/tools/delegate/implementer-criteria.d.ts.map +1 -1
- package/dist/tools/delegate/implementer-criteria.js +60 -88
- package/dist/tools/delegate/implementer-criteria.js.map +1 -1
- package/dist/tools/delegate/tool-config.js +4 -4
- package/dist/tools/delegate/tool-config.js.map +1 -1
- package/dist/tools/execute-plan/implementer-criteria.d.ts +42 -37
- package/dist/tools/execute-plan/implementer-criteria.d.ts.map +1 -1
- package/dist/tools/execute-plan/implementer-criteria.js +79 -79
- package/dist/tools/execute-plan/implementer-criteria.js.map +1 -1
- package/dist/tools/execute-plan/tool-config.d.ts.map +1 -1
- package/dist/tools/execute-plan/tool-config.js +23 -13
- package/dist/tools/execute-plan/tool-config.js.map +1 -1
- package/dist/types/config.d.ts +1 -0
- package/dist/types/config.d.ts.map +1 -1
- package/dist/types/enums.d.ts +2 -2
- package/dist/types/run-result.d.ts +71 -0
- package/dist/types/run-result.d.ts.map +1 -1
- package/dist/types/task-spec.d.ts +14 -0
- package/dist/types/task-spec.d.ts.map +1 -1
- package/dist/types.d.ts +10 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +5 -1
- package/dist/lifecycle/handlers/quality-chain-handlers.d.ts +0 -22
- package/dist/lifecycle/handlers/quality-chain-handlers.d.ts.map +0 -1
- package/dist/lifecycle/handlers/quality-chain-handlers.js +0 -369
- package/dist/lifecycle/handlers/quality-chain-handlers.js.map +0 -1
- package/dist/lifecycle/handlers/review-diff-handler.d.ts +0 -31
- package/dist/lifecycle/handlers/review-diff-handler.d.ts.map +0 -1
- package/dist/lifecycle/handlers/review-diff-handler.js +0 -168
- package/dist/lifecycle/handlers/review-diff-handler.js.map +0 -1
- package/dist/lifecycle/handlers/run-verify-command-handler.d.ts +0 -25
- package/dist/lifecycle/handlers/run-verify-command-handler.d.ts.map +0 -1
- package/dist/lifecycle/handlers/run-verify-command-handler.js +0 -84
- package/dist/lifecycle/handlers/run-verify-command-handler.js.map +0 -1
- package/dist/lifecycle/handlers/spec-chain-handlers.d.ts +0 -21
- package/dist/lifecycle/handlers/spec-chain-handlers.d.ts.map +0 -1
- package/dist/lifecycle/handlers/spec-chain-handlers.js +0 -287
- package/dist/lifecycle/handlers/spec-chain-handlers.js.map +0 -1
- package/dist/review/templates/diff-review.d.ts +0 -11
- package/dist/review/templates/diff-review.d.ts.map +0 -1
- package/dist/review/templates/diff-review.js +0 -39
- package/dist/review/templates/diff-review.js.map +0 -1
- package/dist/review/templates/quality-review-artifact.d.ts +0 -16
- package/dist/review/templates/quality-review-artifact.d.ts.map +0 -1
- package/dist/review/templates/quality-review-artifact.js +0 -46
- package/dist/review/templates/quality-review-artifact.js.map +0 -1
|
@@ -1,114 +1,86 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Delegate
|
|
2
|
+
* Delegate worker criteria — 4.3.0 pipeline-redesign mindset.
|
|
3
3
|
*
|
|
4
|
-
* DELEGATE'S PURPOSE — read this before adding categories.
|
|
5
4
|
* mma-delegate is the generic dispatcher for ad-hoc implementation
|
|
6
|
-
* tasks.
|
|
7
|
-
*
|
|
8
|
-
* a diff a REVIEWER will read alongside the brief. The success
|
|
9
|
-
* criterion is:
|
|
5
|
+
* tasks. Caller hands you a `prompt` (and optionally a `done` acceptance
|
|
6
|
+
* criterion, `filePaths`, `verifyCommand`); your output is a diff.
|
|
10
7
|
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
8
|
+
* Pipeline mindset (different from earlier versions):
|
|
9
|
+
* - This is a SINGLE-PASS pipeline. There are NO rework rounds for you.
|
|
10
|
+
* - After your turn, a SPEC reviewer (complex tier, full editor tools)
|
|
11
|
+
* runs ONCE — it doesn't ask you to fix; it fixes inline itself.
|
|
12
|
+
* - Then a QUALITY reviewer (complex tier, full editor tools) runs ONCE
|
|
13
|
+
* for safety/correctness — same thing: fixes inline, doesn't ask you.
|
|
14
|
+
* - Then an annotator scores overall completion and the commit gate fires
|
|
15
|
+
* if the score is high enough.
|
|
14
16
|
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
* that satisfies the brief — minimal AND complete simultaneously.
|
|
20
|
-
*
|
|
21
|
-
* Delegate is artifact-producing — you write files. Cross-agent
|
|
22
|
-
* spec + quality + diff review applies. The spec the spec-reviewer
|
|
23
|
-
* checks against is the BRIEF (prompt + done), not your interpretation
|
|
24
|
-
* of it. The quality-reviewer checks safety / correctness / style.
|
|
17
|
+
* What this means for you: do your best ONE pass. You don't need to
|
|
18
|
+
* second-guess minor things — the reviewer will catch and fix them.
|
|
19
|
+
* Don't over-think; don't restart-loop; don't bail on uncertainty. The
|
|
20
|
+
* pipeline has a safety net BUT only one round of it.
|
|
25
21
|
*/
|
|
26
22
|
/**
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
* Without an explicit orientation, workers default to "implement
|
|
30
|
-
* something good" — which produces over-implementation (SCOPE CREEP)
|
|
31
|
-
* and under-implementation (SILENT PARTIAL FIX). With this orientation,
|
|
32
|
-
* the worker calibrates against the reviewer's standard: minimal +
|
|
33
|
-
* complete, the brief is the contract.
|
|
23
|
+
* Orientation — "smallest complete change" framing.
|
|
34
24
|
*/
|
|
35
25
|
export const DELEGATE_PURPOSE_ORIENTATION = [
|
|
36
|
-
'
|
|
37
|
-
'
|
|
26
|
+
'Your job: produce the SMALLEST COMPLETE CHANGE that satisfies the brief — minimal AND complete simultaneously.',
|
|
27
|
+
'A reviewer reads your diff alongside the brief and asks two questions: "did you finish it?" (silent partial fix → blocker) and "why did you also touch X?" (scope creep → blocker). Both must answer cleanly.',
|
|
38
28
|
'',
|
|
39
|
-
'
|
|
40
|
-
'- Implement EXACTLY what the brief asks for. Not less
|
|
41
|
-
'- If the brief lists `filePaths`, those are the authorized targets. Existing
|
|
42
|
-
'- If the brief includes a `done`
|
|
43
|
-
'- If the brief includes a `verifyCommand`, run it after your changes.
|
|
44
|
-
'-
|
|
45
|
-
'-
|
|
46
|
-
'- Do NOT modify tests or fixtures or specs to make a wrong implementation pass. If a test fails, fix the implementation, not the test (unless the brief explicitly says the test is wrong).',
|
|
47
|
-
'',
|
|
48
|
-
'The completion test: would a reviewer who reads ONLY the brief and your diff approve the merge — or would they raise a concern (gap, scope creep, drift, broken caller, undocumented assumption) you should have caught?',
|
|
29
|
+
'Rules:',
|
|
30
|
+
'- Implement EXACTLY what the brief asks for. Not less. Not more.',
|
|
31
|
+
'- If the brief lists `filePaths`, those are the authorized targets. Existing entries = read-and-modify; non-existent entries = create. Files outside the list are off-limits to write unless the brief\'s task genuinely requires it (call out any deviation in your summary).',
|
|
32
|
+
'- If the brief includes a `done` criterion, your diff must satisfy it precisely.',
|
|
33
|
+
'- If the brief includes a `verifyCommand`, run it after your changes. Green = part of complete; red = part of incomplete.',
|
|
34
|
+
'- If you change a public symbol (exported function signature, exported type, public method), update callers in the named files. Stale callers are an INCOMPLETE REFACTOR.',
|
|
35
|
+
'- Do NOT modify tests or fixtures to make a wrong implementation pass. If a test fails, fix the implementation.',
|
|
49
36
|
].join('\n');
|
|
50
|
-
/**
|
|
51
|
-
* The scope rule for delegate.
|
|
52
|
-
*
|
|
53
|
-
* Replaces the prior one-liner with a concrete contract about what
|
|
54
|
-
* is in scope, what is off-limits, and what to do at the boundary.
|
|
55
|
-
*/
|
|
56
37
|
export const DELEGATE_SCOPE_RULE = [
|
|
57
38
|
'Scope:',
|
|
58
39
|
'- Strictly what the brief\'s `prompt` (and `done` if present) requests. The brief is the contract.',
|
|
59
|
-
'- Reading: the named `filePaths` plus what the task obviously implies (caller files when the diff changes a public symbol; sibling test files when the brief changes behavior; types files when the diff changes
|
|
60
|
-
'- Writing:
|
|
61
|
-
'- Out of scope: refactors not in the brief, tangential cleanup
|
|
40
|
+
'- Reading: the named `filePaths` plus what the task obviously implies (caller files when the diff changes a public symbol; sibling test files when the brief changes behavior; types files when the diff changes an interface).',
|
|
41
|
+
'- Writing: only files within `filePaths` unless the brief\'s task genuinely requires touching others (e.g. updating a caller because the task changed a signature — note in summary).',
|
|
42
|
+
'- Out of scope: refactors not in the brief, tangential cleanup, modifying tests to mask wrong code, opportunistic style fixes.',
|
|
62
43
|
].join('\n');
|
|
63
44
|
/**
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
-
* (
|
|
68
|
-
* below are the specific patterns reviewers raise as merge-blockers.
|
|
45
|
+
* Top-4 failure modes — calibrated from observed reviewer rejections.
|
|
46
|
+
* Dropped from the original 9: WRONG FILE TARGET (subsumed by scope
|
|
47
|
+
* rule), CROSS-CUTTING DAMAGE, CONVENTION DRIFT, SPEC OVERREACH,
|
|
48
|
+
* UNDOCUMENTED ASSUMPTION (low signal, high noise for cheap models).
|
|
69
49
|
*/
|
|
70
50
|
export const DELEGATE_FAILURE_MODES = [
|
|
71
|
-
'
|
|
72
|
-
'',
|
|
73
|
-
'1. SCOPE CREEP — touched files / added features beyond the brief. The reviewer reads the diff and asks "why did you also change Y?" If you cannot answer with "the brief required it", remove the change.',
|
|
74
|
-
'2. SILENT PARTIAL FIX — declared done, work demonstrably incomplete. Naming a step in your summary as "done" when the diff does not contain it is the worst delegate failure mode. Either implement it or report explicitly that you did not.',
|
|
75
|
-
'3. WRONG FILE TARGET — wrote to a path not in `filePaths` (when the caller specified `filePaths`). Existing files outside `filePaths` are off-limits to write. New files outside `filePaths` are scope creep.',
|
|
76
|
-
'4. PHANTOM TEST PASS — claimed "tests pass" without actually running them, OR ran a non-affected suite (e.g. unit tests pass but the change is in a path covered by integration tests). If the brief includes `verifyCommand`, run that exact command and quote the output.',
|
|
77
|
-
'5. CROSS-CUTTING DAMAGE — your fix introduced an unrelated regression in the same edit (e.g. fixing a parser bug but breaking the formatter). Re-read the diff before declaring done; check that nothing OTHER than the brief\'s target changed semantically.',
|
|
78
|
-
'6. CONVENTION DRIFT — invented a naming / import / error-handling / formatting pattern instead of matching the surrounding code. The reviewer will flag this as "matches no neighboring file" — it slows merge.',
|
|
79
|
-
'7. INCOMPLETE REFACTOR — changed a public symbol (exported function signature, exported type, public method) and did not update its callers. Stale callers either crash at runtime or compile but behave wrong. Update callers in the named files; report in your summary if callers exist outside `filePaths`.',
|
|
80
|
-
'8. SPEC OVERREACH — modified tests, fixtures, or interface contracts to make a wrong implementation pass, instead of fixing the implementation. If a test is failing, the FIRST hypothesis is that the implementation is wrong, not the test.',
|
|
81
|
-
'9. UNDOCUMENTED ASSUMPTION — diff relies on the caller doing X (env var set, init function called, dependency installed) without saying so in the brief\'s authoring contract. Either remove the assumption, or document it in your summary so the reviewer can decide if it is acceptable.',
|
|
51
|
+
'The four ways delegation diverges from intent — check yourself against each before declaring done:',
|
|
82
52
|
'',
|
|
83
|
-
'
|
|
84
|
-
'
|
|
85
|
-
'
|
|
86
|
-
'
|
|
53
|
+
'1. SCOPE CREEP — touched files / added features beyond the brief. For every diff hunk, ask: "is this required by a brief item?" If no, remove it.',
|
|
54
|
+
'2. SILENT PARTIAL FIX — declared done with the work demonstrably incomplete. Naming a step as "done" when the diff doesn\'t contain it is the worst delegate failure. Either implement it or report explicitly that you did not.',
|
|
55
|
+
'3. PHANTOM TEST PASS — claimed "tests pass" without actually running them. If `verifyCommand` is set, run that exact command and quote the output. Otherwise run the focused test for the area you changed.',
|
|
56
|
+
'4. INCOMPLETE REFACTOR — changed a public symbol and did not update callers. Stale callers either crash at runtime or compile-but-misbehave. Update callers in the named files; report any callers outside `filePaths` in your summary.',
|
|
87
57
|
].join('\n');
|
|
88
58
|
/**
|
|
89
|
-
* Completeness reminder.
|
|
90
|
-
*
|
|
91
|
-
* The shared SEVERITY_LADDER does not apply to write tools. The
|
|
92
|
-
* counter-balance for delegate is opposite to read-only tools: the
|
|
93
|
-
* typical failure is OVER-IMPLEMENTATION (scope creep) and UNDER-
|
|
94
|
-
* IMPLEMENTATION (silent partial fix), often in the same task. This
|
|
95
|
-
* block tells the worker the load-bearing constraint is "minimal AND
|
|
96
|
-
* complete simultaneously".
|
|
59
|
+
* Completeness reminder — brief-vs-diff walk only. Worked example
|
|
60
|
+
* dropped (cheap models can apply the rule directly without it).
|
|
97
61
|
*/
|
|
98
62
|
export const COMPLETENESS_REMINDER_DELEGATE = [
|
|
99
|
-
'
|
|
100
|
-
'
|
|
101
|
-
'
|
|
102
|
-
'
|
|
103
|
-
'
|
|
104
|
-
'
|
|
105
|
-
'
|
|
106
|
-
'
|
|
63
|
+
'Brief-vs-diff walk (REQUIRED before declaring done):',
|
|
64
|
+
'',
|
|
65
|
+
'Walk the brief literally:',
|
|
66
|
+
' 1. List every requirement in `prompt` (and `done` if present).',
|
|
67
|
+
' 2. For each, locate the diff hunk that satisfies it. If you cannot, you are not done.',
|
|
68
|
+
' 3. Walk the diff in reverse: for each changed file/line, name the brief item it satisfies. If you cannot, the hunk is SCOPE CREEP — remove it.',
|
|
69
|
+
' 4. If `verifyCommand` is set, run it. Quote the relevant output line in your summary.',
|
|
70
|
+
'',
|
|
71
|
+
'"Smallest" means no extras. "Complete" means no gaps. Both at once.',
|
|
72
|
+
].join('\n');
|
|
73
|
+
/**
|
|
74
|
+
* Turn budget — calibration block. Same rationale as execute-plan's:
|
|
75
|
+
* cheap models default to "be thorough" and treat each turn as
|
|
76
|
+
* "re-verify by re-reading", which becomes a discovery loop. This
|
|
77
|
+
* block tells them to trust prior reads and edit confidently.
|
|
78
|
+
*/
|
|
79
|
+
export const TURN_BUDGET_DELEGATE = [
|
|
80
|
+
'Turn budget:',
|
|
81
|
+
'',
|
|
82
|
+
'A typical delegate task completes in 5-15 tool calls total: read each file once, edit each file once, run verification once. If you find yourself reading the same file twice, STOP and edit — the content from your first read is in your context window. If you find yourself reading >5 files without writing any, STOP and write — you have enough context to make progress.',
|
|
107
83
|
'',
|
|
108
|
-
'
|
|
109
|
-
'- For each item in the brief\'s `prompt` and `done`, locate the diff hunk that satisfies it. If you cannot, the item is unsatisfied.',
|
|
110
|
-
'- For each diff hunk, name the brief item it satisfies. If you cannot, the hunk is scope creep.',
|
|
111
|
-
'- Worked example. Brief: "fix the off-by-one in `paginate(page, total)` — `total < pageSize` should still produce one page; add a regression test in `tests/pagination.test.ts`." Naive worker rewrites `paginate` as a clean three-liner with new docstrings, skips the test → SILENT PARTIAL FIX (no test) + SCOPE CREEP (rewrote a function that needed a one-line fix). Correct worker: changes one boundary condition in `paginate` (one line of diff in the implementation file), adds one test in `tests/pagination.test.ts` covering the `total < pageSize` case, runs `verifyCommand` if set, quotes the test name and "1 passed" in the summary, stops. Two diff hunks total, both directly tied to the brief.',
|
|
112
|
-
'- Most workers miss findings of this shape on first pass because the rewrite "feels cleaner". The brief-vs-diff walk forces the question "what did the brief ACTUALLY ask for?".',
|
|
84
|
+
'Trust your prior reads. Trust your prior edits. The most common cheap-worker failure is restart-looping instead of editing.',
|
|
113
85
|
].join('\n');
|
|
114
86
|
//# sourceMappingURL=implementer-criteria.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"implementer-criteria.js","sourceRoot":"","sources":["../../../src/tools/delegate/implementer-criteria.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"implementer-criteria.js","sourceRoot":"","sources":["../../../src/tools/delegate/implementer-criteria.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH;;GAEG;AACH,MAAM,CAAC,MAAM,4BAA4B,GAAG;IAC1C,gHAAgH;IAChH,+MAA+M;IAC/M,EAAE;IACF,QAAQ;IACR,kEAAkE;IAClE,gRAAgR;IAChR,kFAAkF;IAClF,2HAA2H;IAC3H,2KAA2K;IAC3K,iHAAiH;CAClH,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb,MAAM,CAAC,MAAM,mBAAmB,GAAG;IACjC,QAAQ;IACR,oGAAoG;IACpG,iOAAiO;IACjO,uLAAuL;IACvL,gIAAgI;CACjI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;;GAKG;AACH,MAAM,CAAC,MAAM,sBAAsB,GAAG;IACpC,oGAAoG;IACpG,EAAE;IACF,mJAAmJ;IACnJ,kOAAkO;IAClO,6MAA6M;IAC7M,yOAAyO;CAC1O,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;GAGG;AACH,MAAM,CAAC,MAAM,8BAA8B,GAAG;IAC5C,sDAAsD;IACtD,EAAE;IACF,2BAA2B;IAC3B,kEAAkE;IAClE,yFAAyF;IACzF,kJAAkJ;IAClJ,yFAAyF;IACzF,EAAE;IACF,qEAAqE;CACtE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;;GAKG;AACH,MAAM,CAAC,MAAM,oBAAoB,GAAG;IAClC,cAAc;IACd,EAAE;IACF,kXAAkX;IAClX,EAAE;IACF,6HAA6H;CAC9H,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC"}
|
|
@@ -2,7 +2,7 @@ import { inputSchema } from './schema.js';
|
|
|
2
2
|
import { delegateHeadlineTemplate } from '../../reporting/headline-templates/delegate.js';
|
|
3
3
|
import { delegateReportSchema } from '../../reporting/report-parser-slots/delegate-report.js';
|
|
4
4
|
import { compileDelegatePrompt } from '../../intake/brief-compiler-slots/delegate.js';
|
|
5
|
-
import {
|
|
5
|
+
import { specLintTemplate, qualityLintTemplate } from '../../review/reviewer-engine.js';
|
|
6
6
|
import { DEFAULT_TASK_TIMEOUT_MS } from '../../config/schema.js';
|
|
7
7
|
export function registerDelegate(registry) {
|
|
8
8
|
registry.register({
|
|
@@ -48,9 +48,9 @@ export const toolConfig = {
|
|
|
48
48
|
reportSchema: delegateReportSchema,
|
|
49
49
|
headlineTemplate: delegateHeadlineTemplate,
|
|
50
50
|
reviewTemplates: {
|
|
51
|
-
spec:
|
|
52
|
-
qualityAP:
|
|
53
|
-
diff:
|
|
51
|
+
spec: specLintTemplate,
|
|
52
|
+
qualityAP: qualityLintTemplate,
|
|
53
|
+
diff: specLintTemplate, // pipeline-redesign: diff path unused; field retained for type compat
|
|
54
54
|
},
|
|
55
55
|
};
|
|
56
56
|
//# sourceMappingURL=tool-config.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-config.js","sourceRoot":"","sources":["../../../src/tools/delegate/tool-config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAG1C,OAAO,EAAE,wBAAwB,EAAE,MAAM,gDAAgD,CAAC;AAC1F,OAAO,EAAE,oBAAoB,EAAE,MAAM,wDAAwD,CAAC;AAC9F,OAAO,EAAE,qBAAqB,EAAE,MAAM,+CAA+C,CAAC;AAEtF,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"tool-config.js","sourceRoot":"","sources":["../../../src/tools/delegate/tool-config.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAG1C,OAAO,EAAE,wBAAwB,EAAE,MAAM,gDAAgD,CAAC;AAC1F,OAAO,EAAE,oBAAoB,EAAE,MAAM,wDAAwD,CAAC;AAC9F,OAAO,EAAE,qBAAqB,EAAE,MAAM,+CAA+C,CAAC;AAEtF,OAAO,EAAE,gBAAgB,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACxF,OAAO,EAAE,uBAAuB,EAAE,MAAM,wBAAwB,CAAC;AAEjE,MAAM,UAAU,gBAAgB,CAAC,QAA6B;IAC5D,QAAQ,CAAC,QAAQ,CAAC;QAChB,SAAS,EAAE,UAAU;QACrB,UAAU,EAAE,MAAM;QAClB,QAAQ,EAAE,WAAW;QACrB,OAAO,EAAE,MAAM;QACf,MAAM,EAAE,WAAW;QACnB,YAAY,EAAE,oBAAoB;QAClC,gBAAgB,EAAE,UAAU;QAC5B,oBAAoB,EAAE,IAAI;QAC1B,iBAAiB,EAAE,eAAe;KACnC,CAAC,CAAC;AACL,CAAC;AAaD,MAAM,CAAC,MAAM,UAAU,GAA8C;IACnE,IAAI,EAAE,UAAU;IAChB,QAAQ,EAAE,oBAAoB;IAC9B,SAAS,EAAE,UAAU;IACrB,SAAS,EAAE,CAAC,KAAK,EAAE,EAAE,CACnB,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACtB,MAAM,EAAE,qBAAqB,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC;QACnD,IAAI,EAAE,CAAC,CAAC,IAAI;QACZ,SAAS,EAAE,CAAC,CAAC,SAAS;QACtB,SAAS,EAAE,CAAC,CAAC,SAAS,IAAI,UAAU;QACpC,YAAY,EAAE,CAAC,CAAC,YAAY,IAAI,MAAM;QACtC,eAAe,EAAE,CAAC,CAAC,eAAe;QAClC,aAAa,EAAE,CAAC,CAAC,aAAa;QAC9B,UAAU,EAAE,CAAC,CAAC,UAAU;KACzB,CAAC,CAAC;IACL,aAAa,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;QAC9B,MAAM,EAAE,KAAK,CAAC,MAAM;QACpB,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,YAAY,EAAE,KAAK,CAAC,YAAY;QAChC,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,eAAe,EAAE,KAAK,CAAC,eAAe;QACtC,aAAa,EAAE,KAAK,CAAC,aAAa;QAClC,UAAU,EAAE,KAAK,CAAC,UAAU;QAC5B,GAAG,EAAE,GAAG,CAAC,cAAc,EAAE,GAAG,IAAI,GAAG,CAAC,GAAG;QACvC,KAAK,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,KAAK,IAAI,MAAM;QAC3C,SAAS,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,SAAS,IAAI,uBAAuB;QACpE,aAAa,EAAE,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,IAAI,UAAU;KAChE,CAAC;IACF,YAAY,EAAE,oBAAoB;IAClC,gBAAgB,EAAE,wBAAwB;IAC1C,eAAe,EAAE;QACf,IAAI,EAAE,gBAAgB;QACtB,SAAS,EAAE,mBAAmB;QAC9B,IAAI,EAAE,gBAAgB,EAAG,sEAAsE;KAChG;CACF,CAAC"}
|
|
@@ -1,52 +1,57 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Execute-plan
|
|
2
|
+
* Execute-plan worker criteria — 4.3.0 pipeline-redesign mindset.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* higher-capability model. Your output is a diff the PLAN AUTHOR will
|
|
7
|
-
* read. They wrote the plan precisely; your job is execution, not
|
|
8
|
-
* improvement. The success criterion is:
|
|
4
|
+
* mma-execute-plan implements one named task from a plan written by a
|
|
5
|
+
* higher-capability model. The plan is the spec; your output is a diff.
|
|
9
6
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
7
|
+
* Pipeline mindset (different from earlier versions):
|
|
8
|
+
* - This is a SINGLE-PASS pipeline. There are NO rework rounds for you.
|
|
9
|
+
* - After your turn, a SPEC reviewer (complex tier, full editor tools)
|
|
10
|
+
* runs ONCE — it doesn't ask you to fix gaps; it fixes them inline
|
|
11
|
+
* itself. Plan-fidelity gaps (CODE SUBSTITUTION, STEP SKIP) it can
|
|
12
|
+
* detect, it can also fix.
|
|
13
|
+
* - Then a QUALITY reviewer (complex tier, full editor tools) runs
|
|
14
|
+
* ONCE for safety / correctness — same thing.
|
|
15
|
+
* - Then an annotator scores completion based on the plan's steps.
|
|
16
|
+
* Commit fires if completionPercent ≥ 80.
|
|
13
17
|
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
* Plan execution is artifact-producing — you write files. Cross-agent
|
|
20
|
-
* spec + quality review still applies. But the spec the spec-reviewer
|
|
21
|
-
* checks against is the PLAN, not your interpretation of it.
|
|
18
|
+
* What this means for you: do the mechanical task in ONE pass and
|
|
19
|
+
* report what you did. You don't need to anticipate every reviewer
|
|
20
|
+
* concern — they fix things, they don't ping-pong with you. Don't
|
|
21
|
+
* restart-loop, don't bail on uncertainty, don't over-verify. The
|
|
22
|
+
* pipeline has a safety net BUT only one round of it.
|
|
22
23
|
*/
|
|
23
24
|
/**
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
* Without an explicit fidelity statement, workers default to "implement
|
|
27
|
-
* the goal" — which produces "improvements" that diverge from the plan
|
|
28
|
-
* (CODE SUBSTITUTION, ACCEPTANCE-CRITERIA OVERRUN). With this
|
|
29
|
-
* orientation, the worker treats the plan as authoritative and reports
|
|
30
|
-
* defects rather than silently working around them.
|
|
25
|
+
* Orientation — fidelity-first framing. Goes at the TOP of every
|
|
26
|
+
* execute-plan worker prompt.
|
|
31
27
|
*/
|
|
32
28
|
export declare const EXECUTE_PLAN_PURPOSE_ORIENTATION: string;
|
|
33
29
|
export declare const EXECUTE_PLAN_SCOPE_RULE: string;
|
|
34
30
|
/**
|
|
35
|
-
*
|
|
36
|
-
*
|
|
37
|
-
*
|
|
38
|
-
* improvements" to plans they think are imperfect. The 9 categories
|
|
39
|
-
* below are the specific ways execution diverges from intent.
|
|
31
|
+
* Top-4 failure modes — calibrated from observed worker output, not
|
|
32
|
+
* speculative. The full taxonomy of 9 was dropped to reduce cognitive
|
|
33
|
+
* load on cheap models.
|
|
40
34
|
*/
|
|
41
35
|
export declare const EXECUTE_PLAN_FAILURE_MODES: string;
|
|
42
36
|
/**
|
|
43
|
-
* Plan-
|
|
44
|
-
*
|
|
45
|
-
*
|
|
46
|
-
*
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
37
|
+
* Plan-vs-source reconciliation — handles the case where the plan names
|
|
38
|
+
* a symbol/path that doesn't exist in source (because the plan was
|
|
39
|
+
* authored against an older snapshot). Without this rule, workers either
|
|
40
|
+
* invent the missing symbol (introducing real bugs) or freeze and bail.
|
|
41
|
+
*/
|
|
42
|
+
export declare const PLAN_VS_SOURCE_RECONCILIATION: string;
|
|
43
|
+
/**
|
|
44
|
+
* Self-verification — workers must run the plan-listed verification
|
|
45
|
+
* commands themselves before declaring done. Reviewers do not execute
|
|
46
|
+
* code; the worker has shell access and is the source of truth for
|
|
47
|
+
* "do these tests pass?".
|
|
48
|
+
*/
|
|
49
|
+
export declare const SELF_VERIFICATION: string;
|
|
50
|
+
/**
|
|
51
|
+
* Turn budget — calibration block. Cheap models default to "be
|
|
52
|
+
* thorough" and treat each turn as "let me re-verify state by
|
|
53
|
+
* re-reading", which becomes a discovery loop. This block tells them
|
|
54
|
+
* to trust their prior reads and edit confidently.
|
|
50
55
|
*/
|
|
51
|
-
export declare const
|
|
56
|
+
export declare const TURN_BUDGET: string;
|
|
52
57
|
//# sourceMappingURL=implementer-criteria.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"implementer-criteria.d.ts","sourceRoot":"","sources":["../../../src/tools/execute-plan/implementer-criteria.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"implementer-criteria.d.ts","sourceRoot":"","sources":["../../../src/tools/execute-plan/implementer-criteria.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH;;;GAGG;AACH,eAAO,MAAM,gCAAgC,QAUjC,CAAC;AAEb,eAAO,MAAM,uBAAuB,QAKxB,CAAC;AAEb;;;;GAIG;AACH,eAAO,MAAM,0BAA0B,QAO3B,CAAC;AAEb;;;;;GAKG;AACH,eAAO,MAAM,6BAA6B,QAU9B,CAAC;AAEb;;;;;GAKG;AACH,eAAO,MAAM,iBAAiB,QASlB,CAAC;AAEb;;;;;GAKG;AACH,eAAO,MAAM,WAAW,QAMZ,CAAC"}
|
|
@@ -1,104 +1,104 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Execute-plan
|
|
2
|
+
* Execute-plan worker criteria — 4.3.0 pipeline-redesign mindset.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* higher-capability model. Your output is a diff the PLAN AUTHOR will
|
|
7
|
-
* read. They wrote the plan precisely; your job is execution, not
|
|
8
|
-
* improvement. The success criterion is:
|
|
4
|
+
* mma-execute-plan implements one named task from a plan written by a
|
|
5
|
+
* higher-capability model. The plan is the spec; your output is a diff.
|
|
9
6
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
7
|
+
* Pipeline mindset (different from earlier versions):
|
|
8
|
+
* - This is a SINGLE-PASS pipeline. There are NO rework rounds for you.
|
|
9
|
+
* - After your turn, a SPEC reviewer (complex tier, full editor tools)
|
|
10
|
+
* runs ONCE — it doesn't ask you to fix gaps; it fixes them inline
|
|
11
|
+
* itself. Plan-fidelity gaps (CODE SUBSTITUTION, STEP SKIP) it can
|
|
12
|
+
* detect, it can also fix.
|
|
13
|
+
* - Then a QUALITY reviewer (complex tier, full editor tools) runs
|
|
14
|
+
* ONCE for safety / correctness — same thing.
|
|
15
|
+
* - Then an annotator scores completion based on the plan's steps.
|
|
16
|
+
* Commit fires if completionPercent ≥ 80.
|
|
13
17
|
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
* Plan execution is artifact-producing — you write files. Cross-agent
|
|
20
|
-
* spec + quality review still applies. But the spec the spec-reviewer
|
|
21
|
-
* checks against is the PLAN, not your interpretation of it.
|
|
18
|
+
* What this means for you: do the mechanical task in ONE pass and
|
|
19
|
+
* report what you did. You don't need to anticipate every reviewer
|
|
20
|
+
* concern — they fix things, they don't ping-pong with you. Don't
|
|
21
|
+
* restart-loop, don't bail on uncertainty, don't over-verify. The
|
|
22
|
+
* pipeline has a safety net BUT only one round of it.
|
|
22
23
|
*/
|
|
23
24
|
/**
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
* Without an explicit fidelity statement, workers default to "implement
|
|
27
|
-
* the goal" — which produces "improvements" that diverge from the plan
|
|
28
|
-
* (CODE SUBSTITUTION, ACCEPTANCE-CRITERIA OVERRUN). With this
|
|
29
|
-
* orientation, the worker treats the plan as authoritative and reports
|
|
30
|
-
* defects rather than silently working around them.
|
|
25
|
+
* Orientation — fidelity-first framing. Goes at the TOP of every
|
|
26
|
+
* execute-plan worker prompt.
|
|
31
27
|
*/
|
|
32
28
|
export const EXECUTE_PLAN_PURPOSE_ORIENTATION = [
|
|
33
|
-
'
|
|
34
|
-
'
|
|
35
|
-
'',
|
|
36
|
-
'The completion test: would the plan author, reading your diff, say "yes, that\'s exactly what I wrote" — or would they say "close, but you took liberties" / "wrong, you missed step 3"?',
|
|
29
|
+
'You are the mechanical executor of one task from a plan written by a higher-capability model.',
|
|
30
|
+
'Your job: implement the task EXACTLY as the plan specifies. Not improve it. Not redesign it.',
|
|
37
31
|
'',
|
|
38
|
-
'
|
|
39
|
-
'- Follow the plan EXACTLY as written. If the plan provides code blocks, use them VERBATIM (same names, same signatures, same comments, same imports).',
|
|
40
|
-
'- Do NOT redesign. Do NOT substitute your own approach. Do NOT improve names you find unidiomatic.',
|
|
41
|
-
'- Do NOT add steps the plan does not list. Do NOT skip steps the plan does list.',
|
|
42
|
-
'- Do NOT widen scope ("while I\'m here…"). Touch only what this task heading authorizes; another task probably owns the rest.',
|
|
43
|
-
'- If the plan looks wrong (typo, contradiction, undefined symbol, missing dependency): REPORT IT in your summary and stop. Do NOT silently work around it. Do NOT silently fix it.',
|
|
44
|
-
'- The plan was written by a higher-capability model than you. Your judgment about "what would be cleaner" is not load-bearing here; the plan is.',
|
|
32
|
+
'Completion test: would the plan author, reading your diff, say "yes, that\'s exactly what I wrote" — or "close, but you took liberties / missed step 3"?',
|
|
45
33
|
'',
|
|
46
|
-
'
|
|
47
|
-
'-
|
|
48
|
-
'-
|
|
34
|
+
'Three rules that override your usual coding instincts:',
|
|
35
|
+
'- Code blocks the plan provides are VERBATIM contracts. Copy them character-for-character (same names, signatures, comments, control flow). Do not rename, do not reformat, do not "simplify".',
|
|
36
|
+
'- Steps the plan lists are REQUIRED unless explicitly marked optional. Do not skip, do not reorder, do not add steps the plan does not list.',
|
|
37
|
+
'- Files outside the task\'s authorized scope are off-limits. Other tasks own other files; touching them creates merge conflicts.',
|
|
49
38
|
].join('\n');
|
|
50
39
|
export const EXECUTE_PLAN_SCOPE_RULE = [
|
|
51
40
|
'Scope:',
|
|
52
|
-
'- Strictly the task the descriptor names. Other tasks
|
|
53
|
-
'- Touch only
|
|
54
|
-
'-
|
|
55
|
-
'- Genuinely necessary cross-cutting work (e.g. updating a caller because the plan changed a signature): allowed when the plan implies it. When in doubt, REPORT it as part of your summary and let the caller decide.',
|
|
41
|
+
'- Strictly the task the descriptor names. Other tasks have other workers.',
|
|
42
|
+
'- Touch only files the named task authorizes (explicit file paths in the plan section, or files clearly implied).',
|
|
43
|
+
'- No "while I\'m here" cleanup, no refactors not in the plan, no renaming code blocks the plan provided verbatim.',
|
|
56
44
|
].join('\n');
|
|
57
45
|
/**
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
* improvements" to plans they think are imperfect. The 9 categories
|
|
62
|
-
* below are the specific ways execution diverges from intent.
|
|
46
|
+
* Top-4 failure modes — calibrated from observed worker output, not
|
|
47
|
+
* speculative. The full taxonomy of 9 was dropped to reduce cognitive
|
|
48
|
+
* load on cheap models.
|
|
63
49
|
*/
|
|
64
50
|
export const EXECUTE_PLAN_FAILURE_MODES = [
|
|
65
|
-
'
|
|
51
|
+
'The four ways execution diverges from intent — check yourself against each before declaring done:',
|
|
52
|
+
'',
|
|
53
|
+
'1. CODE SUBSTITUTION — the plan provided a code block; you wrote different code that "does the same thing". The plan\'s code is the contract — copy it verbatim. Even renaming an identifier or removing a comment is substitution.',
|
|
54
|
+
'2. STEP SKIP — the plan listed multiple steps; you did some and silently omitted others. Every step is a required deliverable unless marked optional.',
|
|
55
|
+
'3. PLAN REWRITE — you decided the plan was suboptimal and improved it. The plan author treats the plan as the contract; your improvements are a contract violation.',
|
|
56
|
+
'4. PROBLEM-NOT-FLAGGED — you noticed a defect in the plan (typo, undefined symbol, broken example) and silently worked around it. Defects must be reported in your summary so the caller can correct the plan.',
|
|
57
|
+
].join('\n');
|
|
58
|
+
/**
|
|
59
|
+
* Plan-vs-source reconciliation — handles the case where the plan names
|
|
60
|
+
* a symbol/path that doesn't exist in source (because the plan was
|
|
61
|
+
* authored against an older snapshot). Without this rule, workers either
|
|
62
|
+
* invent the missing symbol (introducing real bugs) or freeze and bail.
|
|
63
|
+
*/
|
|
64
|
+
export const PLAN_VS_SOURCE_RECONCILIATION = [
|
|
65
|
+
'Plan-vs-source reconciliation:',
|
|
66
|
+
'',
|
|
67
|
+
'When the plan names a symbol/path/import that grep against the named source files returns ZERO matches for, AND source has a single obvious near-match (same kind of symbol, Levenshtein 1-5):',
|
|
66
68
|
'',
|
|
67
|
-
'1.
|
|
68
|
-
'2.
|
|
69
|
-
'3.
|
|
70
|
-
'4. CODE SUBSTITUTION — the plan provided a code block (function body, import line, type definition) and you wrote DIFFERENT code that "does the same thing". The plan\'s code is verbatim; copy it. Renaming, reformatting, or replacing with idiomatic equivalents is substitution.',
|
|
71
|
-
'5. ACCEPTANCE-CRITERIA OVERRUN — the plan listed criteria A and B; you also delivered C ("seemed natural"). Adding extras the plan did not list is scope creep — even if C is technically good code.',
|
|
72
|
-
'6. ACCEPTANCE-CRITERIA UNDERRUN — the plan implies sub-criteria (e.g. "add the function" implies "add the export to the index file"; "fix the bug" implies "add a regression test"). Missing implicit sub-criteria is the most common silent-partial-fix in plan execution.',
|
|
73
|
-
'7. WRONG-TASK MATCH — you matched a different plan section than the descriptor names (e.g. matched "Step 4: foo" when descriptor said "Step 4: bar"). The descriptor must match the plan heading verbatim; if no unique match exists, report that and stop.',
|
|
74
|
-
'8. CROSS-TASK CONTAMINATION — you touched files the named task does not authorize, on the assumption that another task in the plan will eventually need them. Other tasks have other workers; touching their files creates merge conflicts and ownership ambiguity.',
|
|
75
|
-
'9. PROBLEM-NOT-FLAGGED — you noticed a defect in the plan (typo, contradiction, undefined symbol, broken example) and silently worked around it. The defect must be reported in your summary so the caller can correct the plan; silent workarounds make the next plan execution harder.',
|
|
69
|
+
'1. Use the actual source symbol, not the plan\'s.',
|
|
70
|
+
'2. Add a "Reconciliations" section to your final summary listing each: "Plan said X; source has Y; used Y."',
|
|
71
|
+
'3. Continue the rest of the task. Do NOT bail on "plan defect detected".',
|
|
76
72
|
'',
|
|
77
|
-
'
|
|
78
|
-
'- Plan defects you notice: ALWAYS report. The caller may have a fix or may want to update the plan first.',
|
|
79
|
-
'- Sub-criteria you cannot satisfy without deviating from the plan: report and stop. Do not pick a workaround unilaterally.',
|
|
80
|
-
'- Sub-criteria that are clearly implied but not literally stated: implement them, name them in your summary as "implicit per the task heading".',
|
|
73
|
+
'Reconciliation is NOT improvement. If the plan\'s symbol DOES exist in source and you chose a different one because it felt cleaner, that\'s CODE SUBSTITUTION (forbidden). Reconciliation is only for the genuine doesn\'t-exist-AND-near-match-exists case. If multiple plausible matches or no near-match: report and stop.',
|
|
81
74
|
].join('\n');
|
|
82
75
|
/**
|
|
83
|
-
*
|
|
84
|
-
*
|
|
85
|
-
*
|
|
86
|
-
*
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
76
|
+
* Self-verification — workers must run the plan-listed verification
|
|
77
|
+
* commands themselves before declaring done. Reviewers do not execute
|
|
78
|
+
* code; the worker has shell access and is the source of truth for
|
|
79
|
+
* "do these tests pass?".
|
|
80
|
+
*/
|
|
81
|
+
export const SELF_VERIFICATION = [
|
|
82
|
+
'Self-verification before declaring done:',
|
|
83
|
+
'',
|
|
84
|
+
'Scan the plan section for verification commands ("Run: <cmd>", "Expected: PASS", a code block under "Verify"). Execute each via your shell tool BEFORE writing your final summary. Include in your summary:',
|
|
85
|
+
'',
|
|
86
|
+
' Self-verification:',
|
|
87
|
+
' - $ <command> PASS / FAIL (<N> tests)',
|
|
88
|
+
'',
|
|
89
|
+
'If any command FAILS: do NOT declare "done". Investigate, fix, re-run. A failing test is your output, not the reviewer\'s problem. If you cannot run a command (shell unavailable, dependency missing): say so explicitly AND treat the task as incomplete.',
|
|
90
|
+
].join('\n');
|
|
91
|
+
/**
|
|
92
|
+
* Turn budget — calibration block. Cheap models default to "be
|
|
93
|
+
* thorough" and treat each turn as "let me re-verify state by
|
|
94
|
+
* re-reading", which becomes a discovery loop. This block tells them
|
|
95
|
+
* to trust their prior reads and edit confidently.
|
|
90
96
|
*/
|
|
91
|
-
export const
|
|
92
|
-
'
|
|
93
|
-
'
|
|
94
|
-
'-
|
|
95
|
-
'- "Smallest faithful change" — touch the minimum the task authorizes, in the order the plan specifies, with the code the plan provides verbatim where provided.',
|
|
96
|
-
'- If the plan is wrong: report and stop. Do NOT silently fix the plan.',
|
|
97
|
+
export const TURN_BUDGET = [
|
|
98
|
+
'Turn budget:',
|
|
99
|
+
'',
|
|
100
|
+
'A typical plan task completes in 5-15 tool calls total: read each file once, edit each file once, run verification once. If you find yourself reading the same file twice, STOP and edit — the content from your first read is in your context window. If you find yourself reading >5 files without writing any, STOP and write — you have enough context to make progress.',
|
|
97
101
|
'',
|
|
98
|
-
'
|
|
99
|
-
'- For each code block in the matched plan section, ask: did I copy this verbatim? Same names, same signatures, same comments, same imports?',
|
|
100
|
-
'- If no — what did I change? Why? Is the change required by the task or am I improving?',
|
|
101
|
-
'- Worked example. A plan section says: "Step 2: create `src/parser.ts` with content (verbatim): `export function parse(input: string): Token[] { ... }`". Naive worker writes `src/parser.ts` exporting `parseTokens` (renamed for clarity) with JSDoc added. Result: CODE SUBSTITUTION + ACCEPTANCE-CRITERIA OVERRUN. The downstream code that imports `parse` now breaks; the plan author reads the diff and says "I wrote `parse`, why is this `parseTokens`?". Correct worker creates `src/parser.ts` with exactly the named export `parse`, no JSDoc additions, no rename. If JSDoc would be valuable, mention it in the summary as a follow-up rather than adding it here.',
|
|
102
|
-
'- Most workers miss findings of this shape on first pass because the renamed/reformatted version "feels right" and they trust their instincts. The faithfulness walk forces the verbatim check.',
|
|
102
|
+
'Trust your prior reads. Trust your prior edits. The most common cheap-worker failure is restart-looping ("let me re-read both files first" repeated 50 times) instead of editing.',
|
|
103
103
|
].join('\n');
|
|
104
104
|
//# sourceMappingURL=implementer-criteria.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"implementer-criteria.js","sourceRoot":"","sources":["../../../src/tools/execute-plan/implementer-criteria.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"implementer-criteria.js","sourceRoot":"","sources":["../../../src/tools/execute-plan/implementer-criteria.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH;;;GAGG;AACH,MAAM,CAAC,MAAM,gCAAgC,GAAG;IAC9C,+FAA+F;IAC/F,8FAA8F;IAC9F,EAAE;IACF,0JAA0J;IAC1J,EAAE;IACF,wDAAwD;IACxD,gMAAgM;IAChM,8IAA8I;IAC9I,kIAAkI;CACnI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb,MAAM,CAAC,MAAM,uBAAuB,GAAG;IACrC,QAAQ;IACR,2EAA2E;IAC3E,mHAAmH;IACnH,mHAAmH;CACpH,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;GAIG;AACH,MAAM,CAAC,MAAM,0BAA0B,GAAG;IACxC,mGAAmG;IACnG,EAAE;IACF,qOAAqO;IACrO,uJAAuJ;IACvJ,qKAAqK;IACrK,gNAAgN;CACjN,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;;GAKG;AACH,MAAM,CAAC,MAAM,6BAA6B,GAAG;IAC3C,gCAAgC;IAChC,EAAE;IACF,gMAAgM;IAChM,EAAE;IACF,mDAAmD;IACnD,6GAA6G;IAC7G,0EAA0E;IAC1E,EAAE;IACF,gUAAgU;CACjU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;;GAKG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,0CAA0C;IAC1C,EAAE;IACF,6MAA6M;IAC7M,EAAE;IACF,sBAAsB;IACtB,0CAA0C;IAC1C,EAAE;IACF,6PAA6P;CAC9P,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAEb;;;;;GAKG;AACH,MAAM,CAAC,MAAM,WAAW,GAAG;IACzB,cAAc;IACd,EAAE;IACF,8WAA8W;IAC9W,EAAE;IACF,mLAAmL;CACpL,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-config.d.ts","sourceRoot":"","sources":["../../../src/tools/execute-plan/tool-config.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,mBAAmB,EAAE,MAAM,6CAA6C,CAAC;
|
|
1
|
+
{"version":3,"file":"tool-config.d.ts","sourceRoot":"","sources":["../../../src/tools/execute-plan/tool-config.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,mBAAmB,EAAE,MAAM,6CAA6C,CAAC;AAKlF,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,sCAAsC,CAAC;AACvE,OAAO,EAA4B,KAAK,oBAAoB,EAAE,MAAM,mDAAmD,CAAC;AAaxH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;kBAOxB,CAAC;AAEZ,MAAM,MAAM,oBAAoB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAE1E,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,mBAAmB,GAAG,IAAI,CAYvE;AAgFD,eAAO,MAAM,UAAU,EAAE,UAAU,CAAC,oBAAoB,EAAE,oBAAoB,CA4B7E,CAAC"}
|