muonroi-cli 1.4.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +122 -122
- package/dist/packages/agent-harness-core/src/predicate.d.ts +1 -1
- package/dist/src/agent-harness/__tests__/mock-model.spec.js +48 -1
- package/dist/src/agent-harness/mock-model.d.ts +11 -0
- package/dist/src/agent-harness/mock-model.js +21 -0
- package/dist/src/cli/cost-forensics.js +12 -12
- package/dist/src/council/__tests__/clarification-prompt.test.js +51 -0
- package/dist/src/council/__tests__/clarifier-ready-gate.test.js +32 -0
- package/dist/src/council/__tests__/decisions-lock.test.js +17 -1
- package/dist/src/council/__tests__/oauth-reachable.test.d.ts +1 -0
- package/dist/src/council/__tests__/oauth-reachable.test.js +31 -0
- package/dist/src/council/__tests__/parse-outcome-fallback.test.js +11 -0
- package/dist/src/council/clarifier.js +9 -1
- package/dist/src/council/debate.js +5 -1
- package/dist/src/council/decisions-lock.js +3 -3
- package/dist/src/council/index.js +12 -5
- package/dist/src/council/leader.d.ts +0 -17
- package/dist/src/council/leader.js +22 -15
- package/dist/src/council/planner.js +1 -1
- package/dist/src/council/prompts.js +63 -57
- package/dist/src/council/types.d.ts +7 -0
- package/dist/src/ee/__tests__/ee-onboarding.test.d.ts +1 -0
- package/dist/src/ee/__tests__/ee-onboarding.test.js +32 -0
- package/dist/src/ee/artifact-cache.d.ts +56 -0
- package/dist/src/ee/artifact-cache.js +155 -0
- package/dist/src/ee/artifact-cache.test.d.ts +1 -0
- package/dist/src/ee/artifact-cache.test.js +69 -0
- package/dist/src/ee/auth.d.ts +9 -0
- package/dist/src/ee/auth.js +19 -0
- package/dist/src/ee/ee-onboarding.d.ts +5 -0
- package/dist/src/ee/ee-onboarding.js +76 -0
- package/dist/src/ee/search.js +7 -5
- package/dist/src/ee/search.test.d.ts +1 -0
- package/dist/src/ee/search.test.js +23 -0
- package/dist/src/generated/version.d.ts +1 -1
- package/dist/src/generated/version.js +1 -1
- package/dist/src/headless/output.js +6 -4
- package/dist/src/headless/output.test.js +4 -3
- package/dist/src/index.js +20 -1
- package/dist/src/mcp/__tests__/auto-setup.test.js +74 -0
- package/dist/src/mcp/__tests__/client-pool.spec.d.ts +1 -0
- package/dist/src/mcp/__tests__/client-pool.spec.js +98 -0
- package/dist/src/mcp/__tests__/parallel-build.spec.d.ts +1 -0
- package/dist/src/mcp/__tests__/parallel-build.spec.js +67 -0
- package/dist/src/mcp/__tests__/smart-filter.test.js +56 -0
- package/dist/src/mcp/auto-setup.js +56 -2
- package/dist/src/mcp/client-pool.d.ts +46 -0
- package/dist/src/mcp/client-pool.js +212 -0
- package/dist/src/mcp/oauth-callback.js +2 -2
- package/dist/src/mcp/parse-headers.test.js +14 -14
- package/dist/src/mcp/runtime.d.ts +28 -0
- package/dist/src/mcp/runtime.js +117 -51
- package/dist/src/mcp/self-verify-runner.d.ts +14 -0
- package/dist/src/mcp/self-verify-runner.js +38 -0
- package/dist/src/mcp/setup-guide-text.d.ts +9 -0
- package/dist/src/mcp/setup-guide-text.js +84 -0
- package/dist/src/mcp/smart-filter.js +49 -0
- package/dist/src/mcp/smoke.test.js +43 -43
- package/dist/src/mcp/tools-server.d.ts +7 -0
- package/dist/src/mcp/tools-server.js +19 -22
- package/dist/src/models/catalog.json +349 -349
- package/dist/src/ops/__tests__/doctor-ee-health.test.js +21 -0
- package/dist/src/ops/doctor.d.ts +3 -2
- package/dist/src/ops/doctor.js +47 -11
- package/dist/src/ops/doctor.test.js +4 -3
- package/dist/src/orchestrator/__tests__/mcp-capability-block.test.d.ts +1 -0
- package/dist/src/orchestrator/__tests__/mcp-capability-block.test.js +39 -0
- package/dist/src/orchestrator/__tests__/project-stack.test.d.ts +1 -0
- package/dist/src/orchestrator/__tests__/project-stack.test.js +65 -0
- package/dist/src/orchestrator/batch-turn-runner.js +7 -11
- package/dist/src/orchestrator/compaction.d.ts +2 -0
- package/dist/src/orchestrator/compaction.js +14 -1
- package/dist/src/orchestrator/compaction.test.js +25 -1
- package/dist/src/orchestrator/message-processor.js +72 -32
- package/dist/src/orchestrator/orchestrator.js +26 -0
- package/dist/src/orchestrator/prompts.d.ts +51 -0
- package/dist/src/orchestrator/prompts.js +257 -134
- package/dist/src/orchestrator/scope-ceiling.js +6 -1
- package/dist/src/orchestrator/scope-reminder.d.ts +12 -0
- package/dist/src/orchestrator/scope-reminder.js +16 -0
- package/dist/src/orchestrator/scope-reminder.test.js +22 -1
- package/dist/src/orchestrator/stream-runner.js +23 -15
- package/dist/src/orchestrator/subagent-compactor.d.ts +14 -5
- package/dist/src/orchestrator/subagent-compactor.js +30 -8
- package/dist/src/orchestrator/subagent-compactor.spec.js +18 -0
- package/dist/src/orchestrator/text-tool-call-detector.test.js +13 -13
- package/dist/src/pil/__tests__/clarity-gate.test.js +24 -215
- package/dist/src/pil/__tests__/config.test.js +1 -17
- package/dist/src/pil/__tests__/discovery.test.js +144 -11
- package/dist/src/pil/__tests__/layer1-intent-trace.test.js +7 -2
- package/dist/src/pil/__tests__/layer1-intent.test.js +3 -0
- package/dist/src/pil/__tests__/layer16-clarity.test.js +32 -116
- package/dist/src/pil/__tests__/layer4-gsd.test.js +37 -0
- package/dist/src/pil/__tests__/layer6-output.test.js +158 -18
- package/dist/src/pil/__tests__/llm-classify.test.js +49 -2
- package/dist/src/pil/__tests__/surface-compaction-artifacts.test.d.ts +1 -0
- package/dist/src/pil/__tests__/surface-compaction-artifacts.test.js +112 -0
- package/dist/src/pil/agent-operating-contract.d.ts +1 -1
- package/dist/src/pil/agent-operating-contract.js +2 -0
- package/dist/src/pil/agent-operating-contract.test.js +7 -2
- package/dist/src/pil/cheap-model-playbook.js +35 -35
- package/dist/src/pil/cheap-model-workbooks.js +16 -13
- package/dist/src/pil/clarity-gate.d.ts +21 -19
- package/dist/src/pil/clarity-gate.js +26 -153
- package/dist/src/pil/config.d.ts +9 -1
- package/dist/src/pil/config.js +15 -4
- package/dist/src/pil/discovery.js +211 -136
- package/dist/src/pil/layer1-intent.d.ts +12 -0
- package/dist/src/pil/layer1-intent.js +283 -38
- package/dist/src/pil/layer1-intent.test.js +210 -4
- package/dist/src/pil/layer16-clarity.d.ts +25 -11
- package/dist/src/pil/layer16-clarity.js +19 -306
- package/dist/src/pil/layer3-ee-injection.d.ts +19 -0
- package/dist/src/pil/layer3-ee-injection.js +96 -4
- package/dist/src/pil/layer4-gsd.js +18 -6
- package/dist/src/pil/layer6-output.d.ts +2 -0
- package/dist/src/pil/layer6-output.js +151 -25
- package/dist/src/pil/llm-classify.d.ts +26 -0
- package/dist/src/pil/llm-classify.js +34 -5
- package/dist/src/pil/native-capabilities-workbook.d.ts +1 -1
- package/dist/src/pil/native-capabilities-workbook.js +82 -76
- package/dist/src/pil/pipeline.js +15 -9
- package/dist/src/pil/schema.d.ts +8 -0
- package/dist/src/pil/schema.js +12 -1
- package/dist/src/pil/task-tier-map.js +4 -0
- package/dist/src/pil/types.d.ts +11 -1
- package/dist/src/product-loop/done-gate.js +3 -3
- package/dist/src/product-loop/loop-driver.js +18 -18
- package/dist/src/product-loop/progress-snapshot.js +4 -4
- package/dist/src/providers/auth/gemini-oauth.js +6 -15
- package/dist/src/providers/auth/grok-oauth.js +6 -15
- package/dist/src/providers/auth/openai-oauth.js +6 -15
- package/dist/src/providers/mcp-vision-bridge.js +48 -48
- package/dist/src/reporter/index.js +1 -1
- package/dist/src/scaffold/bb-ecosystem-apply.js +47 -47
- package/dist/src/scaffold/bb-quality-gate.js +5 -5
- package/dist/src/scaffold/continuation-prompt.js +60 -60
- package/dist/src/scaffold/init-new.js +453 -453
- package/dist/src/self-qa/__tests__/scenario-planner.test.js +3 -3
- package/dist/src/self-qa/agentic-loop.js +24 -19
- package/dist/src/self-qa/spec-emitter.js +26 -23
- package/dist/src/storage/__tests__/migrations.test.js +2 -2
- package/dist/src/storage/interaction-log.js +5 -5
- package/dist/src/storage/migrations.js +122 -122
- package/dist/src/storage/sessions.js +42 -42
- package/dist/src/storage/transcript.js +91 -84
- package/dist/src/storage/usage.js +14 -14
- package/dist/src/storage/workspaces.js +12 -12
- package/dist/src/tools/__tests__/native-tools.test.d.ts +1 -0
- package/dist/src/tools/__tests__/native-tools.test.js +53 -0
- package/dist/src/tools/git-safety.d.ts +61 -0
- package/dist/src/tools/git-safety.js +141 -0
- package/dist/src/tools/git-safety.test.d.ts +1 -0
- package/dist/src/tools/git-safety.test.js +111 -0
- package/dist/src/tools/native-tools.d.ts +31 -0
- package/dist/src/tools/native-tools.js +273 -0
- package/dist/src/tools/registry-ee-query.test.js +18 -1
- package/dist/src/tools/registry-git-safety.test.d.ts +7 -0
- package/dist/src/tools/registry-git-safety.test.js +92 -0
- package/dist/src/tools/registry.js +52 -6
- package/dist/src/ui/__tests__/markdown-render.test.d.ts +1 -0
- package/dist/src/ui/__tests__/markdown-render.test.js +48 -0
- package/dist/src/ui/app.js +0 -0
- package/dist/src/ui/components/message-view.js +4 -1
- package/dist/src/ui/components/structured-response-view.js +7 -3
- package/dist/src/ui/components/tool-group.js +7 -1
- package/dist/src/ui/markdown-render.d.ts +41 -0
- package/dist/src/ui/markdown-render.js +223 -0
- package/dist/src/ui/markdown.d.ts +10 -0
- package/dist/src/ui/markdown.js +12 -35
- package/dist/src/ui/slash/council-inspect.js +4 -4
- package/dist/src/ui/slash/export.js +4 -4
- package/dist/src/ui/utils/text.d.ts +8 -0
- package/dist/src/ui/utils/text.js +16 -0
- package/dist/src/ui/utils/text.test.d.ts +1 -0
- package/dist/src/ui/utils/text.test.js +23 -0
- package/dist/src/usage/ledger.js +48 -15
- package/dist/src/utils/__tests__/footprint-gitignore.test.d.ts +1 -0
- package/dist/src/utils/__tests__/footprint-gitignore.test.js +50 -0
- package/dist/src/utils/clipboard-image.js +23 -23
- package/dist/src/utils/open-url.d.ts +56 -0
- package/dist/src/utils/open-url.js +58 -0
- package/dist/src/utils/open-url.test.d.ts +1 -0
- package/dist/src/utils/open-url.test.js +86 -0
- package/dist/src/utils/settings.d.ts +12 -0
- package/dist/src/utils/settings.js +48 -0
- package/dist/src/utils/side-question.js +2 -2
- package/dist/src/utils/skills.js +3 -3
- package/dist/src/verify/__tests__/coverage-parsers.test.js +30 -30
- package/dist/src/verify/environment.js +2 -1
- package/package.json +1 -1
- package/dist/src/pil/layer16-clarity.test.js +0 -31
- /package/dist/src/{pil/layer16-clarity.test.d.ts → council/__tests__/clarification-prompt.test.d.ts} +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { beforeEach, describe, expect, it, vi } from "vitest";
|
|
2
|
-
import { applyPilSuffix, getResponseToolSet, layer6Output } from "../layer6-output.js";
|
|
2
|
+
import { applyPilSuffix, getResponseToolSet, isImplementationIntent, isQuestionLike, layer6Output, } from "../layer6-output.js";
|
|
3
3
|
// Mock bridge for PIL-03 classifyViaBrain tests
|
|
4
4
|
vi.mock("../../ee/bridge.js", () => ({
|
|
5
5
|
classifyViaBrain: vi.fn().mockResolvedValue(null),
|
|
@@ -55,6 +55,38 @@ describe("applyPilSuffix — per-task-type suffixes", () => {
|
|
|
55
55
|
expect(result).toMatch(/Tôi sẽ/); // bilingual
|
|
56
56
|
}
|
|
57
57
|
});
|
|
58
|
+
it("de-robotized: NO_PREAMBLE bans only openers, not end-of-turn summary or inter-tool narration", () => {
|
|
59
|
+
// The summary + inter-tool bans were removed because they stripped natural
|
|
60
|
+
// connective tissue (the "máy móc" feel). Inter-tool spam is still removed
|
|
61
|
+
// structurally by stripInterToolNarration() in reasoning.ts. This guards
|
|
62
|
+
// against the bans silently creeping back into the system prompt.
|
|
63
|
+
const result = applyPilSuffix("S", makeCtx("debug", "concise"));
|
|
64
|
+
expect(result).toMatch(/FORBIDDEN OPENERS/);
|
|
65
|
+
expect(result).not.toMatch(/FORBIDDEN END-OF-TURN SUMMARY/);
|
|
66
|
+
expect(result).not.toMatch(/FORBIDDEN INTER-TOOL NARRATION/);
|
|
67
|
+
});
|
|
68
|
+
it("de-robotized: debug suffix is guidance, not a rigid arrow skeleton", () => {
|
|
69
|
+
// "Format = Hypothesis → Root cause → Fix → Verify" produced stilted,
|
|
70
|
+
// label-prefixed answers. It must read as guidance now.
|
|
71
|
+
const result = applyPilSuffix("S", makeCtx("debug", "concise"));
|
|
72
|
+
expect(result).toContain("OUTPUT RULES (debug)");
|
|
73
|
+
expect(result).not.toMatch(/Format = Hypothesis/);
|
|
74
|
+
});
|
|
75
|
+
it("E: appends the anti-bookkeeping note on the natural path for non-question turns", () => {
|
|
76
|
+
// The contract's REPORTING rule leaks as a provenance footer ("evidence only
|
|
77
|
+
// from this turn") on imperative answer turns; the natural path now guards it.
|
|
78
|
+
const result = applyPilSuffix("S", makeCtx("analyze", "concise"));
|
|
79
|
+
expect(result).toMatch(/WRITE FOR THE READER/);
|
|
80
|
+
expect(result).toMatch(/provenance/i);
|
|
81
|
+
});
|
|
82
|
+
it("E: skips the anti-bookkeeping note for question turns (L4 QUESTION directive covers them)", () => {
|
|
83
|
+
const ctx = { ...makeCtx("analyze", "concise"), raw: "why does the enrichment layer fail?" };
|
|
84
|
+
expect(applyPilSuffix("S", ctx)).not.toMatch(/WRITE FOR THE READER/);
|
|
85
|
+
});
|
|
86
|
+
it("E: response-tools path does not add the natural-path bookkeeping note", () => {
|
|
87
|
+
const result = applyPilSuffix("S", makeCtx("analyze", "balanced"), true);
|
|
88
|
+
expect(result).not.toMatch(/WRITE FOR THE READER/);
|
|
89
|
+
});
|
|
58
90
|
it("PIL-04: response-tools path skips budget+preamble (tool already enforces structure)", () => {
|
|
59
91
|
const result = applyPilSuffix("S", makeCtx("analyze", "balanced"), true);
|
|
60
92
|
expect(result).toContain("respond_analyze");
|
|
@@ -74,29 +106,33 @@ describe("applyPilSuffix — per-task-type suffixes", () => {
|
|
|
74
106
|
expect(result).toMatch(/Do NOT append an evidence-provenance footer/);
|
|
75
107
|
});
|
|
76
108
|
});
|
|
77
|
-
describe("getResponseToolSet —
|
|
78
|
-
|
|
79
|
-
|
|
109
|
+
describe("getResponseToolSet — narrow gating (de-robotizing)", () => {
|
|
110
|
+
// Override raw on a typed ctx so the report/question discriminator is exercised.
|
|
111
|
+
const ctxRaw = (raw, t) => ({ ...makeCtx(t, null), raw });
|
|
112
|
+
it("returns response tool for analyze on an explicit report/list request", () => {
|
|
113
|
+
const tools = getResponseToolSet(ctxRaw("audit the orchestrator and list all cost-leak findings", "analyze"));
|
|
80
114
|
expect(Object.keys(tools)).toContain("respond_analyze");
|
|
81
115
|
});
|
|
82
|
-
it("returns response tool for plan
|
|
83
|
-
const tools = getResponseToolSet(
|
|
116
|
+
it("returns response tool for plan on an explicit plan request", () => {
|
|
117
|
+
const tools = getResponseToolSet(ctxRaw("plan the migration to the new auth flow step by step", "plan"));
|
|
84
118
|
expect(Object.keys(tools)).toContain("respond_plan");
|
|
85
119
|
});
|
|
120
|
+
it("returns response tool for debug only on an explicit report request", () => {
|
|
121
|
+
const tools = getResponseToolSet(ctxRaw("audit the failing suite and list each root cause", "debug"));
|
|
122
|
+
expect(Object.keys(tools)).toContain("respond_debug");
|
|
123
|
+
});
|
|
86
124
|
it("returns empty toolset for generate (code-heavy, markdown wins)", () => {
|
|
87
125
|
expect(getResponseToolSet(makeCtx("generate", null))).toEqual({});
|
|
88
126
|
});
|
|
89
127
|
it("returns empty toolset for refactor (diff-heavy, markdown wins)", () => {
|
|
90
128
|
expect(getResponseToolSet(makeCtx("refactor", null))).toEqual({});
|
|
91
129
|
});
|
|
92
|
-
it("returns response tool for debug (bounded schema, structural enforcement wins)", () => {
|
|
93
|
-
const tools = getResponseToolSet(makeCtx("debug", null));
|
|
94
|
-
expect(Object.keys(tools)).toContain("respond_debug");
|
|
95
|
-
});
|
|
96
130
|
it("returns empty toolset for documentation (prose-heavy)", () => {
|
|
97
131
|
expect(getResponseToolSet(makeCtx("documentation", null))).toEqual({});
|
|
98
132
|
});
|
|
99
|
-
it("returns response tool for general
|
|
133
|
+
it("returns response tool for general regardless of report signal (renders as plain markdown)", () => {
|
|
134
|
+
// general is exempt from the report/question gate: GeneralSchema is pure text
|
|
135
|
+
// and its renderer shows plain markdown, so respond_general is never robotic.
|
|
100
136
|
const tools = getResponseToolSet(makeCtx("general", null));
|
|
101
137
|
expect(Object.keys(tools)).toContain("respond_general");
|
|
102
138
|
});
|
|
@@ -113,24 +149,87 @@ describe("getResponseToolSet — PIL-04 Tier 1.1 gating", () => {
|
|
|
113
149
|
it("returns empty toolset when taskType is null", () => {
|
|
114
150
|
expect(getResponseToolSet(makeCtx(null, null))).toEqual({});
|
|
115
151
|
});
|
|
152
|
+
it("gates the response tool for chitchat turns", () => {
|
|
153
|
+
const ctx = { ...makeCtx("general", null), intentKind: "chitchat" };
|
|
154
|
+
expect(getResponseToolSet(ctx)).toEqual({});
|
|
155
|
+
});
|
|
156
|
+
it("DROPS respond_<task> for question-style debug/analyze/plan (natural markdown path)", () => {
|
|
157
|
+
// The de-robotizing change: a plain QUESTION must not be forced into the
|
|
158
|
+
// rigid respond_* schema + labeled renderer. It falls through to the softened
|
|
159
|
+
// markdown OUTPUT RULES so the answer reads as natural prose.
|
|
160
|
+
expect(getResponseToolSet(ctxRaw("why does the build fail intermittently?", "debug"))).toEqual({});
|
|
161
|
+
expect(getResponseToolSet(ctxRaw("analyze how the enrichment function works", "analyze"))).toEqual({});
|
|
162
|
+
expect(getResponseToolSet(ctxRaw("what is the cleanest way to structure this module?", "plan"))).toEqual({});
|
|
163
|
+
});
|
|
164
|
+
it("KEEPS respond_<task> for explicit report / list / plan requests (EN + VI)", () => {
|
|
165
|
+
const keep = (raw, t) => Object.keys(getResponseToolSet(ctxRaw(raw, t)));
|
|
166
|
+
expect(keep("list all cost leaks in the orchestrator", "analyze")).toContain("respond_analyze");
|
|
167
|
+
expect(keep("review the module and report each finding by severity", "analyze")).toContain("respond_analyze");
|
|
168
|
+
expect(keep("lập kế hoạch migration sang auth flow mới", "plan")).toContain("respond_plan");
|
|
169
|
+
});
|
|
170
|
+
it("DROPS respond_<task> for a QUESTION that merely mentions plan/list (narrow-gate fix)", () => {
|
|
171
|
+
// Live bug (grok interview): a question that QUOTED the phrase "state a 2-3
|
|
172
|
+
// line plan" matched the bare word 'plan' in STRUCTURED_REPORT_RE and forced
|
|
173
|
+
// respond_plan, cramming an introspective answer into a rigid plan schema. A
|
|
174
|
+
// question-shaped prompt must stay on the natural markdown path even when it
|
|
175
|
+
// contains plan/list words.
|
|
176
|
+
expect(getResponseToolSet(ctxRaw("what rules constrain you, e.g. the 'state a 2-3 line plan' directive?", "plan"))).toEqual({});
|
|
177
|
+
expect(getResponseToolSet(ctxRaw("can you list the main points?", "analyze"))).toEqual({});
|
|
178
|
+
expect(getResponseToolSet(ctxRaw("how would you plan the rollout?", "plan"))).toEqual({});
|
|
179
|
+
// Imperative delivery requests are NOT question-shaped → still structured.
|
|
180
|
+
expect(Object.keys(getResponseToolSet(ctxRaw("plan the rollout step by step", "plan")))).toContain("respond_plan");
|
|
181
|
+
});
|
|
116
182
|
it("drops respond_<task> on an IMPLEMENTATION-intent prompt (no premature terminal answer)", () => {
|
|
117
183
|
// Live (grok session 19fa8895c41c): an "Improve … implement these fixes"
|
|
118
184
|
// prompt classified `debug` got respond_debug; the model called it mid-task
|
|
119
185
|
// as a plan and the turn ended before the edits completed. Implementation
|
|
120
186
|
// turns must fall through to markdown OUTPUT RULES, not a terminal tool.
|
|
187
|
+
// Implementation intent takes precedence over a report signal.
|
|
121
188
|
const impl = (raw, t) => ({ ...makeCtx(t, null), raw });
|
|
122
189
|
expect(getResponseToolSet(impl("Improve the story-list screen. Implement these prioritized fixes: …", "debug"))).toEqual({});
|
|
123
190
|
expect(getResponseToolSet(impl("Edit ONLY these two files and fix the empty span", "debug"))).toEqual({});
|
|
124
191
|
expect(getResponseToolSet(impl("refactor the genre dropdown and wire up keyboard handlers", "analyze"))).toEqual({});
|
|
125
192
|
expect(getResponseToolSet(impl("triển khai các cải tiến đã đề xuất", "plan"))).toEqual({});
|
|
126
193
|
});
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
194
|
+
});
|
|
195
|
+
describe("getResponseToolSet — Phase 2b deliverableKind consume (model overrides regex)", () => {
|
|
196
|
+
const ctxD = (raw, t, deliverableKind) => ({
|
|
197
|
+
...makeCtx(t, null),
|
|
198
|
+
raw,
|
|
199
|
+
deliverableKind,
|
|
200
|
+
});
|
|
201
|
+
it("deliverableKind='code' DROPS respond_* even when the prompt reads as a report/list", () => {
|
|
202
|
+
// Legacy regex (prefersStructuredReport) would KEEP the tool on "list all …".
|
|
203
|
+
// The model said the deliverable is code → drop it (edits, not a report).
|
|
204
|
+
expect(getResponseToolSet(ctxD("list all cost leaks in the orchestrator", "analyze", "code"))).toEqual({});
|
|
205
|
+
});
|
|
206
|
+
it("deliverableKind='report' KEEPS respond_* even when the prompt is question-shaped", () => {
|
|
207
|
+
// Legacy regex (isQuestionLike) would DROP the tool on "why does …?". The
|
|
208
|
+
// model said the deliverable is a structured report → keep it.
|
|
209
|
+
const tools = getResponseToolSet(ctxD("why does the suite fail — break it down by cause", "analyze", "report"));
|
|
210
|
+
expect(Object.keys(tools)).toContain("respond_analyze");
|
|
211
|
+
});
|
|
212
|
+
it("deliverableKind='answer' DROPS respond_* for non-general even on a report-shaped request", () => {
|
|
213
|
+
expect(getResponseToolSet(ctxD("plan the migration step by step", "plan", "answer"))).toEqual({});
|
|
214
|
+
});
|
|
215
|
+
it("deliverableKind='answer' KEEPS respond_general (general is exempt — renders as plain markdown)", () => {
|
|
216
|
+
const tools = getResponseToolSet(ctxD("what does the enrichment layer do?", "general", "answer"));
|
|
217
|
+
expect(Object.keys(tools)).toContain("respond_general");
|
|
218
|
+
});
|
|
219
|
+
it("falls back to the legacy regex when deliverableKind is absent (null)", () => {
|
|
220
|
+
// No model signal → legacy path: question-shaped analyze drops the tool.
|
|
221
|
+
expect(getResponseToolSet({ ...makeCtx("analyze", null), raw: "why does the build fail?" })).toEqual({});
|
|
222
|
+
// …and an explicit report request keeps it.
|
|
223
|
+
expect(Object.keys(getResponseToolSet({ ...makeCtx("analyze", null), raw: "list all cost leaks" }))).toContain("respond_analyze");
|
|
224
|
+
});
|
|
225
|
+
it("DROPS respond_* on an implement turn even when mis-classified as report (session 2b7a10219499)", () => {
|
|
226
|
+
// "lên plan rồi improvement … cải thiện X" is an implement turn the model
|
|
227
|
+
// tagged deliverable=report; the report-exception used to KEEP respond_plan,
|
|
228
|
+
// so the model stated a plan and ended the turn with edits done but
|
|
229
|
+
// uncommitted/unreported. Implementation intent must suppress the terminal
|
|
230
|
+
// tool BEFORE the deliverable branch is consulted.
|
|
231
|
+
expect(getResponseToolSet(ctxD("lên plan rồi improvement nhé, focus cải thiện Compaction", "plan", "report"))).toEqual({});
|
|
232
|
+
expect(getResponseToolSet(ctxD("improve the compactor and implement the fix", "plan", "report"))).toEqual({});
|
|
134
233
|
});
|
|
135
234
|
});
|
|
136
235
|
describe("applyPilSuffix — outputStyle variants", () => {
|
|
@@ -294,4 +393,45 @@ describe("layer6Output", () => {
|
|
|
294
393
|
expect(result.enriched).toBe(ctx.enriched);
|
|
295
394
|
});
|
|
296
395
|
});
|
|
396
|
+
describe("isQuestionLike — Vietnamese yes/no question frames (regression: session f6f7881a5fae)", () => {
|
|
397
|
+
it("detects the live miss: 'check ... dùng được mcp ... không nhé'", () => {
|
|
398
|
+
// The exact prompt that was mis-routed to the implement/verify scaffold.
|
|
399
|
+
expect(isQuestionLike("bạn check xem dùng được mcp muonroi-docs không nhé")).toBe(true);
|
|
400
|
+
// It is NOT an implementation intent, so layer4-gsd's informational gate fires.
|
|
401
|
+
expect(isImplementationIntent("bạn check xem dùng được mcp muonroi-docs không nhé")).toBe(false);
|
|
402
|
+
});
|
|
403
|
+
it("detects common VI yes/no tails", () => {
|
|
404
|
+
expect(isQuestionLike("dùng được không")).toBe(true);
|
|
405
|
+
expect(isQuestionLike("cái này chạy được không vậy")).toBe(true);
|
|
406
|
+
expect(isQuestionLike("đúng không")).toBe(true);
|
|
407
|
+
expect(isQuestionLike("phải không nhỉ")).toBe(true);
|
|
408
|
+
expect(isQuestionLike("test đã pass chưa")).toBe(true);
|
|
409
|
+
expect(isQuestionLike("xong chưa ạ")).toBe(true);
|
|
410
|
+
expect(isQuestionLike("có chạy được không?")).toBe(true);
|
|
411
|
+
});
|
|
412
|
+
it("does NOT treat a mid-sentence negation as a question", () => {
|
|
413
|
+
// "không là hỏng" = "or it breaks" — 'không' is not the clause-final particle.
|
|
414
|
+
expect(isQuestionLike("đừng commit file .env không là lộ key")).toBe(false);
|
|
415
|
+
// Plain imperative with a 'nhé' softener (no 'không'/'chưa' tail) stays a task.
|
|
416
|
+
expect(isQuestionLike("sửa giúp tôi cái này nhé")).toBe(false);
|
|
417
|
+
expect(isQuestionLike("triển khai tính năng login")).toBe(false);
|
|
418
|
+
});
|
|
419
|
+
it("still detects the pre-existing EN/VI question shapes", () => {
|
|
420
|
+
expect(isQuestionLike("why does the build fail?")).toBe(true);
|
|
421
|
+
expect(isQuestionLike("tại sao build lỗi")).toBe(true);
|
|
422
|
+
expect(isQuestionLike("explain the pipeline")).toBe(true);
|
|
423
|
+
});
|
|
424
|
+
});
|
|
425
|
+
describe("isImplementationIntent — improve / cải thiện (regression: session 2b7a10219499)", () => {
|
|
426
|
+
it("recognises improve/improvement + VI cải thiện as implement turns", () => {
|
|
427
|
+
expect(isImplementationIntent("improve the compactor")).toBe(true);
|
|
428
|
+
expect(isImplementationIntent("lên plan rồi improvement nhé")).toBe(true);
|
|
429
|
+
expect(isImplementationIntent("focus cải thiện Compaction")).toBe(true);
|
|
430
|
+
expect(isImplementationIntent("cai thien phan compaction")).toBe(true);
|
|
431
|
+
});
|
|
432
|
+
it("does not over-match analysis questions that merely describe behaviour", () => {
|
|
433
|
+
expect(isImplementationIntent("what does the enrichment layer do?")).toBe(false);
|
|
434
|
+
expect(isImplementationIntent("why does the suite fail — break it down")).toBe(false);
|
|
435
|
+
});
|
|
436
|
+
});
|
|
297
437
|
//# sourceMappingURL=layer6-output.test.js.map
|
|
@@ -23,6 +23,32 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
|
|
|
23
23
|
expect(result?.outputStyle).toBe("concise");
|
|
24
24
|
expect(result?.confidence).toBeGreaterThan(0.5);
|
|
25
25
|
});
|
|
26
|
+
it("parses the three-word reply and marks chitchat from the intent word", async () => {
|
|
27
|
+
const handle = installMockModel({ fixture: { stream: textOnlyStream("general,concise,chat") } });
|
|
28
|
+
cleanup = handle.uninstall;
|
|
29
|
+
const factory = (() => handle.model);
|
|
30
|
+
const classify = createLlmClassifier(factory, "deepseek-v4-flash");
|
|
31
|
+
const result = await classify("cảm ơn bạn nhé");
|
|
32
|
+
expect(result?.taskType).toBe("general");
|
|
33
|
+
expect(result?.intentKind).toBe("chitchat");
|
|
34
|
+
});
|
|
35
|
+
it("treats a general QUESTION as task, not chitchat (keep-tools)", async () => {
|
|
36
|
+
const handle = installMockModel({ fixture: { stream: textOnlyStream("general,concise,task") } });
|
|
37
|
+
cleanup = handle.uninstall;
|
|
38
|
+
const factory = (() => handle.model);
|
|
39
|
+
const classify = createLlmClassifier(factory, "deepseek-v4-flash");
|
|
40
|
+
const result = await classify("bạn thử call tool setup_guide xem được không");
|
|
41
|
+
expect(result?.intentKind).toBe("task");
|
|
42
|
+
});
|
|
43
|
+
it("defaults intentKind to task when the model omits the third word (backward compatible)", async () => {
|
|
44
|
+
const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise") } });
|
|
45
|
+
cleanup = handle.uninstall;
|
|
46
|
+
const factory = (() => handle.model);
|
|
47
|
+
const classify = createLlmClassifier(factory, "deepseek-v4-flash");
|
|
48
|
+
const result = await classify("fix the failing build");
|
|
49
|
+
expect(result?.taskType).toBe("debug");
|
|
50
|
+
expect(result?.intentKind).toBe("task");
|
|
51
|
+
});
|
|
26
52
|
it("returns null when the reply cannot be parsed", async () => {
|
|
27
53
|
const handle = installMockModel({ fixture: { stream: textOnlyStream("¯\\_(ツ)_/¯") } });
|
|
28
54
|
cleanup = handle.uninstall;
|
|
@@ -100,14 +126,35 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
|
|
|
100
126
|
expect(result?.taskType).toBe("debug");
|
|
101
127
|
expect(result?.outputStyle).toBe("concise");
|
|
102
128
|
});
|
|
103
|
-
it("keeps
|
|
129
|
+
it("keeps a tiny output budget for non-reasoning models (24 — four comma words)", async () => {
|
|
104
130
|
const handle = installMockModel({ fixture: { stream: textOnlyStream("generate,concise") } });
|
|
105
131
|
cleanup = handle.uninstall;
|
|
106
132
|
const factory = (() => handle.model);
|
|
107
133
|
const classify = createLlmClassifier(factory, "Qwen/Qwen3-8B"); // reasoning:false
|
|
108
134
|
await classify("add a new endpoint");
|
|
109
135
|
const call = handle.calls[0];
|
|
110
|
-
expect(call.maxOutputTokens).toBe(
|
|
136
|
+
expect(call.maxOutputTokens).toBe(24);
|
|
137
|
+
});
|
|
138
|
+
it("parses the fourth word as the output deliverable (Phase 2b)", async () => {
|
|
139
|
+
const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code") } });
|
|
140
|
+
cleanup = handle.uninstall;
|
|
141
|
+
const factory = (() => handle.model);
|
|
142
|
+
const classify = createLlmClassifier(factory, "deepseek-v4-flash");
|
|
143
|
+
const result = await classify("fix the crash in src/auth/login.ts");
|
|
144
|
+
expect(result?.taskType).toBe("debug");
|
|
145
|
+
expect(result?.deliverableKind).toBe("code");
|
|
146
|
+
});
|
|
147
|
+
it("recovers the deliverable position-independently and defaults to null when absent", async () => {
|
|
148
|
+
const reportHandle = installMockModel({ fixture: { stream: textOnlyStream("analyze,concise,task,report") } });
|
|
149
|
+
cleanup = reportHandle.uninstall;
|
|
150
|
+
const reportClassify = createLlmClassifier((() => reportHandle.model), "deepseek-v4-flash");
|
|
151
|
+
expect((await reportClassify("list every env var the CLI reads"))?.deliverableKind).toBe("report");
|
|
152
|
+
reportHandle.uninstall();
|
|
153
|
+
// Model omits the 4th word → deliverableKind null (consumers fall back to regex).
|
|
154
|
+
const bareHandle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise") } });
|
|
155
|
+
cleanup = bareHandle.uninstall;
|
|
156
|
+
const bareClassify = createLlmClassifier((() => bareHandle.model), "deepseek-v4-flash");
|
|
157
|
+
expect((await bareClassify("fix it"))?.deliverableKind).toBeNull();
|
|
111
158
|
});
|
|
112
159
|
});
|
|
113
160
|
//# sourceMappingURL=llm-classify.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import { beforeEach, describe, expect, test, vi } from "vitest";
|
|
2
|
+
import { surfaceCompactionArtifacts } from "../layer3-ee-injection.js";
|
|
3
|
+
// Issue #4 — targeted complement to layer3's checkpoint arm on meta turns.
|
|
4
|
+
// layer3 (now run on meta after issue #2) surfaces checkpoints via a FIXED
|
|
5
|
+
// recency query; this arm searches by the meta question (ctx.raw) to surface the
|
|
6
|
+
// elided tool-artifacts relevant to it, and DEFERS when layer3 already injected a
|
|
7
|
+
// checkpoint block. Mock the EE search + the audit log so the test stays offline.
|
|
8
|
+
vi.mock("../../ee/bridge.js", () => ({
|
|
9
|
+
searchByText: vi.fn().mockResolvedValue([]),
|
|
10
|
+
}));
|
|
11
|
+
vi.mock("../../storage/interaction-log.js", () => ({
|
|
12
|
+
logInteraction: vi.fn(),
|
|
13
|
+
}));
|
|
14
|
+
import { searchByText } from "../../ee/bridge.js";
|
|
15
|
+
function makeCtx(overrides = {}) {
|
|
16
|
+
return {
|
|
17
|
+
raw: "compaction cần cải thiện gì trong CLI",
|
|
18
|
+
enriched: "compaction cần cải thiện gì trong CLI",
|
|
19
|
+
taskType: "general",
|
|
20
|
+
domain: null,
|
|
21
|
+
confidence: 0.85,
|
|
22
|
+
outputStyle: "balanced",
|
|
23
|
+
tokenBudget: 2000,
|
|
24
|
+
metrics: null,
|
|
25
|
+
layers: [],
|
|
26
|
+
sessionId: "sess-meta-1",
|
|
27
|
+
...overrides,
|
|
28
|
+
};
|
|
29
|
+
}
|
|
30
|
+
const artifactPoint = {
|
|
31
|
+
id: "art1",
|
|
32
|
+
score: 0.9,
|
|
33
|
+
payload: {
|
|
34
|
+
text: "tool-artifact id=call_7 toolName=read_file elided 4200 chars: src/orchestrator/compaction.ts createCompactionSummaryMessage ...",
|
|
35
|
+
},
|
|
36
|
+
collection: "experience-behavioral",
|
|
37
|
+
};
|
|
38
|
+
const checkpointPoint = {
|
|
39
|
+
id: "cp1",
|
|
40
|
+
score: 0.8,
|
|
41
|
+
payload: { text: "Context checkpoint summary ✔ DONE: extended IMPORTANT_TOOL_NAMES; tests 16/16" },
|
|
42
|
+
collection: "experience-behavioral",
|
|
43
|
+
};
|
|
44
|
+
const genericPoint = {
|
|
45
|
+
id: "gen1",
|
|
46
|
+
score: 0.97,
|
|
47
|
+
payload: { text: "Always run the full test suite before pushing" },
|
|
48
|
+
collection: "experience-behavioral",
|
|
49
|
+
};
|
|
50
|
+
describe("surfaceCompactionArtifacts (issue #4 — meta-turn auto-surface)", () => {
|
|
51
|
+
beforeEach(() => {
|
|
52
|
+
vi.mocked(searchByText).mockReset();
|
|
53
|
+
vi.mocked(searchByText).mockResolvedValue([]);
|
|
54
|
+
});
|
|
55
|
+
test("auto-surfaces [artifact] + checkpoint refs (and the rehydrate instruction) into enriched", async () => {
|
|
56
|
+
// biome-ignore lint/suspicious/noExplicitAny: test fixture shape mirrors EEPoint
|
|
57
|
+
vi.mocked(searchByText).mockResolvedValue([artifactPoint, checkpointPoint]);
|
|
58
|
+
const ctx = makeCtx();
|
|
59
|
+
const out = await surfaceCompactionArtifacts(ctx);
|
|
60
|
+
expect(out.enriched).toContain("[artifact]"); // artifact-typed line
|
|
61
|
+
expect(out.enriched).toContain("ee.query tool"); // how to rehydrate the full output
|
|
62
|
+
expect(out.enriched).toContain("call_7"); // the concrete tool-artifact id the agent can fetch
|
|
63
|
+
const layer = out.layers.find((l) => l.name === "ee-meta-artifacts");
|
|
64
|
+
expect(layer?.applied).toBe(true);
|
|
65
|
+
expect(layer?.delta).toContain("artifacts=2");
|
|
66
|
+
// Searches only the behavioral collection (where tool-artifacts are persisted).
|
|
67
|
+
expect(vi.mocked(searchByText)).toHaveBeenCalledWith(expect.stringContaining("tool-artifact"), ["experience-behavioral"], expect.any(Number), expect.any(Object));
|
|
68
|
+
});
|
|
69
|
+
test("no sessionId → unchanged, no EE call (no prior compaction to rehydrate)", async () => {
|
|
70
|
+
const ctx = makeCtx({ sessionId: undefined });
|
|
71
|
+
const out = await surfaceCompactionArtifacts(ctx);
|
|
72
|
+
expect(out.enriched).toBe(ctx.enriched);
|
|
73
|
+
expect(out.layers.find((l) => l.name === "ee-meta-artifacts")?.delta).toBe("no-session");
|
|
74
|
+
expect(vi.mocked(searchByText)).not.toHaveBeenCalled();
|
|
75
|
+
});
|
|
76
|
+
test("search failure is fail-open + recorded (delta=error=…, enriched unchanged)", async () => {
|
|
77
|
+
vi.mocked(searchByText).mockRejectedValue(new Error("EE down"));
|
|
78
|
+
const ctx = makeCtx();
|
|
79
|
+
const out = await surfaceCompactionArtifacts(ctx);
|
|
80
|
+
expect(out.enriched).toBe(ctx.enriched);
|
|
81
|
+
expect(out.layers.find((l) => l.name === "ee-meta-artifacts")?.delta).toMatch(/^error=/);
|
|
82
|
+
});
|
|
83
|
+
test("generic behavioral hits are filtered out (not mislabelled as artifacts)", async () => {
|
|
84
|
+
// biome-ignore lint/suspicious/noExplicitAny: test fixture shape mirrors EEPoint
|
|
85
|
+
vi.mocked(searchByText).mockResolvedValue([genericPoint]);
|
|
86
|
+
const ctx = makeCtx();
|
|
87
|
+
const out = await surfaceCompactionArtifacts(ctx);
|
|
88
|
+
expect(out.enriched).toBe(ctx.enriched);
|
|
89
|
+
expect(out.layers.find((l) => l.name === "ee-meta-artifacts")?.delta).toBe("no-artifacts");
|
|
90
|
+
});
|
|
91
|
+
test("defers to layer3 — skips with NO EE call when a checkpoint block is already present", async () => {
|
|
92
|
+
// layer3 ran first this turn and injected a checkpoint block (its marker is
|
|
93
|
+
// in enriched). The complement must not duplicate it or pay a 2nd round-trip.
|
|
94
|
+
const enriched = `${makeCtx().raw}\n[task checkpoints …]\n<!-- ee-checkpoint-injected:0123456789abcdef -->`;
|
|
95
|
+
const out = await surfaceCompactionArtifacts(makeCtx({ enriched }));
|
|
96
|
+
expect(out.layers.find((l) => l.name === "ee-meta-artifacts")?.delta).toBe("already-surfaced");
|
|
97
|
+
expect(out.enriched).toBe(enriched); // unchanged
|
|
98
|
+
expect(vi.mocked(searchByText)).not.toHaveBeenCalled();
|
|
99
|
+
});
|
|
100
|
+
test("idempotent — a second pass on its own output defers (marker it wrote is seen)", async () => {
|
|
101
|
+
// biome-ignore lint/suspicious/noExplicitAny: test fixture shape mirrors EEPoint
|
|
102
|
+
vi.mocked(searchByText).mockResolvedValue([artifactPoint]);
|
|
103
|
+
const first = await surfaceCompactionArtifacts(makeCtx());
|
|
104
|
+
expect(first.enriched).toContain("[artifact]");
|
|
105
|
+
expect(vi.mocked(searchByText)).toHaveBeenCalledTimes(1);
|
|
106
|
+
const second = await surfaceCompactionArtifacts(makeCtx({ enriched: first.enriched }));
|
|
107
|
+
expect(second.layers.find((l) => l.name === "ee-meta-artifacts")?.delta).toBe("already-surfaced");
|
|
108
|
+
expect(second.enriched).toBe(first.enriched); // not grown a second time
|
|
109
|
+
expect(vi.mocked(searchByText)).toHaveBeenCalledTimes(1); // no second round-trip
|
|
110
|
+
});
|
|
111
|
+
});
|
|
112
|
+
//# sourceMappingURL=surface-compaction-artifacts.test.js.map
|
|
@@ -37,7 +37,7 @@
|
|
|
37
37
|
* one imperative line targeting that phase's most damaging failure mode. Kept
|
|
38
38
|
* tight (primacy matters more than detail; tokens are the cost).
|
|
39
39
|
*/
|
|
40
|
-
export declare const AGENT_OPERATING_CONTRACT = "[AGENT OPERATING CONTRACT \u2014 read first; applies to every step]\n\n1. BEFORE ACTING: do only what was asked. Never assume scope or facts \u2014 if ambiguous, ask or use defaults; never invent requirements.\n2. READING: base statements on what you read/ran THIS turn. Do not infer contents of files you did not open.\n3. EXECUTING: smallest correct change; never widen scope or mask failures (no `|| true`, skipped tests, or swallowed catch).\n4. WHEN UNSURE: verify and cross-check BEFORE concluding. Bugs need a reproduction; reading code is not proof.\n5. REPORTING: answer ONLY what was asked. Every fact or file:line MUST come from this turn; else label \"unverified\"; do not guess. Synthesize evidence gracefully \u2014 do NOT dump massive verbatim tool outputs into the final answer. Cite concise file:line references. Never claim a build/test ran, or describe edits, you did not actually do this turn; if a check can't run, fix it or say so \u2014 don't imply success.\n\n6. LANGUAGE: Reply in user's detected language for final output. Internal reasoning, tools, and code remain in English.\n\n7. ANTI-M\u00D9 / COMPACTION: After seeing \"[pre-compaction warning at step...\" or \"[context compacted at step...\", decide if you need full prior tool results. Emit PRESERVE_FULL_CONTEXT for full veto this turn, or the lighter KEEP_TOOL_IDS: id1,id2 (ids from prior stub \"(id=...)\") to protect only high-value results (read_file/grep on src/PLAN/error etc are auto-protected). Use the ee_query tool with \"tool-artifact id=XXX\" for on-demand full re-hydrate of elided ones. Self-check \"task finished?\" / \"compacted yet?\". Use EE checkpoints.\n\n[END CONTRACT \u2014 instructions follow]";
|
|
40
|
+
export declare const AGENT_OPERATING_CONTRACT = "[AGENT OPERATING CONTRACT \u2014 read first; applies to every step]\n\n1. BEFORE ACTING: do only what was asked. Never assume scope or facts \u2014 if ambiguous, ask or use defaults; never invent requirements.\n2. READING: base statements on what you read/ran THIS turn. Do not infer contents of files you did not open.\n3. EXECUTING: smallest correct change; never widen scope or mask failures (no `|| true`, skipped tests, or swallowed catch).\n4. WHEN UNSURE: verify and cross-check BEFORE concluding. Bugs need a reproduction; reading code is not proof.\n5. REPORTING: answer ONLY what was asked. Every fact or file:line MUST come from this turn; else label \"unverified\"; do not guess. Synthesize evidence gracefully \u2014 do NOT dump massive verbatim tool outputs into the final answer. Cite concise file:line references. Never claim a build/test ran, or describe edits, you did not actually do this turn; if a check can't run, fix it or say so \u2014 don't imply success.\n\n6. LANGUAGE: Reply in user's detected language for final output. Internal reasoning, tools, and code remain in English.\n\n7. ANTI-M\u00D9 / COMPACTION: After seeing \"[pre-compaction warning at step...\" or \"[context compacted at step...\", decide if you need full prior tool results. Emit PRESERVE_FULL_CONTEXT for full veto this turn, or the lighter KEEP_TOOL_IDS: id1,id2 (ids from prior stub \"(id=...)\") to protect only high-value results (read_file/grep on src/PLAN/error etc are auto-protected). Use the ee_query tool with \"tool-artifact id=XXX\" for on-demand full re-hydrate of elided ones. Self-check \"task finished?\" / \"compacted yet?\". Use EE checkpoints.\n\n8. GIT SAFETY: never push on red \u2014 run the check, await its result in a SEPARATE step, confirm 0 failures, then push. Never `git add -A`/`commit -a`; stage explicitly so secrets (.env, .muonroi-cli/, keys) aren't committed. Never `--no-verify`.\n\n[END CONTRACT \u2014 instructions follow]";
|
|
41
41
|
export interface ContractSectionOptions {
|
|
42
42
|
/** Chitchat turns carry no tools and make no factual claims — skip the contract. */
|
|
43
43
|
chitchat?: boolean;
|
|
@@ -49,6 +49,8 @@ export const AGENT_OPERATING_CONTRACT = `[AGENT OPERATING CONTRACT — read firs
|
|
|
49
49
|
|
|
50
50
|
7. ANTI-MÙ / COMPACTION: After seeing "[pre-compaction warning at step..." or "[context compacted at step...", decide if you need full prior tool results. Emit PRESERVE_FULL_CONTEXT for full veto this turn, or the lighter KEEP_TOOL_IDS: id1,id2 (ids from prior stub "(id=...)") to protect only high-value results (read_file/grep on src/PLAN/error etc are auto-protected). Use the ee_query tool with "tool-artifact id=XXX" for on-demand full re-hydrate of elided ones. Self-check "task finished?" / "compacted yet?". Use EE checkpoints.
|
|
51
51
|
|
|
52
|
+
8. GIT SAFETY: never push on red — run the check, await its result in a SEPARATE step, confirm 0 failures, then push. Never \`git add -A\`/\`commit -a\`; stage explicitly so secrets (.env, .muonroi-cli/, keys) aren't committed. Never \`--no-verify\`.
|
|
53
|
+
|
|
52
54
|
[END CONTRACT — instructions follow]`;
|
|
53
55
|
/**
|
|
54
56
|
* Build the contract block for insertion at the front of the system prompt.
|
|
@@ -42,8 +42,13 @@ describe("AGENT_OPERATING_CONTRACT", () => {
|
|
|
42
42
|
expect(AGENT_OPERATING_CONTRACT).toMatch(/AGENT OPERATING CONTRACT/i);
|
|
43
43
|
expect(AGENT_OPERATING_CONTRACT).toMatch(/END CONTRACT/i);
|
|
44
44
|
});
|
|
45
|
-
it("
|
|
46
|
-
expect(AGENT_OPERATING_CONTRACT
|
|
45
|
+
it("carries the git-safety rule (never push on red; no broad git add of secrets)", () => {
|
|
46
|
+
expect(AGENT_OPERATING_CONTRACT).toMatch(/GIT SAFETY/i);
|
|
47
|
+
expect(AGENT_OPERATING_CONTRACT).toMatch(/push on red|never push/i);
|
|
48
|
+
expect(AGENT_OPERATING_CONTRACT).toMatch(/git add -A|stage explicitly/i);
|
|
49
|
+
});
|
|
50
|
+
it("stays compact (under 1900 chars) to preserve attention budget on every turn (git-safety rule added)", () => {
|
|
51
|
+
expect(AGENT_OPERATING_CONTRACT.length).toBeLessThan(1900);
|
|
47
52
|
});
|
|
48
53
|
});
|
|
49
54
|
describe("buildContractSection", () => {
|
|
@@ -25,41 +25,41 @@
|
|
|
25
25
|
* Wrapped with the `[CRITICAL TOOL-USE RULES ...]` marker so the model knows
|
|
26
26
|
* to treat these as overrides to anything that follows.
|
|
27
27
|
*/
|
|
28
|
-
export const CHEAP_MODEL_PLAYBOOK = `[CRITICAL TOOL-USE RULES — read before invoking any tool; these override defaults that follow]
|
|
29
|
-
|
|
30
|
-
1. Bash output is AUTOMATICALLY cached. Every \`bash\` call returns a \`run_id\`
|
|
31
|
-
(e.g. \`bash-1\`) you can re-query via \`bash_output_get(run_id, mode=tail|head|grep|lines)\`.
|
|
32
|
-
- When you want only the last N lines: do NOT pipe \`| tail -N\`. Run the
|
|
33
|
-
bare command, then call \`bash_output_get(run_id, mode=tail, lines=N)\`.
|
|
34
|
-
- Same for \`| head\`, \`| grep PATTERN\`, \`> file\`. Pipes/redirects HIDE
|
|
35
|
-
the full output from the cache; \`bash_output_get\` reads from the cache
|
|
36
|
-
without re-running.
|
|
37
|
-
- This applies to EVERY bash call, not just retries.
|
|
38
|
-
- To VIEW a file use \`read_file\` (start_line/end_line) — never sed/cat a
|
|
39
|
-
file. \`bash_output_get\` is for COMMAND output, not files.
|
|
40
|
-
|
|
41
|
-
2. Before reading more than 3 files to understand a topic, delegate to
|
|
42
|
-
\`task(agent="explore")\`. The sub-agent returns a compressed summary;
|
|
43
|
-
you save reading tokens.
|
|
44
|
-
|
|
45
|
-
3. Use the \`grep\` tool (ripgrep) for content search — NOT \`bash\` with
|
|
46
|
-
\`grep\` / \`find\` piped.
|
|
47
|
-
|
|
48
|
-
4. When a tool returns \`ERROR: ...\`, do NOT retry the identical call.
|
|
49
|
-
Pick a different tool, change inputs meaningfully, or stop and report.
|
|
50
|
-
|
|
51
|
-
5. Fix the ROOT CAUSE, never mask a failure to make it "pass"
|
|
52
|
-
(\`continue-on-error\`, swallowed try/catch, skipped/deleted test, \`|| true\`).
|
|
53
|
-
If a step fails from a missing secret/config, make it CONDITIONAL (skip when
|
|
54
|
-
absent) so it still runs when present — do NOT blanket-ignore it.
|
|
55
|
-
|
|
56
|
-
6. For a build / CI / test failure, read the ACTUAL failure log or stack trace
|
|
57
|
-
BEFORE hypothesizing — fix the real error, not a guess from source alone.
|
|
58
|
-
|
|
59
|
-
7. ANTI-MÙ / COMPACTION (for long sessions): On pre-warn or "[context compacted at step...", emit PRESERVE_FULL_CONTEXT (full veto) or lighter KEEP_TOOL_IDS: id1,id2 (from stub id=) to protect specific high-value results. read_file/grep/lsp/bash on src/PLAN/error are auto-kept (idea 1). Use ee.query tool with "tool-artifact id=XXX" for on-demand full. Self-check "task finished?" / "compacted yet?". Use EE checkpoints.
|
|
60
|
-
|
|
61
|
-
[END CRITICAL TOOL-USE RULES — your regular instructions begin below]
|
|
62
|
-
|
|
28
|
+
export const CHEAP_MODEL_PLAYBOOK = `[CRITICAL TOOL-USE RULES — read before invoking any tool; these override defaults that follow]
|
|
29
|
+
|
|
30
|
+
1. Bash output is AUTOMATICALLY cached. Every \`bash\` call returns a \`run_id\`
|
|
31
|
+
(e.g. \`bash-1\`) you can re-query via \`bash_output_get(run_id, mode=tail|head|grep|lines)\`.
|
|
32
|
+
- When you want only the last N lines: do NOT pipe \`| tail -N\`. Run the
|
|
33
|
+
bare command, then call \`bash_output_get(run_id, mode=tail, lines=N)\`.
|
|
34
|
+
- Same for \`| head\`, \`| grep PATTERN\`, \`> file\`. Pipes/redirects HIDE
|
|
35
|
+
the full output from the cache; \`bash_output_get\` reads from the cache
|
|
36
|
+
without re-running.
|
|
37
|
+
- This applies to EVERY bash call, not just retries.
|
|
38
|
+
- To VIEW a file use \`read_file\` (start_line/end_line) — never sed/cat a
|
|
39
|
+
file. \`bash_output_get\` is for COMMAND output, not files.
|
|
40
|
+
|
|
41
|
+
2. Before reading more than 3 files to understand a topic, delegate to
|
|
42
|
+
\`task(agent="explore")\`. The sub-agent returns a compressed summary;
|
|
43
|
+
you save reading tokens.
|
|
44
|
+
|
|
45
|
+
3. Use the \`grep\` tool (ripgrep) for content search — NOT \`bash\` with
|
|
46
|
+
\`grep\` / \`find\` piped.
|
|
47
|
+
|
|
48
|
+
4. When a tool returns \`ERROR: ...\`, do NOT retry the identical call.
|
|
49
|
+
Pick a different tool, change inputs meaningfully, or stop and report.
|
|
50
|
+
|
|
51
|
+
5. Fix the ROOT CAUSE, never mask a failure to make it "pass"
|
|
52
|
+
(\`continue-on-error\`, swallowed try/catch, skipped/deleted test, \`|| true\`).
|
|
53
|
+
If a step fails from a missing secret/config, make it CONDITIONAL (skip when
|
|
54
|
+
absent) so it still runs when present — do NOT blanket-ignore it.
|
|
55
|
+
|
|
56
|
+
6. For a build / CI / test failure, read the ACTUAL failure log or stack trace
|
|
57
|
+
BEFORE hypothesizing — fix the real error, not a guess from source alone.
|
|
58
|
+
|
|
59
|
+
7. ANTI-MÙ / COMPACTION (for long sessions): On pre-warn or "[context compacted at step...", emit PRESERVE_FULL_CONTEXT (full veto) or lighter KEEP_TOOL_IDS: id1,id2 (from stub id=) to protect specific high-value results. read_file/grep/lsp/bash on src/PLAN/error are auto-kept (idea 1). Use ee.query tool with "tool-artifact id=XXX" for on-demand full. Self-check "task finished?" / "compacted yet?". Use EE checkpoints.
|
|
60
|
+
|
|
61
|
+
[END CRITICAL TOOL-USE RULES — your regular instructions begin below]
|
|
62
|
+
|
|
63
63
|
`;
|
|
64
64
|
/**
|
|
65
65
|
* Predicate gating playbook injection.
|
|
@@ -23,19 +23,19 @@
|
|
|
23
23
|
* Universal anti-ramble convergence block — applies to every task type.
|
|
24
24
|
* Kept tight; the per-task addendum below specialises it.
|
|
25
25
|
*/
|
|
26
|
-
export const CHEAP_MODEL_CONVERGENCE = `[CONVERGENCE — minimise tool calls; the system prompt + tools are re-sent every call, so each extra step is expensive]
|
|
27
|
-
|
|
28
|
-
- Plan the FEWEST reads you need, then read the specific file/section directly.
|
|
29
|
-
Do NOT broad-grep, re-read a file you already read, or explore "just in case".
|
|
30
|
-
- The moment you have enough to act, STOP investigating and make the change.
|
|
31
|
-
- Make the SMALLEST correct change for the request; do not widen scope.
|
|
32
|
-
- Finish the action before you answer — never stop mid-step (e.g. "I'm verifying…").
|
|
33
|
-
When done, state completion in ONE line (what changed + that it's verified);
|
|
34
|
-
no recap, no next-steps padding.
|
|
35
|
-
- GROUND every claim in what you actually read or ran THIS turn: cite real
|
|
36
|
-
file:line, and never invent counts, line numbers, names, or bugs. If a number
|
|
37
|
-
(test/file count) is not verified by a command you ran, run the check or mark
|
|
38
|
-
it "unverified" — do NOT guess a value or assert a finding you did not observe.
|
|
26
|
+
export const CHEAP_MODEL_CONVERGENCE = `[CONVERGENCE — minimise tool calls; the system prompt + tools are re-sent every call, so each extra step is expensive]
|
|
27
|
+
|
|
28
|
+
- Plan the FEWEST reads you need, then read the specific file/section directly.
|
|
29
|
+
Do NOT broad-grep, re-read a file you already read, or explore "just in case".
|
|
30
|
+
- The moment you have enough to act, STOP investigating and make the change.
|
|
31
|
+
- Make the SMALLEST correct change for the request; do not widen scope.
|
|
32
|
+
- Finish the action before you answer — never stop mid-step (e.g. "I'm verifying…").
|
|
33
|
+
When done, state completion in ONE line (what changed + that it's verified);
|
|
34
|
+
no recap, no next-steps padding.
|
|
35
|
+
- GROUND every claim in what you actually read or ran THIS turn: cite real
|
|
36
|
+
file:line, and never invent counts, line numbers, names, or bugs. If a number
|
|
37
|
+
(test/file count) is not verified by a command you ran, run the check or mark
|
|
38
|
+
it "unverified" — do NOT guess a value or assert a finding you did not observe.
|
|
39
39
|
- ANTI-MÙ: After compaction note or pre-warn, emit PRESERVE_FULL_CONTEXT (full) or KEEP_TOOL_IDS: id1,id2 to protect high-value (auto for read_file/grep on src/PLAN/error). Use the ee_query tool with "tool-artifact id=XXX" for on-demand full re-hydrate. Recall checkpoints. `;
|
|
40
40
|
/**
|
|
41
41
|
* Per-task-type addenda. Each is 1–2 tight lines targeting that type's most
|
|
@@ -47,6 +47,9 @@ const TASK_WORKBOOKS = {
|
|
|
47
47
|
"Never mask a failure to make it pass (no continue-on-error, swallowed catch, skipped test, `|| true`).",
|
|
48
48
|
generate: "GENERATE: confirm the target file + the surrounding pattern, write the new code to match it, then stop. " +
|
|
49
49
|
"Do not scaffold extras or restructure unrelated code.",
|
|
50
|
+
build: "BUILD: scaffold the MINIMUM runnable project/feature that satisfies the request, matching the chosen stack's " +
|
|
51
|
+
"conventions. Wire it end-to-end and verify it builds/runs before stopping. Do NOT add speculative features, " +
|
|
52
|
+
"extra files, or config the request did not ask for.",
|
|
50
53
|
refactor: "REFACTOR: change only what was named (rename/extract/move). Preserve behaviour; add nothing new.",
|
|
51
54
|
analyze: "ANALYZE: answer from what you have already read — do not read the whole codebase. Bullet findings, no narrative. " +
|
|
52
55
|
"For a repo/code review, base findings on the ACTUAL code you inspect (file sizes, structure, key modules), not just AGENTS.md/CLAUDE.md docs.",
|