muonroi-cli 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/LICENSE +21 -21
  2. package/README.md +122 -122
  3. package/dist/packages/agent-harness-core/src/predicate.d.ts +1 -1
  4. package/dist/src/agent-harness/__tests__/mock-model.spec.js +48 -1
  5. package/dist/src/agent-harness/mock-model.d.ts +11 -0
  6. package/dist/src/agent-harness/mock-model.js +21 -0
  7. package/dist/src/cli/cost-forensics.js +12 -12
  8. package/dist/src/council/__tests__/clarification-prompt.test.js +51 -0
  9. package/dist/src/council/__tests__/clarifier-ready-gate.test.js +32 -0
  10. package/dist/src/council/__tests__/decisions-lock.test.js +17 -1
  11. package/dist/src/council/__tests__/oauth-reachable.test.d.ts +1 -0
  12. package/dist/src/council/__tests__/oauth-reachable.test.js +31 -0
  13. package/dist/src/council/__tests__/parse-outcome-fallback.test.js +11 -0
  14. package/dist/src/council/clarifier.js +9 -1
  15. package/dist/src/council/debate.js +5 -1
  16. package/dist/src/council/decisions-lock.js +3 -3
  17. package/dist/src/council/index.js +12 -5
  18. package/dist/src/council/leader.d.ts +0 -17
  19. package/dist/src/council/leader.js +22 -15
  20. package/dist/src/council/planner.js +1 -1
  21. package/dist/src/council/prompts.js +63 -57
  22. package/dist/src/council/types.d.ts +7 -0
  23. package/dist/src/ee/__tests__/ee-onboarding.test.d.ts +1 -0
  24. package/dist/src/ee/__tests__/ee-onboarding.test.js +32 -0
  25. package/dist/src/ee/auth.d.ts +9 -0
  26. package/dist/src/ee/auth.js +19 -0
  27. package/dist/src/ee/ee-onboarding.d.ts +5 -0
  28. package/dist/src/ee/ee-onboarding.js +76 -0
  29. package/dist/src/generated/version.d.ts +1 -1
  30. package/dist/src/generated/version.js +1 -1
  31. package/dist/src/headless/output.js +6 -4
  32. package/dist/src/headless/output.test.js +4 -3
  33. package/dist/src/index.js +20 -1
  34. package/dist/src/mcp/__tests__/auto-setup.test.js +74 -0
  35. package/dist/src/mcp/__tests__/client-pool.spec.d.ts +1 -0
  36. package/dist/src/mcp/__tests__/client-pool.spec.js +98 -0
  37. package/dist/src/mcp/__tests__/parallel-build.spec.d.ts +1 -0
  38. package/dist/src/mcp/__tests__/parallel-build.spec.js +67 -0
  39. package/dist/src/mcp/__tests__/smart-filter.test.js +56 -0
  40. package/dist/src/mcp/auto-setup.js +56 -2
  41. package/dist/src/mcp/client-pool.d.ts +46 -0
  42. package/dist/src/mcp/client-pool.js +212 -0
  43. package/dist/src/mcp/oauth-callback.js +2 -2
  44. package/dist/src/mcp/parse-headers.test.js +14 -14
  45. package/dist/src/mcp/runtime.d.ts +28 -0
  46. package/dist/src/mcp/runtime.js +117 -51
  47. package/dist/src/mcp/self-verify-runner.d.ts +14 -0
  48. package/dist/src/mcp/self-verify-runner.js +38 -0
  49. package/dist/src/mcp/setup-guide-text.d.ts +9 -0
  50. package/dist/src/mcp/setup-guide-text.js +84 -0
  51. package/dist/src/mcp/smart-filter.js +49 -0
  52. package/dist/src/mcp/smoke.test.js +43 -43
  53. package/dist/src/mcp/tools-server.d.ts +7 -0
  54. package/dist/src/mcp/tools-server.js +19 -22
  55. package/dist/src/models/catalog.json +349 -349
  56. package/dist/src/ops/__tests__/doctor-ee-health.test.js +21 -0
  57. package/dist/src/ops/doctor.d.ts +3 -2
  58. package/dist/src/ops/doctor.js +47 -11
  59. package/dist/src/ops/doctor.test.js +4 -3
  60. package/dist/src/orchestrator/__tests__/mcp-capability-block.test.d.ts +1 -0
  61. package/dist/src/orchestrator/__tests__/mcp-capability-block.test.js +39 -0
  62. package/dist/src/orchestrator/__tests__/project-stack.test.d.ts +1 -0
  63. package/dist/src/orchestrator/__tests__/project-stack.test.js +65 -0
  64. package/dist/src/orchestrator/batch-turn-runner.js +7 -11
  65. package/dist/src/orchestrator/message-processor.js +57 -27
  66. package/dist/src/orchestrator/orchestrator.js +26 -0
  67. package/dist/src/orchestrator/prompts.d.ts +51 -0
  68. package/dist/src/orchestrator/prompts.js +257 -134
  69. package/dist/src/orchestrator/scope-ceiling.js +6 -1
  70. package/dist/src/orchestrator/stream-runner.js +20 -15
  71. package/dist/src/orchestrator/text-tool-call-detector.test.js +13 -13
  72. package/dist/src/pil/__tests__/clarity-gate.test.js +24 -215
  73. package/dist/src/pil/__tests__/config.test.js +1 -17
  74. package/dist/src/pil/__tests__/discovery.test.js +144 -11
  75. package/dist/src/pil/__tests__/layer1-intent-trace.test.js +7 -2
  76. package/dist/src/pil/__tests__/layer1-intent.test.js +3 -0
  77. package/dist/src/pil/__tests__/layer16-clarity.test.js +32 -116
  78. package/dist/src/pil/__tests__/layer4-gsd.test.js +37 -0
  79. package/dist/src/pil/__tests__/layer6-output.test.js +137 -18
  80. package/dist/src/pil/__tests__/llm-classify.test.js +49 -2
  81. package/dist/src/pil/agent-operating-contract.d.ts +1 -1
  82. package/dist/src/pil/agent-operating-contract.js +2 -0
  83. package/dist/src/pil/agent-operating-contract.test.js +7 -2
  84. package/dist/src/pil/cheap-model-playbook.js +35 -35
  85. package/dist/src/pil/cheap-model-workbooks.js +16 -13
  86. package/dist/src/pil/clarity-gate.d.ts +21 -19
  87. package/dist/src/pil/clarity-gate.js +26 -153
  88. package/dist/src/pil/config.d.ts +9 -1
  89. package/dist/src/pil/config.js +15 -4
  90. package/dist/src/pil/discovery.js +211 -136
  91. package/dist/src/pil/layer1-intent.d.ts +12 -0
  92. package/dist/src/pil/layer1-intent.js +283 -38
  93. package/dist/src/pil/layer1-intent.test.js +210 -4
  94. package/dist/src/pil/layer16-clarity.d.ts +25 -11
  95. package/dist/src/pil/layer16-clarity.js +19 -306
  96. package/dist/src/pil/layer4-gsd.js +18 -6
  97. package/dist/src/pil/layer6-output.d.ts +2 -0
  98. package/dist/src/pil/layer6-output.js +137 -22
  99. package/dist/src/pil/llm-classify.d.ts +26 -0
  100. package/dist/src/pil/llm-classify.js +34 -5
  101. package/dist/src/pil/native-capabilities-workbook.d.ts +1 -1
  102. package/dist/src/pil/native-capabilities-workbook.js +82 -76
  103. package/dist/src/pil/schema.d.ts +8 -0
  104. package/dist/src/pil/schema.js +12 -1
  105. package/dist/src/pil/task-tier-map.js +4 -0
  106. package/dist/src/pil/types.d.ts +11 -1
  107. package/dist/src/product-loop/done-gate.js +3 -3
  108. package/dist/src/product-loop/loop-driver.js +18 -18
  109. package/dist/src/product-loop/progress-snapshot.js +4 -4
  110. package/dist/src/providers/auth/gemini-oauth.js +6 -15
  111. package/dist/src/providers/auth/grok-oauth.js +6 -15
  112. package/dist/src/providers/auth/openai-oauth.js +6 -15
  113. package/dist/src/providers/mcp-vision-bridge.js +48 -48
  114. package/dist/src/reporter/index.js +1 -1
  115. package/dist/src/scaffold/bb-ecosystem-apply.js +47 -47
  116. package/dist/src/scaffold/bb-quality-gate.js +5 -5
  117. package/dist/src/scaffold/continuation-prompt.js +60 -60
  118. package/dist/src/scaffold/init-new.js +453 -453
  119. package/dist/src/self-qa/__tests__/scenario-planner.test.js +3 -3
  120. package/dist/src/self-qa/agentic-loop.js +24 -19
  121. package/dist/src/self-qa/spec-emitter.js +26 -23
  122. package/dist/src/storage/__tests__/migrations.test.js +2 -2
  123. package/dist/src/storage/interaction-log.js +5 -5
  124. package/dist/src/storage/migrations.js +122 -122
  125. package/dist/src/storage/sessions.js +42 -42
  126. package/dist/src/storage/transcript.js +91 -84
  127. package/dist/src/storage/usage.js +14 -14
  128. package/dist/src/storage/workspaces.js +12 -12
  129. package/dist/src/tools/__tests__/native-tools.test.d.ts +1 -0
  130. package/dist/src/tools/__tests__/native-tools.test.js +53 -0
  131. package/dist/src/tools/git-safety.d.ts +61 -0
  132. package/dist/src/tools/git-safety.js +141 -0
  133. package/dist/src/tools/git-safety.test.d.ts +1 -0
  134. package/dist/src/tools/git-safety.test.js +111 -0
  135. package/dist/src/tools/native-tools.d.ts +31 -0
  136. package/dist/src/tools/native-tools.js +273 -0
  137. package/dist/src/tools/registry-git-safety.test.d.ts +7 -0
  138. package/dist/src/tools/registry-git-safety.test.js +92 -0
  139. package/dist/src/tools/registry.js +39 -4
  140. package/dist/src/ui/__tests__/markdown-render.test.d.ts +1 -0
  141. package/dist/src/ui/__tests__/markdown-render.test.js +48 -0
  142. package/dist/src/ui/app.js +0 -0
  143. package/dist/src/ui/components/message-view.js +4 -1
  144. package/dist/src/ui/components/structured-response-view.js +7 -3
  145. package/dist/src/ui/components/tool-group.js +7 -1
  146. package/dist/src/ui/markdown-render.d.ts +41 -0
  147. package/dist/src/ui/markdown-render.js +223 -0
  148. package/dist/src/ui/markdown.d.ts +10 -0
  149. package/dist/src/ui/markdown.js +12 -35
  150. package/dist/src/ui/slash/council-inspect.js +4 -4
  151. package/dist/src/ui/slash/export.js +4 -4
  152. package/dist/src/ui/utils/text.d.ts +8 -0
  153. package/dist/src/ui/utils/text.js +16 -0
  154. package/dist/src/ui/utils/text.test.d.ts +1 -0
  155. package/dist/src/ui/utils/text.test.js +23 -0
  156. package/dist/src/usage/ledger.js +48 -15
  157. package/dist/src/utils/__tests__/footprint-gitignore.test.d.ts +1 -0
  158. package/dist/src/utils/__tests__/footprint-gitignore.test.js +50 -0
  159. package/dist/src/utils/clipboard-image.js +23 -23
  160. package/dist/src/utils/open-url.d.ts +56 -0
  161. package/dist/src/utils/open-url.js +58 -0
  162. package/dist/src/utils/open-url.test.d.ts +1 -0
  163. package/dist/src/utils/open-url.test.js +86 -0
  164. package/dist/src/utils/settings.d.ts +12 -0
  165. package/dist/src/utils/settings.js +48 -0
  166. package/dist/src/utils/side-question.js +2 -2
  167. package/dist/src/utils/skills.js +3 -3
  168. package/dist/src/verify/__tests__/coverage-parsers.test.js +30 -30
  169. package/dist/src/verify/environment.js +2 -1
  170. package/package.json +1 -1
  171. package/dist/src/pil/layer16-clarity.test.js +0 -31
  172. /package/dist/src/{pil/layer16-clarity.test.d.ts → council/__tests__/clarification-prompt.test.d.ts} +0 -0
@@ -1,5 +1,9 @@
1
1
  import { describe, expect, it } from "vitest";
2
- import { buildInterviewQuestion, detectClarityGaps, resolveGapsNonInteractive } from "../layer16-clarity.js";
2
+ import { buildInterviewQuestion, resolveGapsNonInteractive } from "../layer16-clarity.js";
3
+ // Phase 2 (2026-06-16): detectClarityGaps + its keyword option-builders were
4
+ // removed (the model now generates every clarification). The surviving helpers
5
+ // — buildInterviewQuestion (render) and resolveGapsNonInteractive (headless
6
+ // default-answer resolution) — are exercised here with model-shaped gaps.
3
7
  const EMPTY_PROJECT = {
4
8
  language: "typescript",
5
9
  framework: null,
@@ -10,122 +14,10 @@ const EMPTY_PROJECT = {
10
14
  { path: "src/billing/", name: "billing", entryFiles: [], exportedSymbols: [] },
11
15
  ],
12
16
  eePatterns: [],
13
- relevantModules: [],
17
+ relevantModules: [{ path: "src/auth/", relevance: "named in prompt", exists: true }],
14
18
  scannedAt: Date.now(),
15
19
  cwd: "/proj",
16
20
  };
17
- describe("detectClarityGaps()", () => {
18
- it("detects outcome gap for vague non-debug prompt", () => {
19
- // PIL-L6 fix — debug now joins the autofill set, so vague debug prompts
20
- // ("fix auth") no longer trigger an outcome question. Use a generate
21
- // prompt instead to still cover the gap-detection path.
22
- const gaps = detectClarityGaps("build something", "generate", 0.7, EMPTY_PROJECT);
23
- const outcomeGap = gaps.find((g) => g.dimension === "outcome");
24
- expect(outcomeGap).toBeDefined();
25
- });
26
- it("does NOT detect outcome gap for vague debug prompt (autofilled)", () => {
27
- const gaps = detectClarityGaps("fix auth", "debug", 0.7, EMPTY_PROJECT);
28
- const outcomeGap = gaps.find((g) => g.dimension === "outcome");
29
- expect(outcomeGap).toBeUndefined();
30
- });
31
- it("does NOT detect an outcome gap for a vague general prompt (B2 intent-swallow guard)", () => {
32
- // B2 — a `general` prompt's only outcome options are tautological
33
- // ("Task completed" / "Issue resolved"). Asking them lets the default
34
- // answer overwrite the user's real request, so the intent collapses to
35
- // "general: Task completed" and the original prompt is lost. Skip the
36
- // askcard so the outcome falls back to the raw request downstream.
37
- const gaps = detectClarityGaps("the project feels messy", "general", 0.7, EMPTY_PROJECT);
38
- const outcomeGap = gaps.find((g) => g.dimension === "outcome");
39
- expect(outcomeGap).toBeUndefined();
40
- });
41
- it("detects scope gap when no file reference", () => {
42
- const gaps = detectClarityGaps("fix auth", "debug", 0.7, EMPTY_PROJECT);
43
- const scopeGap = gaps.find((g) => g.dimension === "scope");
44
- expect(scopeGap).toBeDefined();
45
- });
46
- it("returns no gaps for specific prompt", () => {
47
- const gaps = detectClarityGaps("fix TypeError in src/auth/login.ts:42", "debug", 0.9, EMPTY_PROJECT);
48
- expect(gaps).toHaveLength(0);
49
- });
50
- it("scope options include matching bounded contexts", () => {
51
- const gaps = detectClarityGaps("fix auth", "debug", 0.7, EMPTY_PROJECT);
52
- const scopeGap = gaps.find((g) => g.dimension === "scope");
53
- expect(scopeGap?.options.some((o) => o.includes("auth"))).toBe(true);
54
- });
55
- it("does NOT detect a scope gap for a general prompt with no codebase signal (B2-symmetric scope guard)", () => {
56
- // Live drive (session 8a87aa060c6a): the pure non-codebase prompt "Reply
57
- // with exactly one word: PONG" fired the scope askcard "Which part of the
58
- // codebase should this target?" because countFileReferences /
59
- // hasExplicitScope / hasOperationalScope were all empty — the detector
60
- // assumes every prompt is a codebase task. A general/unclassified prompt
61
- // has no codebase dimension to scope, so the question is nonsensical (and
62
- // its acceptance card is downstream noise). Skip it, symmetric to the B2
63
- // outcome guard; scope falls back to project-root downstream.
64
- const gaps = detectClarityGaps("Reply with exactly one word: PONG", "general", 0.6, EMPTY_PROJECT);
65
- expect(gaps.find((g) => g.dimension === "scope")).toBeUndefined();
66
- // The only candidate gap was scope → general prompt now yields zero gaps,
67
- // so discovery never marks interviewed=true and shows no acceptance card.
68
- expect(gaps).toHaveLength(0);
69
- });
70
- it("STILL detects a scope gap for a classified (non-general) task with no file reference", () => {
71
- // Guard must stay narrow: a real code task that simply omitted a path still
72
- // benefits from the scope-narrowing askcard. Only general/null is skipped.
73
- const gaps = detectClarityGaps("implement the search feature", "generate", 0.7, EMPTY_PROJECT);
74
- expect(gaps.find((g) => g.dimension === "scope")).toBeDefined();
75
- });
76
- it("does NOT detect a scope gap for an image-analysis prompt (image is the scope)", () => {
77
- // Live drive (PR#34 probe): "Take a screenshot of the homepage and analyze
78
- // the diagram.png image to describe its layout" fired the codebase-scope
79
- // askcard "Which part of the codebase should this target?" — nonsensical for
80
- // an image-analysis task. The image (screenshot / diagram.png) IS the scope,
81
- // symmetric to how operational (CI/build) prompts are scoped to the pipeline.
82
- const gaps = detectClarityGaps("Take a screenshot of the homepage and analyze the diagram.png image to describe its layout", "analyze", 0.7, EMPTY_PROJECT);
83
- expect(gaps.find((g) => g.dimension === "scope")).toBeUndefined();
84
- // analyze autofills outcome, so with scope suppressed there are zero gaps →
85
- // no interview, no acceptance card.
86
- expect(gaps).toHaveLength(0);
87
- });
88
- it("STILL detects a scope gap for a code task that mentions an ambiguous non-image word", () => {
89
- // Narrowness guard: image-scope suppression must not swallow real codebase
90
- // tasks. "add a logo to the header" carries no concrete image signal (no
91
- // file extension / screenshot / photo), so the scope askcard stays.
92
- const gaps = detectClarityGaps("add a logo to the header", "generate", 0.7, EMPTY_PROJECT);
93
- expect(gaps.find((g) => g.dimension === "scope")).toBeDefined();
94
- });
95
- it("does NOT detect a scope gap for a web-search / external-info prompt", () => {
96
- // Live drive (tavily probe, session d7a45a2dba30): "search the web for the
97
- // latest vitest release notes" classified taskType=analyze fired the
98
- // codebase-scope askcard and recorded a wrong scope ("src/mcp"). A
99
- // web-search task is scoped to the web, not the codebase — symmetric to the
100
- // image-scope and operational-scope guards.
101
- const gaps = detectClarityGaps("search the web for the latest vitest release notes", "analyze", 0.7, EMPTY_PROJECT);
102
- expect(gaps.find((g) => g.dimension === "scope")).toBeUndefined();
103
- expect(gaps).toHaveLength(0);
104
- });
105
- it("does NOT detect a scope gap for a self-contained computation prompt (data is inline)", () => {
106
- // Live drive (deepseek-vs-grok A/B, session 17fc23f0): "Compute f([3,1,2])
107
- // where f sorts the list ascending then returns the sum of the first two
108
- // elements." classified taskType=analyze (regex:read matched the bare word
109
- // "list", conf 0.80 → skipped the brain) fired BOTH the pil-interview scope
110
- // askcard ("Which part of the codebase should this target?" → auto "Entire
111
- // project") AND the pil-acceptance card. The operand [3,1,2] is supplied
112
- // inline — the task has no codebase dimension to scope. Symmetric to the
113
- // image / web / operational scope guards.
114
- const gaps = detectClarityGaps("Compute f([3,1,2]) where f sorts the list ascending then returns the sum of the first two elements.", "analyze", 0.8, EMPTY_PROJECT);
115
- expect(gaps.find((g) => g.dimension === "scope")).toBeUndefined();
116
- // analyze autofills outcome, so with scope suppressed there are zero gaps →
117
- // no interview, no acceptance card.
118
- expect(gaps).toHaveLength(0);
119
- });
120
- it("STILL detects a scope gap for a code task that embeds a literal but no compute framing", () => {
121
- // Narrowness guard: the inline-literal suppression must not swallow real
122
- // codebase tasks. "set the default retry delays to [100, 200, 400] in the
123
- // config" carries a literal but is scoped to the codebase (no compute verb),
124
- // so the scope askcard stays.
125
- const gaps = detectClarityGaps("set the default retry delays to [100, 200, 400] in the config", "generate", 0.7, EMPTY_PROJECT);
126
- expect(gaps.find((g) => g.dimension === "scope")).toBeDefined();
127
- });
128
- });
129
21
  describe("buildInterviewQuestion()", () => {
130
22
  it("builds a CouncilQuestionData with pil-interview phase", () => {
131
23
  const gap = {
@@ -141,11 +33,35 @@ describe("buildInterviewQuestion()", () => {
141
33
  expect(q.options).toBeDefined();
142
34
  expect(q.options.some((o) => o.kind === "freetext")).toBe(true);
143
35
  });
36
+ it("surfaces the model's reason (gap.description) as the askcard context", () => {
37
+ const gap = {
38
+ dimension: "outcome",
39
+ description: "answering this changes whether we add OAuth or just API keys",
40
+ suggestedQuestion: "Which auth method?",
41
+ options: ["OAuth", "API keys"],
42
+ defaultIndex: 0,
43
+ };
44
+ const q = buildInterviewQuestion(gap, "q-2");
45
+ expect(q.context).toBe("answering this changes whether we add OAuth or just API keys");
46
+ });
144
47
  });
145
48
  describe("resolveGapsNonInteractive()", () => {
146
- it("fills gaps with best-effort from project context", () => {
147
- const gaps = detectClarityGaps("fix auth", "debug", 0.7, EMPTY_PROJECT);
49
+ it("fills gaps with best-effort defaults from the model options + project context", () => {
50
+ const gaps = [
51
+ {
52
+ dimension: "outcome",
53
+ description: "Model-generated clarification #1",
54
+ suggestedQuestion: "What outcome do you expect?",
55
+ options: ["Error resolved", "Other (type free answer)"],
56
+ defaultIndex: 0,
57
+ },
58
+ ];
148
59
  const resolved = resolveGapsNonInteractive(gaps, EMPTY_PROJECT, "fix auth");
60
+ expect(resolved.outcome).toBe("Error resolved");
61
+ expect(resolved.scope.length).toBeGreaterThan(0);
62
+ });
63
+ it("falls back to the raw-derived outcome when there is no outcome gap", () => {
64
+ const resolved = resolveGapsNonInteractive([], EMPTY_PROJECT, "fix the login bug");
149
65
  expect(resolved.outcome).toBeTruthy();
150
66
  expect(resolved.scope.length).toBeGreaterThan(0);
151
67
  });
@@ -75,6 +75,43 @@ describe("layer4Gsd (gsd-native)", () => {
75
75
  const result = await layer4Gsd(makeCtx({ raw: "review the pull request" }));
76
76
  expect(["review", "discuss", "execute"]).toContain(result.gsdPhase);
77
77
  });
78
+ it("routes a question-shaped analyze/debug prompt to the QUESTION directive (no 'state a plan')", async () => {
79
+ // De-robotizing: a plain question must not get the STANDARD "state a 2-3 line
80
+ // plan" scaffold even when L1 classifies it analyze/debug (not "general").
81
+ const q = "why does the build fail intermittently?";
82
+ const result = await layer4Gsd(makeCtx({ raw: q, enriched: q, taskType: "debug", intentKind: "task" }));
83
+ expect(result.enriched).toContain("QUESTION / explanatory");
84
+ expect(result.enriched).not.toContain("State a 2-3 line plan");
85
+ });
86
+ it("treats a genuine general question (general + task) as informational", async () => {
87
+ const q = "what does the enrichment layer do?";
88
+ const result = await layer4Gsd(makeCtx({ raw: q, enriched: q, taskType: "general", intentKind: "task" }));
89
+ expect(result.enriched).toContain("QUESTION / explanatory");
90
+ });
91
+ it("does NOT treat an implementation request as informational even if phrased as a question", async () => {
92
+ // isImplementationIntent guards the question clause: "can you refactor … and
93
+ // wire up …" is a real edit task → STANDARD scaffold, not the QUESTION directive.
94
+ const q = "can you refactor the dropdown and wire up the keyboard handlers?";
95
+ const result = await layer4Gsd(makeCtx({ raw: q, enriched: q, taskType: "refactor", intentKind: "task" }));
96
+ expect(result.enriched).not.toContain("QUESTION / explanatory");
97
+ });
98
+ it("Phase 2b: deliverableKind='answer' is informational even for an imperative (no '?') prompt", async () => {
99
+ // The raw text is a plain imperative — the legacy regex (isQuestionLike /
100
+ // isMetaAnalysisPrompt) would NOT mark it informational. The model's
101
+ // deliverableKind='answer' must override that and route to the QUESTION
102
+ // directive — proving L4 consumes the model signal, not the regex.
103
+ const raw = "go over the auth module and tell me what it does";
104
+ const result = await layer4Gsd(makeCtx({ raw, enriched: raw, taskType: "analyze", intentKind: "task", deliverableKind: "answer" }));
105
+ expect(result.enriched).toContain("QUESTION / explanatory");
106
+ });
107
+ it("Phase 2b: deliverableKind='code' is NOT informational even for a question-shaped prompt", async () => {
108
+ // The raw text reads as a question — the legacy regex would mark it
109
+ // informational. The model's deliverableKind='code' must override that so
110
+ // the STANDARD implement scaffold is used (the deliverable is file edits).
111
+ const raw = "why not just refactor the dropdown and wire the keyboard handlers?";
112
+ const result = await layer4Gsd(makeCtx({ raw, enriched: raw, taskType: "refactor", intentKind: "task", deliverableKind: "code" }));
113
+ expect(result.enriched).not.toContain("QUESTION / explanatory");
114
+ });
78
115
  it("uses ctx.gsdPhase from L1 (unified path) without calling routeTask", async () => {
79
116
  const { routeTask } = await import("../../ee/bridge.js");
80
117
  vi.mocked(routeTask).mockClear();
@@ -1,5 +1,5 @@
1
1
  import { beforeEach, describe, expect, it, vi } from "vitest";
2
- import { applyPilSuffix, getResponseToolSet, layer6Output } from "../layer6-output.js";
2
+ import { applyPilSuffix, getResponseToolSet, isImplementationIntent, isQuestionLike, layer6Output, } from "../layer6-output.js";
3
3
  // Mock bridge for PIL-03 classifyViaBrain tests
4
4
  vi.mock("../../ee/bridge.js", () => ({
5
5
  classifyViaBrain: vi.fn().mockResolvedValue(null),
@@ -55,6 +55,38 @@ describe("applyPilSuffix — per-task-type suffixes", () => {
55
55
  expect(result).toMatch(/Tôi sẽ/); // bilingual
56
56
  }
57
57
  });
58
+ it("de-robotized: NO_PREAMBLE bans only openers, not end-of-turn summary or inter-tool narration", () => {
59
+ // The summary + inter-tool bans were removed because they stripped natural
60
+ // connective tissue (the "máy móc" feel). Inter-tool spam is still removed
61
+ // structurally by stripInterToolNarration() in reasoning.ts. This guards
62
+ // against the bans silently creeping back into the system prompt.
63
+ const result = applyPilSuffix("S", makeCtx("debug", "concise"));
64
+ expect(result).toMatch(/FORBIDDEN OPENERS/);
65
+ expect(result).not.toMatch(/FORBIDDEN END-OF-TURN SUMMARY/);
66
+ expect(result).not.toMatch(/FORBIDDEN INTER-TOOL NARRATION/);
67
+ });
68
+ it("de-robotized: debug suffix is guidance, not a rigid arrow skeleton", () => {
69
+ // "Format = Hypothesis → Root cause → Fix → Verify" produced stilted,
70
+ // label-prefixed answers. It must read as guidance now.
71
+ const result = applyPilSuffix("S", makeCtx("debug", "concise"));
72
+ expect(result).toContain("OUTPUT RULES (debug)");
73
+ expect(result).not.toMatch(/Format = Hypothesis/);
74
+ });
75
+ it("E: appends the anti-bookkeeping note on the natural path for non-question turns", () => {
76
+ // The contract's REPORTING rule leaks as a provenance footer ("evidence only
77
+ // from this turn") on imperative answer turns; the natural path now guards it.
78
+ const result = applyPilSuffix("S", makeCtx("analyze", "concise"));
79
+ expect(result).toMatch(/WRITE FOR THE READER/);
80
+ expect(result).toMatch(/provenance/i);
81
+ });
82
+ it("E: skips the anti-bookkeeping note for question turns (L4 QUESTION directive covers them)", () => {
83
+ const ctx = { ...makeCtx("analyze", "concise"), raw: "why does the enrichment layer fail?" };
84
+ expect(applyPilSuffix("S", ctx)).not.toMatch(/WRITE FOR THE READER/);
85
+ });
86
+ it("E: response-tools path does not add the natural-path bookkeeping note", () => {
87
+ const result = applyPilSuffix("S", makeCtx("analyze", "balanced"), true);
88
+ expect(result).not.toMatch(/WRITE FOR THE READER/);
89
+ });
58
90
  it("PIL-04: response-tools path skips budget+preamble (tool already enforces structure)", () => {
59
91
  const result = applyPilSuffix("S", makeCtx("analyze", "balanced"), true);
60
92
  expect(result).toContain("respond_analyze");
@@ -74,29 +106,33 @@ describe("applyPilSuffix — per-task-type suffixes", () => {
74
106
  expect(result).toMatch(/Do NOT append an evidence-provenance footer/);
75
107
  });
76
108
  });
77
- describe("getResponseToolSet — PIL-04 Tier 1.1 gating", () => {
78
- it("returns response tool for analyze (list-shaped, JSON wins)", () => {
79
- const tools = getResponseToolSet(makeCtx("analyze", null));
109
+ describe("getResponseToolSet — narrow gating (de-robotizing)", () => {
110
+ // Override raw on a typed ctx so the report/question discriminator is exercised.
111
+ const ctxRaw = (raw, t) => ({ ...makeCtx(t, null), raw });
112
+ it("returns response tool for analyze on an explicit report/list request", () => {
113
+ const tools = getResponseToolSet(ctxRaw("audit the orchestrator and list all cost-leak findings", "analyze"));
80
114
  expect(Object.keys(tools)).toContain("respond_analyze");
81
115
  });
82
- it("returns response tool for plan (list-shaped, JSON wins)", () => {
83
- const tools = getResponseToolSet(makeCtx("plan", null));
116
+ it("returns response tool for plan on an explicit plan request", () => {
117
+ const tools = getResponseToolSet(ctxRaw("plan the migration to the new auth flow step by step", "plan"));
84
118
  expect(Object.keys(tools)).toContain("respond_plan");
85
119
  });
120
+ it("returns response tool for debug only on an explicit report request", () => {
121
+ const tools = getResponseToolSet(ctxRaw("audit the failing suite and list each root cause", "debug"));
122
+ expect(Object.keys(tools)).toContain("respond_debug");
123
+ });
86
124
  it("returns empty toolset for generate (code-heavy, markdown wins)", () => {
87
125
  expect(getResponseToolSet(makeCtx("generate", null))).toEqual({});
88
126
  });
89
127
  it("returns empty toolset for refactor (diff-heavy, markdown wins)", () => {
90
128
  expect(getResponseToolSet(makeCtx("refactor", null))).toEqual({});
91
129
  });
92
- it("returns response tool for debug (bounded schema, structural enforcement wins)", () => {
93
- const tools = getResponseToolSet(makeCtx("debug", null));
94
- expect(Object.keys(tools)).toContain("respond_debug");
95
- });
96
130
  it("returns empty toolset for documentation (prose-heavy)", () => {
97
131
  expect(getResponseToolSet(makeCtx("documentation", null))).toEqual({});
98
132
  });
99
- it("returns response tool for general when no providerId is passed (back-compat)", () => {
133
+ it("returns response tool for general regardless of report signal (renders as plain markdown)", () => {
134
+ // general is exempt from the report/question gate: GeneralSchema is pure text
135
+ // and its renderer shows plain markdown, so respond_general is never robotic.
100
136
  const tools = getResponseToolSet(makeCtx("general", null));
101
137
  expect(Object.keys(tools)).toContain("respond_general");
102
138
  });
@@ -113,24 +149,78 @@ describe("getResponseToolSet — PIL-04 Tier 1.1 gating", () => {
113
149
  it("returns empty toolset when taskType is null", () => {
114
150
  expect(getResponseToolSet(makeCtx(null, null))).toEqual({});
115
151
  });
152
+ it("gates the response tool for chitchat turns", () => {
153
+ const ctx = { ...makeCtx("general", null), intentKind: "chitchat" };
154
+ expect(getResponseToolSet(ctx)).toEqual({});
155
+ });
156
+ it("DROPS respond_<task> for question-style debug/analyze/plan (natural markdown path)", () => {
157
+ // The de-robotizing change: a plain QUESTION must not be forced into the
158
+ // rigid respond_* schema + labeled renderer. It falls through to the softened
159
+ // markdown OUTPUT RULES so the answer reads as natural prose.
160
+ expect(getResponseToolSet(ctxRaw("why does the build fail intermittently?", "debug"))).toEqual({});
161
+ expect(getResponseToolSet(ctxRaw("analyze how the enrichment function works", "analyze"))).toEqual({});
162
+ expect(getResponseToolSet(ctxRaw("what is the cleanest way to structure this module?", "plan"))).toEqual({});
163
+ });
164
+ it("KEEPS respond_<task> for explicit report / list / plan requests (EN + VI)", () => {
165
+ const keep = (raw, t) => Object.keys(getResponseToolSet(ctxRaw(raw, t)));
166
+ expect(keep("list all cost leaks in the orchestrator", "analyze")).toContain("respond_analyze");
167
+ expect(keep("review the module and report each finding by severity", "analyze")).toContain("respond_analyze");
168
+ expect(keep("lập kế hoạch migration sang auth flow mới", "plan")).toContain("respond_plan");
169
+ });
170
+ it("DROPS respond_<task> for a QUESTION that merely mentions plan/list (narrow-gate fix)", () => {
171
+ // Live bug (grok interview): a question that QUOTED the phrase "state a 2-3
172
+ // line plan" matched the bare word 'plan' in STRUCTURED_REPORT_RE and forced
173
+ // respond_plan, cramming an introspective answer into a rigid plan schema. A
174
+ // question-shaped prompt must stay on the natural markdown path even when it
175
+ // contains plan/list words.
176
+ expect(getResponseToolSet(ctxRaw("what rules constrain you, e.g. the 'state a 2-3 line plan' directive?", "plan"))).toEqual({});
177
+ expect(getResponseToolSet(ctxRaw("can you list the main points?", "analyze"))).toEqual({});
178
+ expect(getResponseToolSet(ctxRaw("how would you plan the rollout?", "plan"))).toEqual({});
179
+ // Imperative delivery requests are NOT question-shaped → still structured.
180
+ expect(Object.keys(getResponseToolSet(ctxRaw("plan the rollout step by step", "plan")))).toContain("respond_plan");
181
+ });
116
182
  it("drops respond_<task> on an IMPLEMENTATION-intent prompt (no premature terminal answer)", () => {
117
183
  // Live (grok session 19fa8895c41c): an "Improve … implement these fixes"
118
184
  // prompt classified `debug` got respond_debug; the model called it mid-task
119
185
  // as a plan and the turn ended before the edits completed. Implementation
120
186
  // turns must fall through to markdown OUTPUT RULES, not a terminal tool.
187
+ // Implementation intent takes precedence over a report signal.
121
188
  const impl = (raw, t) => ({ ...makeCtx(t, null), raw });
122
189
  expect(getResponseToolSet(impl("Improve the story-list screen. Implement these prioritized fixes: …", "debug"))).toEqual({});
123
190
  expect(getResponseToolSet(impl("Edit ONLY these two files and fix the empty span", "debug"))).toEqual({});
124
191
  expect(getResponseToolSet(impl("refactor the genre dropdown and wire up keyboard handlers", "analyze"))).toEqual({});
125
192
  expect(getResponseToolSet(impl("triển khai các cải tiến đã đề xuất", "plan"))).toEqual({});
126
193
  });
127
- it("KEEPS respond_<task> for pure analysis/plan prompts (narrowness guard)", () => {
128
- // The deliverable here IS a structured report must not be suppressed.
129
- const ana = (raw, t) => ({ ...makeCtx(t, null), raw });
130
- expect(Object.keys(getResponseToolSet(ana("analyze the orchestrator for cost leaks", "analyze")))).toContain("respond_analyze");
131
- expect(Object.keys(getResponseToolSet(ana("why does the build fail intermittently?", "debug")))).toContain("respond_debug");
132
- expect(Object.keys(getResponseToolSet(ana("plan the migration to the new auth flow", "plan")))).toContain("respond_plan");
133
- expect(Object.keys(getResponseToolSet(ana("review the auth module and explain the design", "analyze")))).toContain("respond_analyze");
194
+ });
195
+ describe("getResponseToolSet Phase 2b deliverableKind consume (model overrides regex)", () => {
196
+ const ctxD = (raw, t, deliverableKind) => ({
197
+ ...makeCtx(t, null),
198
+ raw,
199
+ deliverableKind,
200
+ });
201
+ it("deliverableKind='code' DROPS respond_* even when the prompt reads as a report/list", () => {
202
+ // Legacy regex (prefersStructuredReport) would KEEP the tool on "list all …".
203
+ // The model said the deliverable is code → drop it (edits, not a report).
204
+ expect(getResponseToolSet(ctxD("list all cost leaks in the orchestrator", "analyze", "code"))).toEqual({});
205
+ });
206
+ it("deliverableKind='report' KEEPS respond_* even when the prompt is question-shaped", () => {
207
+ // Legacy regex (isQuestionLike) would DROP the tool on "why does …?". The
208
+ // model said the deliverable is a structured report → keep it.
209
+ const tools = getResponseToolSet(ctxD("why does the suite fail — break it down by cause", "analyze", "report"));
210
+ expect(Object.keys(tools)).toContain("respond_analyze");
211
+ });
212
+ it("deliverableKind='answer' DROPS respond_* for non-general even on a report-shaped request", () => {
213
+ expect(getResponseToolSet(ctxD("plan the migration step by step", "plan", "answer"))).toEqual({});
214
+ });
215
+ it("deliverableKind='answer' KEEPS respond_general (general is exempt — renders as plain markdown)", () => {
216
+ const tools = getResponseToolSet(ctxD("what does the enrichment layer do?", "general", "answer"));
217
+ expect(Object.keys(tools)).toContain("respond_general");
218
+ });
219
+ it("falls back to the legacy regex when deliverableKind is absent (null)", () => {
220
+ // No model signal → legacy path: question-shaped analyze drops the tool.
221
+ expect(getResponseToolSet({ ...makeCtx("analyze", null), raw: "why does the build fail?" })).toEqual({});
222
+ // …and an explicit report request keeps it.
223
+ expect(Object.keys(getResponseToolSet({ ...makeCtx("analyze", null), raw: "list all cost leaks" }))).toContain("respond_analyze");
134
224
  });
135
225
  });
136
226
  describe("applyPilSuffix — outputStyle variants", () => {
@@ -294,4 +384,33 @@ describe("layer6Output", () => {
294
384
  expect(result.enriched).toBe(ctx.enriched);
295
385
  });
296
386
  });
387
+ describe("isQuestionLike — Vietnamese yes/no question frames (regression: session f6f7881a5fae)", () => {
388
+ it("detects the live miss: 'check ... dùng được mcp ... không nhé'", () => {
389
+ // The exact prompt that was mis-routed to the implement/verify scaffold.
390
+ expect(isQuestionLike("bạn check xem dùng được mcp muonroi-docs không nhé")).toBe(true);
391
+ // It is NOT an implementation intent, so layer4-gsd's informational gate fires.
392
+ expect(isImplementationIntent("bạn check xem dùng được mcp muonroi-docs không nhé")).toBe(false);
393
+ });
394
+ it("detects common VI yes/no tails", () => {
395
+ expect(isQuestionLike("dùng được không")).toBe(true);
396
+ expect(isQuestionLike("cái này chạy được không vậy")).toBe(true);
397
+ expect(isQuestionLike("đúng không")).toBe(true);
398
+ expect(isQuestionLike("phải không nhỉ")).toBe(true);
399
+ expect(isQuestionLike("test đã pass chưa")).toBe(true);
400
+ expect(isQuestionLike("xong chưa ạ")).toBe(true);
401
+ expect(isQuestionLike("có chạy được không?")).toBe(true);
402
+ });
403
+ it("does NOT treat a mid-sentence negation as a question", () => {
404
+ // "không là hỏng" = "or it breaks" — 'không' is not the clause-final particle.
405
+ expect(isQuestionLike("đừng commit file .env không là lộ key")).toBe(false);
406
+ // Plain imperative with a 'nhé' softener (no 'không'/'chưa' tail) stays a task.
407
+ expect(isQuestionLike("sửa giúp tôi cái này nhé")).toBe(false);
408
+ expect(isQuestionLike("triển khai tính năng login")).toBe(false);
409
+ });
410
+ it("still detects the pre-existing EN/VI question shapes", () => {
411
+ expect(isQuestionLike("why does the build fail?")).toBe(true);
412
+ expect(isQuestionLike("tại sao build lỗi")).toBe(true);
413
+ expect(isQuestionLike("explain the pipeline")).toBe(true);
414
+ });
415
+ });
297
416
  //# sourceMappingURL=layer6-output.test.js.map
@@ -23,6 +23,32 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
23
23
  expect(result?.outputStyle).toBe("concise");
24
24
  expect(result?.confidence).toBeGreaterThan(0.5);
25
25
  });
26
+ it("parses the three-word reply and marks chitchat from the intent word", async () => {
27
+ const handle = installMockModel({ fixture: { stream: textOnlyStream("general,concise,chat") } });
28
+ cleanup = handle.uninstall;
29
+ const factory = (() => handle.model);
30
+ const classify = createLlmClassifier(factory, "deepseek-v4-flash");
31
+ const result = await classify("cảm ơn bạn nhé");
32
+ expect(result?.taskType).toBe("general");
33
+ expect(result?.intentKind).toBe("chitchat");
34
+ });
35
+ it("treats a general QUESTION as task, not chitchat (keep-tools)", async () => {
36
+ const handle = installMockModel({ fixture: { stream: textOnlyStream("general,concise,task") } });
37
+ cleanup = handle.uninstall;
38
+ const factory = (() => handle.model);
39
+ const classify = createLlmClassifier(factory, "deepseek-v4-flash");
40
+ const result = await classify("bạn thử call tool setup_guide xem được không");
41
+ expect(result?.intentKind).toBe("task");
42
+ });
43
+ it("defaults intentKind to task when the model omits the third word (backward compatible)", async () => {
44
+ const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise") } });
45
+ cleanup = handle.uninstall;
46
+ const factory = (() => handle.model);
47
+ const classify = createLlmClassifier(factory, "deepseek-v4-flash");
48
+ const result = await classify("fix the failing build");
49
+ expect(result?.taskType).toBe("debug");
50
+ expect(result?.intentKind).toBe("task");
51
+ });
26
52
  it("returns null when the reply cannot be parsed", async () => {
27
53
  const handle = installMockModel({ fixture: { stream: textOnlyStream("¯\\_(ツ)_/¯") } });
28
54
  cleanup = handle.uninstall;
@@ -100,14 +126,35 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
100
126
  expect(result?.taskType).toBe("debug");
101
127
  expect(result?.outputStyle).toBe("concise");
102
128
  });
103
- it("keeps the tiny 16-token budget for non-reasoning models", async () => {
129
+ it("keeps a tiny output budget for non-reasoning models (24 — four comma words)", async () => {
104
130
  const handle = installMockModel({ fixture: { stream: textOnlyStream("generate,concise") } });
105
131
  cleanup = handle.uninstall;
106
132
  const factory = (() => handle.model);
107
133
  const classify = createLlmClassifier(factory, "Qwen/Qwen3-8B"); // reasoning:false
108
134
  await classify("add a new endpoint");
109
135
  const call = handle.calls[0];
110
- expect(call.maxOutputTokens).toBe(16);
136
+ expect(call.maxOutputTokens).toBe(24);
137
+ });
138
+ it("parses the fourth word as the output deliverable (Phase 2b)", async () => {
139
+ const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code") } });
140
+ cleanup = handle.uninstall;
141
+ const factory = (() => handle.model);
142
+ const classify = createLlmClassifier(factory, "deepseek-v4-flash");
143
+ const result = await classify("fix the crash in src/auth/login.ts");
144
+ expect(result?.taskType).toBe("debug");
145
+ expect(result?.deliverableKind).toBe("code");
146
+ });
147
+ it("recovers the deliverable position-independently and defaults to null when absent", async () => {
148
+ const reportHandle = installMockModel({ fixture: { stream: textOnlyStream("analyze,concise,task,report") } });
149
+ cleanup = reportHandle.uninstall;
150
+ const reportClassify = createLlmClassifier((() => reportHandle.model), "deepseek-v4-flash");
151
+ expect((await reportClassify("list every env var the CLI reads"))?.deliverableKind).toBe("report");
152
+ reportHandle.uninstall();
153
+ // Model omits the 4th word → deliverableKind null (consumers fall back to regex).
154
+ const bareHandle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise") } });
155
+ cleanup = bareHandle.uninstall;
156
+ const bareClassify = createLlmClassifier((() => bareHandle.model), "deepseek-v4-flash");
157
+ expect((await bareClassify("fix it"))?.deliverableKind).toBeNull();
111
158
  });
112
159
  });
113
160
  //# sourceMappingURL=llm-classify.test.js.map
@@ -37,7 +37,7 @@
37
37
  * one imperative line targeting that phase's most damaging failure mode. Kept
38
38
  * tight (primacy matters more than detail; tokens are the cost).
39
39
  */
40
- export declare const AGENT_OPERATING_CONTRACT = "[AGENT OPERATING CONTRACT \u2014 read first; applies to every step]\n\n1. BEFORE ACTING: do only what was asked. Never assume scope or facts \u2014 if ambiguous, ask or use defaults; never invent requirements.\n2. READING: base statements on what you read/ran THIS turn. Do not infer contents of files you did not open.\n3. EXECUTING: smallest correct change; never widen scope or mask failures (no `|| true`, skipped tests, or swallowed catch).\n4. WHEN UNSURE: verify and cross-check BEFORE concluding. Bugs need a reproduction; reading code is not proof.\n5. REPORTING: answer ONLY what was asked. Every fact or file:line MUST come from this turn; else label \"unverified\"; do not guess. Synthesize evidence gracefully \u2014 do NOT dump massive verbatim tool outputs into the final answer. Cite concise file:line references. Never claim a build/test ran, or describe edits, you did not actually do this turn; if a check can't run, fix it or say so \u2014 don't imply success.\n\n6. LANGUAGE: Reply in user's detected language for final output. Internal reasoning, tools, and code remain in English.\n\n7. ANTI-M\u00D9 / COMPACTION: After seeing \"[pre-compaction warning at step...\" or \"[context compacted at step...\", decide if you need full prior tool results. Emit PRESERVE_FULL_CONTEXT for full veto this turn, or the lighter KEEP_TOOL_IDS: id1,id2 (ids from prior stub \"(id=...)\") to protect only high-value results (read_file/grep on src/PLAN/error etc are auto-protected). Use the ee_query tool with \"tool-artifact id=XXX\" for on-demand full re-hydrate of elided ones. Self-check \"task finished?\" / \"compacted yet?\". Use EE checkpoints.\n\n[END CONTRACT \u2014 instructions follow]";
40
+ export declare const AGENT_OPERATING_CONTRACT = "[AGENT OPERATING CONTRACT \u2014 read first; applies to every step]\n\n1. BEFORE ACTING: do only what was asked. Never assume scope or facts \u2014 if ambiguous, ask or use defaults; never invent requirements.\n2. READING: base statements on what you read/ran THIS turn. Do not infer contents of files you did not open.\n3. EXECUTING: smallest correct change; never widen scope or mask failures (no `|| true`, skipped tests, or swallowed catch).\n4. WHEN UNSURE: verify and cross-check BEFORE concluding. Bugs need a reproduction; reading code is not proof.\n5. REPORTING: answer ONLY what was asked. Every fact or file:line MUST come from this turn; else label \"unverified\"; do not guess. Synthesize evidence gracefully \u2014 do NOT dump massive verbatim tool outputs into the final answer. Cite concise file:line references. Never claim a build/test ran, or describe edits, you did not actually do this turn; if a check can't run, fix it or say so \u2014 don't imply success.\n\n6. LANGUAGE: Reply in user's detected language for final output. Internal reasoning, tools, and code remain in English.\n\n7. ANTI-M\u00D9 / COMPACTION: After seeing \"[pre-compaction warning at step...\" or \"[context compacted at step...\", decide if you need full prior tool results. Emit PRESERVE_FULL_CONTEXT for full veto this turn, or the lighter KEEP_TOOL_IDS: id1,id2 (ids from prior stub \"(id=...)\") to protect only high-value results (read_file/grep on src/PLAN/error etc are auto-protected). Use the ee_query tool with \"tool-artifact id=XXX\" for on-demand full re-hydrate of elided ones. Self-check \"task finished?\" / \"compacted yet?\". Use EE checkpoints.\n\n8. GIT SAFETY: never push on red \u2014 run the check, await its result in a SEPARATE step, confirm 0 failures, then push. Never `git add -A`/`commit -a`; stage explicitly so secrets (.env, .muonroi-cli/, keys) aren't committed. 
Never `--no-verify`.\n\n[END CONTRACT \u2014 instructions follow]";
41
41
  export interface ContractSectionOptions {
42
42
  /** Chitchat turns carry no tools and make no factual claims — skip the contract. */
43
43
  chitchat?: boolean;
@@ -49,6 +49,8 @@ export const AGENT_OPERATING_CONTRACT = `[AGENT OPERATING CONTRACT — read firs
49
49
 
50
50
  7. ANTI-MÙ / COMPACTION: After seeing "[pre-compaction warning at step..." or "[context compacted at step...", decide if you need full prior tool results. Emit PRESERVE_FULL_CONTEXT for full veto this turn, or the lighter KEEP_TOOL_IDS: id1,id2 (ids from prior stub "(id=...)") to protect only high-value results (read_file/grep on src/PLAN/error etc are auto-protected). Use the ee_query tool with "tool-artifact id=XXX" for on-demand full re-hydrate of elided ones. Self-check "task finished?" / "compacted yet?". Use EE checkpoints.
51
51
 
52
+ 8. GIT SAFETY: never push on red — run the check, await its result in a SEPARATE step, confirm 0 failures, then push. Never \`git add -A\`/\`commit -a\`; stage explicitly so secrets (.env, .muonroi-cli/, keys) aren't committed. Never \`--no-verify\`.
53
+
52
54
  [END CONTRACT — instructions follow]`;
53
55
  /**
54
56
  * Build the contract block for insertion at the front of the system prompt.
@@ -42,8 +42,13 @@ describe("AGENT_OPERATING_CONTRACT", () => {
42
42
  expect(AGENT_OPERATING_CONTRACT).toMatch(/AGENT OPERATING CONTRACT/i);
43
43
  expect(AGENT_OPERATING_CONTRACT).toMatch(/END CONTRACT/i);
44
44
  });
45
- it("stays compact (under 1800 chars) to preserve attention budget on every turn (anti-mù section added)", () => {
46
- expect(AGENT_OPERATING_CONTRACT.length).toBeLessThan(1800);
45
+ it("carries the git-safety rule (never push on red; no broad git add of secrets)", () => {
46
+ expect(AGENT_OPERATING_CONTRACT).toMatch(/GIT SAFETY/i);
47
+ expect(AGENT_OPERATING_CONTRACT).toMatch(/push on red|never push/i);
48
+ expect(AGENT_OPERATING_CONTRACT).toMatch(/git add -A|stage explicitly/i);
49
+ });
50
+ it("stays compact (under 1900 chars) to preserve attention budget on every turn (git-safety rule added)", () => {
51
+ expect(AGENT_OPERATING_CONTRACT.length).toBeLessThan(1900);
47
52
  });
48
53
  });
49
54
  describe("buildContractSection", () => {
@@ -25,41 +25,41 @@
25
25
  * Wrapped with the `[CRITICAL TOOL-USE RULES ...]` marker so the model knows
26
26
  * to treat these as overrides to anything that follows.
27
27
  */
28
- export const CHEAP_MODEL_PLAYBOOK = `[CRITICAL TOOL-USE RULES — read before invoking any tool; these override defaults that follow]
29
-
30
- 1. Bash output is AUTOMATICALLY cached. Every \`bash\` call returns a \`run_id\`
31
- (e.g. \`bash-1\`) you can re-query via \`bash_output_get(run_id, mode=tail|head|grep|lines)\`.
32
- - When you want only the last N lines: do NOT pipe \`| tail -N\`. Run the
33
- bare command, then call \`bash_output_get(run_id, mode=tail, lines=N)\`.
34
- - Same for \`| head\`, \`| grep PATTERN\`, \`> file\`. Pipes/redirects HIDE
35
- the full output from the cache; \`bash_output_get\` reads from the cache
36
- without re-running.
37
- - This applies to EVERY bash call, not just retries.
38
- - To VIEW a file use \`read_file\` (start_line/end_line) — never sed/cat a
39
- file. \`bash_output_get\` is for COMMAND output, not files.
40
-
41
- 2. Before reading more than 3 files to understand a topic, delegate to
42
- \`task(agent="explore")\`. The sub-agent returns a compressed summary;
43
- you save reading tokens.
44
-
45
- 3. Use the \`grep\` tool (ripgrep) for content search — NOT \`bash\` with
46
- \`grep\` / \`find\` piped.
47
-
48
- 4. When a tool returns \`ERROR: ...\`, do NOT retry the identical call.
49
- Pick a different tool, change inputs meaningfully, or stop and report.
50
-
51
- 5. Fix the ROOT CAUSE, never mask a failure to make it "pass"
52
- (\`continue-on-error\`, swallowed try/catch, skipped/deleted test, \`|| true\`).
53
- If a step fails from a missing secret/config, make it CONDITIONAL (skip when
54
- absent) so it still runs when present — do NOT blanket-ignore it.
55
-
56
- 6. For a build / CI / test failure, read the ACTUAL failure log or stack trace
57
- BEFORE hypothesizing — fix the real error, not a guess from source alone.
58
-
59
- 7. ANTI-MÙ / COMPACTION (for long sessions): On pre-warn or "[context compacted at step...", emit PRESERVE_FULL_CONTEXT (full veto) or lighter KEEP_TOOL_IDS: id1,id2 (from stub id=) to protect specific high-value results. read_file/grep/lsp/bash on src/PLAN/error are auto-kept (idea 1). Use ee.query tool with "tool-artifact id=XXX" for on-demand full. Self-check "task finished?" / "compacted yet?". Use EE checkpoints.
60
-
61
- [END CRITICAL TOOL-USE RULES — your regular instructions begin below]
62
-
28
+ export const CHEAP_MODEL_PLAYBOOK = `[CRITICAL TOOL-USE RULES — read before invoking any tool; these override defaults that follow]
29
+
30
+ 1. Bash output is AUTOMATICALLY cached. Every \`bash\` call returns a \`run_id\`
31
+ (e.g. \`bash-1\`) you can re-query via \`bash_output_get(run_id, mode=tail|head|grep|lines)\`.
32
+ - When you want only the last N lines: do NOT pipe \`| tail -N\`. Run the
33
+ bare command, then call \`bash_output_get(run_id, mode=tail, lines=N)\`.
34
+ - Same for \`| head\`, \`| grep PATTERN\`, \`> file\`. Pipes/redirects HIDE
35
+ the full output from the cache; \`bash_output_get\` reads from the cache
36
+ without re-running.
37
+ - This applies to EVERY bash call, not just retries.
38
+ - To VIEW a file use \`read_file\` (start_line/end_line) — never sed/cat a
39
+ file. \`bash_output_get\` is for COMMAND output, not files.
40
+
41
+ 2. Before reading more than 3 files to understand a topic, delegate to
42
+ \`task(agent="explore")\`. The sub-agent returns a compressed summary;
43
+ you save reading tokens.
44
+
45
+ 3. Use the \`grep\` tool (ripgrep) for content search — NOT \`bash\` with
46
+ \`grep\` / \`find\` piped.
47
+
48
+ 4. When a tool returns \`ERROR: ...\`, do NOT retry the identical call.
49
+ Pick a different tool, change inputs meaningfully, or stop and report.
50
+
51
+ 5. Fix the ROOT CAUSE, never mask a failure to make it "pass"
52
+ (\`continue-on-error\`, swallowed try/catch, skipped/deleted test, \`|| true\`).
53
+ If a step fails from a missing secret/config, make it CONDITIONAL (skip when
54
+ absent) so it still runs when present — do NOT blanket-ignore it.
55
+
56
+ 6. For a build / CI / test failure, read the ACTUAL failure log or stack trace
57
+ BEFORE hypothesizing — fix the real error, not a guess from source alone.
58
+
59
+ 7. ANTI-MÙ / COMPACTION (for long sessions): On pre-warn or "[context compacted at step...", emit PRESERVE_FULL_CONTEXT (full veto) or lighter KEEP_TOOL_IDS: id1,id2 (from stub id=) to protect specific high-value results. read_file/grep/lsp/bash on src/PLAN/error are auto-kept (idea 1). Use ee.query tool with "tool-artifact id=XXX" for on-demand full. Self-check "task finished?" / "compacted yet?". Use EE checkpoints.
60
+
61
+ [END CRITICAL TOOL-USE RULES — your regular instructions begin below]
62
+
63
63
  `;
64
64
  /**
65
65
  * Predicate gating playbook injection.