muonroi-cli 1.6.5 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/dist/src/generated/version.d.ts +1 -1
  2. package/dist/src/generated/version.js +1 -1
  3. package/dist/src/orchestrator/message-processor.js +1 -1
  4. package/dist/src/orchestrator/prompts.js +16 -2
  5. package/dist/src/orchestrator/stream-runner.js +50 -3
  6. package/dist/src/orchestrator/subagent-compactor.d.ts +1 -1
  7. package/dist/src/orchestrator/subagent-compactor.js +1 -1
  8. package/dist/src/pil/__tests__/layer4-gsd.test.js +40 -23
  9. package/dist/src/pil/__tests__/llm-classify.test.js +40 -3
  10. package/dist/src/pil/layer1-intent.js +10 -1
  11. package/dist/src/pil/layer1-intent.test.js +18 -0
  12. package/dist/src/pil/layer4-gsd.js +43 -19
  13. package/dist/src/pil/llm-classify.d.ts +36 -0
  14. package/dist/src/pil/llm-classify.js +84 -18
  15. package/dist/src/pil/types.d.ts +27 -2
  16. package/dist/src/{gsd → playbook}/__tests__/directives.test.js +34 -58
  17. package/dist/src/playbook/complexity.d.ts +17 -0
  18. package/dist/src/playbook/complexity.js +18 -0
  19. package/dist/src/{gsd → playbook}/directives.d.ts +20 -13
  20. package/dist/src/playbook/directives.js +149 -0
  21. package/dist/src/providers/__tests__/reasoning-roundtrip.test.js +70 -1
  22. package/dist/src/providers/strategies/deepseek.strategy.js +5 -22
  23. package/dist/src/providers/strategies/siliconflow.strategy.js +5 -0
  24. package/dist/src/providers/strategies/thinking-mode.d.ts +35 -0
  25. package/dist/src/providers/strategies/thinking-mode.js +73 -0
  26. package/dist/src/tools/registry.js +47 -47
  27. package/dist/src/ui/app.js +91 -24
  28. package/dist/src/ui/hooks/use-session-picker.d.ts +14 -0
  29. package/dist/src/ui/hooks/use-session-picker.js +20 -0
  30. package/dist/src/ui/modals/session-picker-modal.d.ts +14 -0
  31. package/dist/src/ui/modals/session-picker-modal.js +39 -0
  32. package/dist/src/ui/utils/relaunch.d.ts +41 -0
  33. package/dist/src/ui/utils/relaunch.js +71 -0
  34. package/dist/src/ui/utils/relaunch.test.js +83 -0
  35. package/package.json +1 -1
  36. package/dist/src/gsd/__tests__/complexity.test.js +0 -0
  37. package/dist/src/gsd/complexity.d.ts +0 -28
  38. package/dist/src/gsd/complexity.js +0 -103
  39. package/dist/src/gsd/directives.js +0 -154
  40. /package/dist/src/{gsd → playbook}/__tests__/directives.test.d.ts +0 -0
  41. /package/dist/src/{gsd/__tests__/complexity.test.d.ts → ui/utils/relaunch.test.d.ts} +0 -0
@@ -1,2 +1,2 @@
1
- export declare const PACKAGE_VERSION = "1.6.5";
1
+ export declare const PACKAGE_VERSION = "1.7.0";
2
2
  export declare const PACKAGE_DESCRIPTION = "BYOK AI coding agent with multi-model council debate, role-based routing, and auto-compact.";
@@ -1,5 +1,5 @@
1
1
  // AUTO-GENERATED by scripts/sync-version.cjs. DO NOT EDIT BY HAND.
2
2
  // Sourced from package.json at build time so it survives bun --compile bundling.
3
- export const PACKAGE_VERSION = "1.6.5";
3
+ export const PACKAGE_VERSION = "1.7.0";
4
4
  export const PACKAGE_DESCRIPTION = "BYOK AI coding agent with multi-model council debate, role-based routing, and auto-compact.";
5
5
  //# sourceMappingURL=version.js.map
@@ -60,7 +60,6 @@ import * as phaseTracker from "../ee/phase-tracker.js";
60
60
  import { buildScope as buildScopeForVeto } from "../ee/scope.js";
61
61
  import { fireTrajectoryEvent } from "../ee/session-trajectory.js";
62
62
  import { getTenantId as getTenantIdForVeto } from "../ee/tenant.js";
63
- import { mentionsEcosystemScope } from "../gsd/directives.js";
64
63
  import { acquireMcpTools } from "../mcp/client-pool.js";
65
64
  import { dropRedundantFsMcpTools, filterMcpServersByMessage } from "../mcp/smart-filter.js";
66
65
  import { getModelInfo } from "../models/registry.js";
@@ -69,6 +68,7 @@ import { injectCheapModelWorkbook, shouldInjectCheapModelWorkbook } from "../pil
69
68
  import { applyPilSuffix, getResponseTaskType, getResponseToolSet, isResponseTool, runPipeline, shouldHaltOnResponseTool, } from "../pil/index.js";
70
69
  import { isMetaAnalysisPrompt } from "../pil/layer6-output.js";
71
70
  import { taskTypeToMaxTokens, taskTypeToReasoningEffort, taskTypeToTier } from "../pil/task-tier-map.js";
71
+ import { mentionsEcosystemScope } from "../playbook/directives.js";
72
72
  import { getProviderCapabilities } from "../providers/capabilities.js";
73
73
  import { loadKeyForProvider } from "../providers/keychain.js";
74
74
  import { bridgeMcpToolResult, getVisionGuidanceForTextOnly, listCachedImages, scrubImagePayloadsInMessages, } from "../providers/mcp-vision-bridge.js";
@@ -245,6 +245,12 @@ DEFAULT DELEGATION POLICY:
245
245
  - Do not wait for the user to explicitly ask for a sub-agent when delegation would clearly help.
246
246
  - Skip delegation only when the task is trivial, single-file, or you already have the exact answer.
247
247
 
248
+ WRITING A GOOD DELEGATION PROMPT (the sub-agent sees ONLY what you put in the prompt field — it does NOT share your context):
249
+ - GOAL: state the one concrete question or outcome the sub must deliver.
250
+ - CONTEXT: include the specific facts the sub needs (file paths, symbol names, constraints, what you already know) so it doesn't re-derive them blindly.
251
+ - RETURN SHAPE: say exactly what to hand back — e.g. "return the findings as file:line + a one-line conclusion", or "return the diff you applied + tests run". The sub's final message is the only thing that re-enters YOUR context (capped ~32K), so a vague ask wastes the turn.
252
+ - When fanning out several sub-agents in parallel, give each a NON-overlapping scope so their syntheses compose instead of duplicating.
253
+
248
254
  EXAMPLES:
249
255
  - "review this change" -> delegate to explore first
250
256
  - "research how auth works" -> delegate to explore first
@@ -266,6 +272,8 @@ IMPORTANT:
266
272
  - Use write_file only for new files or when most of the file is changing. For very large files (>500 lines), split into multiple edit_file calls or write smaller chunks.
267
273
  - Use read_file instead of cat/head/tail for reading files.
268
274
  - When the user asks for an automated recurring or one-time run, use the schedule tools instead of only describing the setup.
275
+ - Commit incrementally (in any git repo, without being asked): the moment a cohesive, working chunk is finished and its checks pass, commit it before starting the next chunk. This is part of doing the work, not a separate request — prefer several small, logically-scoped commits over one large catch-all commit at the end. Group related file changes per commit; keep unrelated changes in separate commits.
276
+ - Every commit message you create MUST end with this attribution line, verbatim, on its own final line: "Coding by - Muonroi-CLI".
269
277
  - After creating a recurring schedule, check the daemon status and start it with \`schedule_daemon_start\` if needed.
270
278
 
271
279
  Be direct. Execute, don't just describe. Show results, not plans.`,
@@ -452,7 +460,10 @@ export function buildSubagentPrompt(request, cwd, custom, sandboxMode, subagents
452
460
  ? [
453
461
  "Do not create, modify, or delete files.",
454
462
  "Prefer `read_file` and search commands over broad shell exploration.",
455
- "Return concise findings for the parent agent.",
463
+ // RETURN CONTRACT — the parent only ingests your FINAL message (capped at
464
+ // ~32K, head+tail), never your tool output. Make that message a tight
465
+ // synthesis so the parent's context stays clean.
466
+ "End with a tight synthesis FOR THE PARENT AGENT: lead with the answer to the delegated task, ground each claim in a concrete file:line, then note any gaps or the recommended next step. Do NOT narrate your search process or restate these instructions — the parent needs the conclusion, not the journey.",
456
467
  ]
457
468
  : isVerifyDetect
458
469
  ? [
@@ -528,7 +539,10 @@ export function buildSubagentPrompt(request, cwd, custom, sandboxMode, subagents
528
539
  : [
529
540
  "Work only on the delegated task below.",
530
541
  "Use tools directly instead of narrating your intent.",
531
- "Return a concise summary for the parent agent with key outcomes and any open risks.",
542
+ // RETURN CONTRACT — the parent only ingests your FINAL message
543
+ // (capped at ~32K, head+tail), never your tool output. Make that
544
+ // message a tight synthesis so the parent's context stays clean.
545
+ "End with a tight synthesis FOR THE PARENT AGENT: lead with what you did / what you found, cite the concrete file:line you changed or relied on, then list any open risks, follow-ups, or verification still owed. Do NOT narrate your process or restate these instructions — the parent needs the result, not a transcript.",
532
546
  ];
533
547
  const instructionLines = custom?.instruction.trim() ? ["", "SUB-AGENT INSTRUCTIONS:", custom.instruction.trim()] : [];
534
548
  return [
@@ -345,7 +345,13 @@ export class StreamRunner {
345
345
  // sub-agent has no PIL ctx of its own; the caller already bounded the
346
346
  // work via maxSteps. We compose alongside that hard step cap so a
347
347
  // wandering sub-agent loop trips whichever fires first (logical OR).
348
- const _subCeiling = resolveCeiling("general", "medium");
348
+ // Explore sub-agents are READ-ONLY research — a codebase investigation
349
+ // legitimately needs more grep/read steps than the tight general/medium=10
350
+ // cell allows. Cutting it early (esp. for reasoning models that front-load
351
+ // tool calls) leaves no budget to reach the synthesis turn (root cause of
352
+ // empty "Task completed. Last action: grep" returns). Give explore the
353
+ // analyze/large cell (15); edit-capable agents keep general/medium (10).
354
+ const _subCeiling = isExplore ? resolveCeiling("analyze", "large") : resolveCeiling("general", "medium");
349
355
  const _subCounterKey = `subagent:${subCallId}`;
350
356
  const _subStopWhen = (async (state) => {
351
357
  if (state.steps.length >= maxSteps)
@@ -603,8 +609,49 @@ export class StreamRunner {
603
609
  finally {
604
610
  stall.dispose();
605
611
  }
606
- const output = assistantText.trim() || `Task completed. Last action: ${lastActivity}`;
607
- return { output, lastActivity, cancelled: false, assistantText };
612
+ // Forced final synthesis. When the loop ends on a tool-call / step-ceiling
613
+ // cut (finishReason="tool-calls"), the model never got a turn to write its
614
+ // findings, so assistantText is empty and the parent would receive the
615
+ // useless "Task completed. Last action: <tool>" fallback — then redo the
616
+ // work itself, defeating the whole point of delegation (live root cause:
617
+ // grok-build-0.1 emitted 277 reasoning-deltas + 19 tool-calls but ZERO
618
+ // text-deltas across the 10-step ceiling). Give the sub exactly ONE
619
+ // tool-free turn to synthesize what it already gathered. Tools are removed
620
+ // so finishReason cannot be "tool-calls" again — the model MUST emit text.
621
+ let synthesizedText = "";
622
+ if (!assistantText.trim() && !signal?.aborted && !stallTriggered) {
623
+ try {
624
+ const resp = await result.response;
625
+ const priorMessages = (resp.messages ?? []);
626
+ const synthResult = streamText({
627
+ model: childRuntime.model,
628
+ system: childSystem,
629
+ messages: [
630
+ ...childMessages,
631
+ ...priorMessages,
632
+ {
633
+ role: "user",
634
+ content: "You've reached your investigation budget and have not written your findings yet. Stop now — do NOT call any more tools. Write your final synthesis FOR THE PARENT AGENT: lead with the answer to the delegated task, cite the concrete file:line behind each claim, then note any gaps or the recommended next step. Be concise; the parent only ingests this message.",
635
+ },
636
+ ],
637
+ tools: {},
638
+ maxRetries: 0,
639
+ abortSignal: signal,
640
+ ...(childProviderOptions ? { providerOptions: childProviderOptions } : {}),
641
+ });
642
+ for await (const part of synthResult.fullStream) {
643
+ if (part.type === "text-delta")
644
+ synthesizedText += part.text ?? "";
645
+ }
646
+ debugLog(`forced-synthesis: textLen=${synthesizedText.length}`);
647
+ }
648
+ catch (err) {
649
+ debugLog(`forced-synthesis failed: ${err?.message}`);
650
+ }
651
+ }
652
+ const recovered = assistantText.trim() || synthesizedText.trim();
653
+ const output = recovered || `Task completed. Last action: ${lastActivity}`;
654
+ return { output, lastActivity, cancelled: false, assistantText: recovered };
608
655
  }
609
656
  /**
610
657
  * Phase 3 — canonical entrypoint. Orchestrates setup + runStream with
@@ -114,7 +114,7 @@ export declare const IMPORTANT_TOOL_NAMES: readonly ["read_file", "grep", "lsp",
114
114
  /**
115
115
  * MCP tool prefixes whose results are an AUTHORITATIVE source the agent is
116
116
  * explicitly steered to fetch FIRST and ground on (the ECOSYSTEM_DOCS_NUDGE in
117
- * src/gsd/directives.ts). Eliding them defeats the nudge — the agent calls the
117
+ * src/playbook/directives.ts). Eliding them defeats the nudge — the agent calls the
118
118
  * ecosystem docs, then compaction discards them and it goes blind on the very
119
119
  * source it was told to trust (session 584ba476c07a: mcp_muonroi-docs__setup_guide
120
120
  * + bb_recipe_list elided, ee_unavailable, 0 rehydrated → "partially blind").
@@ -76,7 +76,7 @@ export const IMPORTANT_TOOL_NAMES = [
76
76
  /**
77
77
  * MCP tool prefixes whose results are an AUTHORITATIVE source the agent is
78
78
  * explicitly steered to fetch FIRST and ground on (the ECOSYSTEM_DOCS_NUDGE in
79
- * src/gsd/directives.ts). Eliding them defeats the nudge — the agent calls the
79
+ * src/playbook/directives.ts). Eliding them defeats the nudge — the agent calls the
80
80
  * ecosystem docs, then compaction discards them and it goes blind on the very
81
81
  * source it was told to trust (session 584ba476c07a: mcp_muonroi-docs__setup_guide
82
82
  * + bb_recipe_list elided, ee_unavailable, 0 rehydrated → "partially blind").
@@ -17,7 +17,7 @@ function makeCtx(overrides = {}) {
17
17
  ...overrides,
18
18
  };
19
19
  }
20
- describe("layer4Gsd (gsd-native)", () => {
20
+ describe("layer4Gsd (playbook)", () => {
21
21
  it("skips directive injection when intentKind === 'chitchat'", async () => {
22
22
  const before = "hello";
23
23
  const result = await layer4Gsd(makeCtx({ raw: before, enriched: before, intentKind: "chitchat" }));
@@ -26,9 +26,9 @@ describe("layer4Gsd (gsd-native)", () => {
26
26
  expect(layer.applied).toBe(false);
27
27
  expect(layer.delta).toBe("skip:chitchat");
28
28
  });
29
- it("appends a gsd-native directive and records the layer as applied", async () => {
29
+ it("appends a playbook directive and records the layer as applied", async () => {
30
30
  const result = await layer4Gsd(makeCtx({ raw: "implement the login feature" }));
31
- expect(result.enriched).toContain("[gsd-native]");
31
+ expect(result.enriched).toContain("[playbook]");
32
32
  const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
33
33
  expect(layer).toBeDefined();
34
34
  expect(layer.applied).toBe(true);
@@ -39,18 +39,30 @@ describe("layer4Gsd (gsd-native)", () => {
39
39
  const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
40
40
  expect(layer.delta).toContain("phase=plan");
41
41
  });
42
- it("emits a HEAVY directive for wholesale, multi-step prompts", async () => {
43
- const heavy = "redo the entire architecture and produce a deep-map across all repos, including business rules";
44
- const result = await layer4Gsd(makeCtx({ raw: heavy, tokenBudget: 4000 }));
42
+ it("emits a HEAVY directive when the model classifies depth=heavy (agent-first, not regex)", async () => {
43
+ // Depth now comes from ctx.modelDepthTier (the model's 5th classify word),
44
+ // NOT a regex scan of the raw prompt. A plainly-phrased prompt the model
45
+ // judged heavy still gets the full discuss → research → check-plan flow.
46
+ const result = await layer4Gsd(makeCtx({ raw: "rework how auth works", tokenBudget: 8000, modelDepthTier: "heavy" }));
45
47
  const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
46
48
  expect(layer.delta).toContain("tier=heavy");
47
- expect(result.enriched).toMatch(/MANDATORY/);
49
+ expect(layer.delta).toContain("depth=model");
50
+ expect(result.enriched).toMatch(/HEAVY task/);
51
+ expect(result.enriched).toMatch(/DISCUSS/);
48
52
  expect(result.enriched).toMatch(/AskUserQuestion/);
53
+ expect(result.enriched).toMatch(/CHECK-PLAN/);
49
54
  });
50
- it("emits a QUICK directive for trivial prompts", async () => {
51
- const result = await layer4Gsd(makeCtx({ raw: "fix typo in README" }));
55
+ it("emits a QUICK directive when the model classifies depth=quick", async () => {
56
+ const result = await layer4Gsd(makeCtx({ raw: "fix typo in README", modelDepthTier: "quick" }));
52
57
  const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
53
58
  expect(layer.delta).toContain("tier=quick");
59
+ expect(layer.delta).toContain("depth=model");
60
+ });
61
+ it("defaults to STANDARD tier when the model supplied no depth (no regex fallback)", async () => {
62
+ const result = await layer4Gsd(makeCtx({ raw: "do the thing", modelDepthTier: null }));
63
+ const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
64
+ expect(layer.delta).toContain("tier=standard");
65
+ expect(layer.delta).toContain("depth=default");
54
66
  });
55
67
  it("still records a layer entry even when no phase is detected", async () => {
56
68
  const result = await layer4Gsd(makeCtx({ raw: "hello there", gsdPhase: null }));
@@ -59,21 +71,26 @@ describe("layer4Gsd (gsd-native)", () => {
59
71
  expect(layer.applied).toBe(true);
60
72
  expect(layer.delta).toMatch(/phase=(none|discuss|plan|execute|verify|review)/);
61
73
  });
62
- it("respects tokenBudget when truncating the directive", async () => {
63
- const result = await layer4Gsd(makeCtx({ raw: "implement this", tokenBudget: 30 }));
64
- const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
65
- if (layer?.applied && layer.delta) {
66
- const charsMatch = layer.delta.match(/chars=(\d+)/);
67
- if (charsMatch) {
68
- // Directive budget is 25% of tokenBudget * 4 chars/token = 30 chars at budget=30.
69
- // truncateToBudget returns chars-based budget — accept up to tokenBudget*4.
70
- expect(parseInt(charsMatch[1], 10)).toBeLessThanOrEqual(30 * 4);
71
- }
72
- }
73
- });
74
- it("updates gsdPhase on context when keyword detection fires", async () => {
74
+ it("floors the directive budget so the rubric survives at the default tokenBudget (regression: directive was truncated to ~500 chars)", async () => {
75
+ // The playbook directive is a critical behavioural instruction and must NOT
76
+ // be gutted by the tiny pipeline budget. At the production default
77
+ // tokenBudget=500 the bare 25% fraction was ~500 chars, which cut the HEAVY
78
+ // rubric after step 1. The floor (DIRECTIVE_MIN_TOKENS) guarantees the full
79
+ // ~1.7K-char HEAVY rubric reaches the model intact.
80
+ const result = await layer4Gsd(makeCtx({ raw: "rework auth", tokenBudget: 500, modelDepthTier: "heavy" }));
81
+ expect(result.enriched).toMatch(/DISCUSS/);
82
+ expect(result.enriched).toMatch(/CHECK-PLAN/);
83
+ expect(result.enriched).toMatch(/VERIFY/);
84
+ expect(result.enriched).toMatch(/todo_write/);
85
+ });
86
+ it("does NOT keyword-detect a phase from the raw prompt (agent-first, no regex)", async () => {
87
+ // Phase keyword detection was removed: a regex scan of the prompt would
88
+ // mislabel the directive. Phase is sourced only from ctx.gsdPhase (L1
89
+ // unified) or the EE brain route. With neither, it stays null/undefined.
75
90
  const result = await layer4Gsd(makeCtx({ raw: "review the pull request" }));
76
- expect(["review", "discuss", "execute"]).toContain(result.gsdPhase);
91
+ expect(result.gsdPhase ?? null).toBeNull();
92
+ const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
93
+ expect(layer.delta).toContain("phase=none");
77
94
  });
78
95
  it("routes a question-shaped analyze/debug prompt to the QUESTION directive (no 'state a plan')", async () => {
79
96
  // De-robotizing: a plain question must not get the STANDARD "state a 2-3 line
@@ -126,17 +126,37 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
126
126
  expect(result?.taskType).toBe("debug");
127
127
  expect(result?.outputStyle).toBe("concise");
128
128
  });
129
- it("keeps a tiny output budget for non-reasoning models (24four comma words)", async () => {
129
+ it("keeps a tiny output budget for non-reasoning models (48seven comma words)", async () => {
130
130
  const handle = installMockModel({ fixture: { stream: textOnlyStream("generate,concise") } });
131
131
  cleanup = handle.uninstall;
132
132
  const factory = (() => handle.model);
133
133
  const classify = createLlmClassifier(factory, "Qwen/Qwen3-8B"); // reasoning:false
134
134
  await classify("add a new endpoint");
135
135
  const call = handle.calls[0];
136
- expect(call.maxOutputTokens).toBe(24);
136
+ expect(call.maxOutputTokens).toBe(48);
137
+ });
138
+ it("parses the sixth + seventh words as agent-first scope and reply-language", async () => {
139
+ const eco = installMockModel({
140
+ fixture: { stream: textOnlyStream("analyze,balanced,task,answer,standard,ecosystem,vietnamese") },
141
+ });
142
+ cleanup = eco.uninstall;
143
+ const ecoClassify = createLlmClassifier((() => eco.model), "deepseek-v4-flash");
144
+ const r = await ecoClassify("hệ sinh thái muonroi gồm những gì");
145
+ expect(r?.ecosystemScope).toBe(true);
146
+ expect(r?.replyLanguage).toBe("Vietnamese");
147
+ eco.uninstall();
148
+ // English + local → no nudge signals (ecosystemScope false, replyLanguage null).
149
+ const plain = installMockModel({
150
+ fixture: { stream: textOnlyStream("debug,concise,task,code,standard,local,english") },
151
+ });
152
+ cleanup = plain.uninstall;
153
+ const plainClassify = createLlmClassifier((() => plain.model), "deepseek-v4-flash");
154
+ const p = await plainClassify("fix the crash");
155
+ expect(p?.ecosystemScope).toBe(false);
156
+ expect(p?.replyLanguage).toBeNull();
137
157
  });
138
158
  it("parses the fourth word as the output deliverable (Phase 2b)", async () => {
139
- const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code") } });
159
+ const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code,standard") } });
140
160
  cleanup = handle.uninstall;
141
161
  const factory = (() => handle.model);
142
162
  const classify = createLlmClassifier(factory, "deepseek-v4-flash");
@@ -144,6 +164,23 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
144
164
  expect(result?.taskType).toBe("debug");
145
165
  expect(result?.deliverableKind).toBe("code");
146
166
  });
167
+ it("parses the fifth word as the model-decided work depth (agent-first tier)", async () => {
168
+ const heavy = installMockModel({ fixture: { stream: textOnlyStream("refactor,concise,task,code,heavy") } });
169
+ cleanup = heavy.uninstall;
170
+ const heavyClassify = createLlmClassifier((() => heavy.model), "deepseek-v4-flash");
171
+ expect((await heavyClassify("rework the auth system"))?.depthTier).toBe("heavy");
172
+ heavy.uninstall();
173
+ // Position-independent recovery (taskType still leads; depth appears early).
174
+ const reordered = installMockModel({ fixture: { stream: textOnlyStream("debug,quick,concise,task,code") } });
175
+ cleanup = reordered.uninstall;
176
+ const reorderedClassify = createLlmClassifier((() => reordered.model), "deepseek-v4-flash");
177
+ expect((await reorderedClassify("fix typo"))?.depthTier).toBe("quick");
178
+ reordered.uninstall();
179
+ const noDepth = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code") } });
180
+ cleanup = noDepth.uninstall;
181
+ const noDepthClassify = createLlmClassifier((() => noDepth.model), "deepseek-v4-flash");
182
+ expect((await noDepthClassify("fix the bug"))?.depthTier).toBeNull();
183
+ });
147
184
  it("recovers the deliverable position-independently and defaults to null when absent", async () => {
148
185
  const reportHandle = installMockModel({ fixture: { stream: textOnlyStream("analyze,concise,task,report") } });
149
186
  cleanup = reportHandle.uninstall;
@@ -656,6 +656,15 @@ export async function layer1Intent(ctx, opts = {}) {
656
656
  // routing instead of keyword regex. null → those layers fall back to
657
657
  // their legacy regex predicates for this turn.
658
658
  deliverableKind: llmRes.deliverableKind,
659
+ // Agent-first work depth: the model decides the GSD tier in the same
660
+ // classify call (no extra round-trip). layer4 prefers this over the
661
+ // regex scorer. null → layer4 defaults to "standard".
662
+ modelDepthTier: llmRes.depthTier,
663
+ // Agent-first scope + reply-language (same classify call). Replace the
664
+ // ecosystem/diacritic regexes: layer4 reads these instead of scanning
665
+ // the raw prompt.
666
+ ecosystemScope: llmRes.ecosystemScope,
667
+ replyLanguage: llmRes.replyLanguage,
659
668
  // null lets L6 run its cheap style-rescue if outputStyle is still null;
660
669
  // EE retrieval enrichment happens downstream in layer3 as usual.
661
670
  _brainData: null,
@@ -665,7 +674,7 @@ export async function layer1Intent(ctx, opts = {}) {
665
674
  {
666
675
  name: "intent-detection",
667
676
  applied: true,
668
- delta: `taskType=${llmRes.taskType},kind=${intentKind},deliverable=${llmRes.deliverableKind ?? "none"},conf=${llmRes.confidence.toFixed(2)},domain=${domain ?? "none"},style=${outputStyle ?? "none"},source=llm-first`,
677
+ delta: `taskType=${llmRes.taskType},kind=${intentKind},deliverable=${llmRes.deliverableKind ?? "none"},depth=${llmRes.depthTier ?? "none"},conf=${llmRes.confidence.toFixed(2)},domain=${domain ?? "none"},style=${outputStyle ?? "none"},source=llm-first`,
669
678
  },
670
679
  ],
671
680
  };
@@ -378,6 +378,9 @@ describe("intentKind guard — a tool/command request must never route as chitch
378
378
  confidence: 0.75,
379
379
  intentKind: "task",
380
380
  deliverableKind: null,
381
+ depthTier: null,
382
+ ecosystemScope: null,
383
+ replyLanguage: null,
381
384
  });
382
385
  it("flips chitchat → task when the LLM fallback returns 'general' but the prompt is a command request", async () => {
383
386
  // Reproduces 817e508f57ee: classify abstains, LLM fallback returns
@@ -546,6 +549,9 @@ describe("Pass 2.6 — social pleasantries route to chitchat (drop the tool-sche
546
549
  confidence: 0.8,
547
550
  intentKind: "task",
548
551
  deliverableKind: "code",
552
+ depthTier: "standard",
553
+ ecosystemScope: null,
554
+ replyLanguage: null,
549
555
  }),
550
556
  });
551
557
  expect(result.intentKind).toBe("task");
@@ -565,6 +571,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
565
571
  confidence: 0.9,
566
572
  intentKind: "task",
567
573
  deliverableKind: "answer",
574
+ depthTier: null,
575
+ ecosystemScope: null,
576
+ replyLanguage: null,
568
577
  }),
569
578
  });
570
579
  expect(result.taskType).toBe("general"); // NOT the regex 'create-file' → generate
@@ -581,6 +590,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
581
590
  confidence: 0.9,
582
591
  intentKind: "chitchat",
583
592
  deliverableKind: "answer",
593
+ depthTier: null,
594
+ ecosystemScope: null,
595
+ replyLanguage: null,
584
596
  }),
585
597
  });
586
598
  expect(result.intentKind).toBe("chitchat");
@@ -593,6 +605,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
593
605
  confidence: 0.9,
594
606
  intentKind: "chitchat",
595
607
  deliverableKind: "answer",
608
+ depthTier: null,
609
+ ecosystemScope: null,
610
+ replyLanguage: null,
596
611
  }),
597
612
  });
598
613
  expect(result.intentKind).toBe("task");
@@ -627,6 +642,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
627
642
  confidence: 0.9,
628
643
  intentKind: "task",
629
644
  deliverableKind: null,
645
+ depthTier: null,
646
+ ecosystemScope: null,
647
+ replyLanguage: null,
630
648
  }));
631
649
  const result = await layer1Intent(makeCtx("fix the failing build"), { llmFallback: llm });
632
650
  expect(llm).not.toHaveBeenCalled();
@@ -17,10 +17,7 @@
17
17
  * the agent, into the user's language.
18
18
  */
19
19
  import { routeTask } from "../ee/bridge.js";
20
- import { scoreComplexity } from "../gsd/complexity.js";
21
- import { buildDirective, mentionsEcosystemScope } from "../gsd/directives.js";
22
- import { detectGrayAreas } from "../gsd/gray-areas.js";
23
- import { detectGsdPhase } from "../gsd/types.js";
20
+ import { buildDirective } from "../playbook/directives.js";
24
21
  import { classifyEeError, logEeFailure } from "../utils/ee-logger.js";
25
22
  import { truncateToBudget } from "./budget.js";
26
23
  import { isImplementationIntent, isMetaAnalysisPrompt, isQuestionLike } from "./layer6-output.js";
@@ -37,6 +34,15 @@ function mapRouteToPhase(route) {
37
34
  }
38
35
  }
39
36
  const DIRECTIVE_BUDGET_FRACTION = 0.25;
37
+ // The playbook directive is a CRITICAL behavioural instruction, not enrichment
38
+ // context — it must reach the model INTACT. With the pipeline default
39
+ // tokenBudget=500, a 25% share is only ~125 tokens (~500 chars), which silently
40
+ // truncated the HEAVY rubric after the first step (CHECK-PLAN / IMPLEMENT /
41
+ // VERIFY / the todo_write checklist instruction never reached the model). The
42
+ // full HEAVY directive is ~1.7K chars, so floor the directive's own budget at a
43
+ // value that fits it whole (truncateToBudget multiplies by CHARS_PER_TOKEN=4 →
44
+ // 700 tokens ≈ 2.8K chars). The fraction still wins when tokenBudget is large.
45
+ const DIRECTIVE_MIN_TOKENS = 700;
40
46
  // TODO(WhoAmI-L4): when EE v4.0 Who Am I profile is available:
41
47
  // - work_patterns.delegation_style="autonomous" → bias routeTask toward "direct",
42
48
  // skip qc-flow discussion phase for familiar task types
@@ -74,11 +80,24 @@ export async function layer4Gsd(ctx) {
74
80
  routeSource = "unified";
75
81
  }
76
82
  if (!phase) {
77
- phase = detectGsdPhase(ctx.raw);
78
- routeSource = phase ? "keyword" : "none";
83
+ // Agent-first: phase is a minor hint sourced from the EE brain only. We do
84
+ // NOT keyword-regex it from the raw prompt — regex misclassification here
85
+ // would mislabel the directive (no-regex rule, 2026-06-18). null is fine:
86
+ // the directive reads cleanly without a phase hint.
87
+ routeSource = "none";
79
88
  }
80
- const complexity = scoreComplexity(ctx.raw);
81
- const grayAreas = complexity.tier === "heavy" ? detectGrayAreas(ctx.raw).questions : [];
89
+ // Work depth is decided by the model in layer1's classify call (the 5th
90
+ // word → ctx.modelDepthTier). The regex `scoreComplexity` scorer has been
91
+ // removed from this decision path: depth must reflect what the task actually
92
+ // entails, not which keywords it contains. When the model classifier is
93
+ // unwired/failed (modelDepthTier null — rare, since it IS the chat model),
94
+ // default to the safe middle tier; the injected rubric still lets the agent
95
+ // self-select up or down.
96
+ const tier = ctx.modelDepthTier ?? "standard";
97
+ // Gray areas are no longer pre-computed by regex. The HEAVY rubric instructs
98
+ // the agent to surface its own clarifying questions via AskUserQuestion,
99
+ // grounded in what it actually finds — far more accurate than keyword guesses.
100
+ const grayAreas = [];
82
101
  // Informational prompts (a question / explanation / self-eval) ask for an
83
102
  // ANSWER, not a code change. The implement/verify directive otherwise leaks
84
103
  // into the human-facing reply as a "2-3 line plan" + process narration
@@ -101,18 +120,24 @@ export async function layer4Gsd(ctx) {
101
120
  : isMetaAnalysisPrompt(ctx.raw) ||
102
121
  (ctx.taskType === "general" && ctx.intentKind === "task") ||
103
122
  (isQuestionLike(ctx.raw) && !isImplementationIntent(ctx.raw));
104
- const ecosystem = mentionsEcosystemScope(ctx.raw);
105
- // Heuristic: VN diacritics → user wrote Vietnamese → re-anchor language rule
106
- // inside the directive (storyflow_ui session 22661c8de9f2: base rule
107
- // crowded out by brevity/FIX-FIRST directives).
108
- const replyLanguage = /[à-ỹÀ-Ỹ]/.test(ctx.raw) ? "Vietnamese" : undefined;
109
- const directive = buildDirective({ complexity, phase, grayAreas, informational, ecosystem, replyLanguage });
110
- const budgetChars = Math.floor(ctx.tokenBudget * DIRECTIVE_BUDGET_FRACTION);
111
- const trimmed = truncateToBudget(directive.text, budgetChars);
123
+ // Scope + reply-language are now agent-first (model-decided in layer1's
124
+ // classify call), NOT regex scans of the raw prompt (no-regex rule,
125
+ // 2026-06-18). The ecosystem docs-first nudge fires only when the model judged
126
+ // the turn platform-scoped; the language re-anchor fires for any non-English
127
+ // language the model detected (the old regex only caught Vietnamese).
128
+ const ecosystem = ctx.ecosystemScope === true;
129
+ const replyLanguage = ctx.replyLanguage ?? undefined;
130
+ const directive = buildDirective({ tier, phase, informational, ecosystem, replyLanguage });
131
+ // truncateToBudget takes a TOKEN budget (×CHARS_PER_TOKEN internally). Floor it
132
+ // at DIRECTIVE_MIN_TOKENS so the full directive always survives, even at the
133
+ // default tokenBudget=500 where the bare fraction would gut it.
134
+ const directiveTokenBudget = Math.max(Math.floor(ctx.tokenBudget * DIRECTIVE_BUDGET_FRACTION), DIRECTIVE_MIN_TOKENS);
135
+ const trimmed = truncateToBudget(directive.text, directiveTokenBudget);
136
+ const depthSource = ctx.modelDepthTier ? "model" : "default";
112
137
  return {
113
138
  ...ctx,
114
139
  gsdPhase: phase,
115
- complexityTier: complexity.tier,
140
+ complexityTier: tier,
116
141
  grayAreas,
117
142
  enriched: `${ctx.enriched}\n${trimmed}`,
118
143
  layers: [
@@ -122,10 +147,9 @@ export async function layer4Gsd(ctx) {
122
147
  applied: true,
123
148
  delta: [
124
149
  `tier=${directive.tier}`,
125
- `score=${complexity.score}`,
150
+ `depth=${depthSource}`,
126
151
  `phase=${phase ?? "none"}`,
127
152
  `route=${routeSource}`,
128
- `gray=${grayAreas.length}`,
129
153
  `blocking=${directive.blocking}`,
130
154
  `chars=${trimmed.length}`,
131
155
  ].join(" "),
@@ -12,6 +12,18 @@ import type { OutputStyle, TaskType } from "./types.js";
12
12
  * legacy regex predicates for that turn (graceful, never a wrong forced route).
13
13
  */
14
14
  export type DeliverableKind = "answer" | "code" | "report";
15
+ /**
16
+ * Model-decided WORK DEPTH for the turn — the agent-first replacement for the
17
+ * old regex `scoreComplexity` tier. Decided by the same single classify call so
18
+ * it costs no extra round-trip. Drives the GSD rubric injected by Layer 4:
19
+ * - "quick" — trivial single-shot (typo, one-liner, small lookup/answer). No plan.
20
+ * - "standard" — ordinary feature/bugfix touching a few files. Short plan + verify.
21
+ * - "heavy" — architectural / multi-file / wide / ambiguous. Full
22
+ * discuss → research → plan → check-plan → implement → verify.
23
+ * `null` when the model omits/garbles the word → Layer 4 defaults to "standard"
24
+ * (the safe middle) and the injected rubric lets the agent self-select.
25
+ */
26
+ export type DepthTier = "quick" | "standard" | "heavy";
15
27
  export interface LlmClassifyResult {
16
28
  taskType: TaskType;
17
29
  outputStyle: OutputStyle | null;
@@ -30,6 +42,30 @@ export interface LlmClassifyResult {
30
42
  * model omitted the word — consumers then fall back to their legacy regex.
31
43
  */
32
44
  deliverableKind: DeliverableKind | null;
45
+ /**
46
+ * Model-decided work depth (quick | standard | heavy). null when the model
47
+ * omitted/garbled the word — Layer 4 then defaults to "standard". This is the
48
+ * agent-first replacement for the regex complexity scorer: depth is judged by
49
+ * what the task actually entails, not by which keywords it happens to contain.
50
+ */
51
+ depthTier: DepthTier | null;
52
+ /**
53
+ * Model-decided scope: true when the turn is about the Muonroi PLATFORM /
54
+ * ecosystem (BB/.NET packages, building-block, open-core, rule engine,
55
+ * platform setup) — where the muonroi-docs MCP is the authoritative source —
56
+ * as opposed to muonroi-cli's own internals. Agent-first replacement for the
57
+ * `mentionsEcosystemScope` regex. null when the model omitted the word →
58
+ * Layer 4 treats it as not-ecosystem (no docs nudge).
59
+ */
60
+ ecosystemScope: boolean | null;
61
+ /**
62
+ * The language the user wrote in, as a capitalized display name (e.g.
63
+ * "Vietnamese", "Japanese"), or null when the user wrote in English / the
64
+ * model omitted it. Drives Layer 4's language re-anchor nudge. Agent-first
65
+ * replacement for the Vietnamese-only diacritic regex — generalizes to ANY
66
+ * non-English language.
67
+ */
68
+ replyLanguage: string | null;
33
69
  }
34
70
  export type LlmClassifyFn = (prompt: string, signal?: AbortSignal) => Promise<LlmClassifyResult | null>;
35
71
  /**