npm - muonroi-cli - Version diff - 1.6.5 → 1.7.0 - Mend

muonroi-cli 1.6.5 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

package/dist/src/generated/version.d.ts +1 -1
package/dist/src/generated/version.js +1 -1
package/dist/src/orchestrator/message-processor.js +1 -1
package/dist/src/orchestrator/prompts.js +16 -2
package/dist/src/orchestrator/stream-runner.js +50 -3
package/dist/src/orchestrator/subagent-compactor.d.ts +1 -1
package/dist/src/orchestrator/subagent-compactor.js +1 -1
package/dist/src/pil/__tests__/layer4-gsd.test.js +40 -23
package/dist/src/pil/__tests__/llm-classify.test.js +40 -3
package/dist/src/pil/layer1-intent.js +10 -1
package/dist/src/pil/layer1-intent.test.js +18 -0
package/dist/src/pil/layer4-gsd.js +43 -19
package/dist/src/pil/llm-classify.d.ts +36 -0
package/dist/src/pil/llm-classify.js +84 -18
package/dist/src/pil/types.d.ts +27 -2
package/dist/src/{gsd → playbook}/__tests__/directives.test.js +34 -58
package/dist/src/playbook/complexity.d.ts +17 -0
package/dist/src/playbook/complexity.js +18 -0
package/dist/src/{gsd → playbook}/directives.d.ts +20 -13
package/dist/src/playbook/directives.js +149 -0
package/dist/src/providers/__tests__/reasoning-roundtrip.test.js +70 -1
package/dist/src/providers/strategies/deepseek.strategy.js +5 -22
package/dist/src/providers/strategies/siliconflow.strategy.js +5 -0
package/dist/src/providers/strategies/thinking-mode.d.ts +35 -0
package/dist/src/providers/strategies/thinking-mode.js +73 -0
package/dist/src/tools/registry.js +47 -47
package/dist/src/ui/app.js +91 -24
package/dist/src/ui/hooks/use-session-picker.d.ts +14 -0
package/dist/src/ui/hooks/use-session-picker.js +20 -0
package/dist/src/ui/modals/session-picker-modal.d.ts +14 -0
package/dist/src/ui/modals/session-picker-modal.js +39 -0
package/dist/src/ui/utils/relaunch.d.ts +41 -0
package/dist/src/ui/utils/relaunch.js +71 -0
package/dist/src/ui/utils/relaunch.test.js +83 -0
package/package.json +1 -1
package/dist/src/gsd/__tests__/complexity.test.js +0 -0
package/dist/src/gsd/complexity.d.ts +0 -28
package/dist/src/gsd/complexity.js +0 -103
package/dist/src/gsd/directives.js +0 -154
package/dist/src/{gsd → playbook}/__tests__/directives.test.d.ts +0 -0
package/dist/src/{gsd/__tests__/complexity.test.d.ts → ui/utils/relaunch.test.d.ts} +0 -0

package/dist/src/generated/version.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-export declare const PACKAGE_VERSION = "1.6.5";
+export declare const PACKAGE_VERSION = "1.7.0";
 export declare const PACKAGE_DESCRIPTION = "BYOK AI coding agent with multi-model council debate, role-based routing, and auto-compact.";

package/dist/src/generated/version.js CHANGED Viewed

@@ -1,5 +1,5 @@
 // AUTO-GENERATED by scripts/sync-version.cjs. DO NOT EDIT BY HAND.
 // Sourced from package.json at build time so it survives bun --compile bundling.
-export const PACKAGE_VERSION = "1.6.5";
+export const PACKAGE_VERSION = "1.7.0";
 export const PACKAGE_DESCRIPTION = "BYOK AI coding agent with multi-model council debate, role-based routing, and auto-compact.";
 //# sourceMappingURL=version.js.map

package/dist/src/orchestrator/message-processor.js CHANGED Viewed

@@ -60,7 +60,6 @@ import * as phaseTracker from "../ee/phase-tracker.js";
 import { buildScope as buildScopeForVeto } from "../ee/scope.js";
 import { fireTrajectoryEvent } from "../ee/session-trajectory.js";
 import { getTenantId as getTenantIdForVeto } from "../ee/tenant.js";
-import { mentionsEcosystemScope } from "../gsd/directives.js";
 import { acquireMcpTools } from "../mcp/client-pool.js";
 import { dropRedundantFsMcpTools, filterMcpServersByMessage } from "../mcp/smart-filter.js";
 import { getModelInfo } from "../models/registry.js";
@@ -69,6 +68,7 @@ import { injectCheapModelWorkbook, shouldInjectCheapModelWorkbook } from "../pil
 import { applyPilSuffix, getResponseTaskType, getResponseToolSet, isResponseTool, runPipeline, shouldHaltOnResponseTool, } from "../pil/index.js";
 import { isMetaAnalysisPrompt } from "../pil/layer6-output.js";
 import { taskTypeToMaxTokens, taskTypeToReasoningEffort, taskTypeToTier } from "../pil/task-tier-map.js";
+import { mentionsEcosystemScope } from "../playbook/directives.js";
 import { getProviderCapabilities } from "../providers/capabilities.js";
 import { loadKeyForProvider } from "../providers/keychain.js";
 import { bridgeMcpToolResult, getVisionGuidanceForTextOnly, listCachedImages, scrubImagePayloadsInMessages, } from "../providers/mcp-vision-bridge.js";

package/dist/src/orchestrator/prompts.js CHANGED Viewed

@@ -245,6 +245,12 @@ DEFAULT DELEGATION POLICY:
 - Do not wait for the user to explicitly ask for a sub-agent when delegation would clearly help.
 - Skip delegation only when the task is trivial, single-file, or you already have the exact answer.
+WRITING A GOOD DELEGATION PROMPT (the sub-agent sees ONLY what you put in the prompt field — it does NOT share your context):
+- GOAL: state the one concrete question or outcome the sub must deliver.
+- CONTEXT: include the specific facts the sub needs (file paths, symbol names, constraints, what you already know) so it doesn't re-derive them blindly.
+- RETURN SHAPE: say exactly what to hand back — e.g. "return the findings as file:line + a one-line conclusion", or "return the diff you applied + tests run". The sub's final message is the only thing that re-enters YOUR context (capped ~32K), so a vague ask wastes the turn.
+- When fanning out several sub-agents in parallel, give each a NON-overlapping scope so their syntheses compose instead of duplicating.
 EXAMPLES:
 - "review this change" -> delegate to explore first
 - "research how auth works" -> delegate to explore first
@@ -266,6 +272,8 @@ IMPORTANT:
 - Use write_file only for new files or when most of the file is changing. For very large files (>500 lines), split into multiple edit_file calls or write smaller chunks.
 - Use read_file instead of cat/head/tail for reading files.
 - When the user asks for an automated recurring or one-time run, use the schedule tools instead of only describing the setup.
+- Commit incrementally (in any git repo, without being asked): the moment a cohesive, working chunk is finished and its checks pass, commit it before starting the next chunk. This is part of doing the work, not a separate request — prefer several small, logically-scoped commits over one large catch-all commit at the end. Group related file changes per commit; keep unrelated changes in separate commits.
+- Every commit message you create MUST end with this attribution line, verbatim, on its own final line: "Coding by - Muonroi-CLI".
 - After creating a recurring schedule, check the daemon status and start it with \`schedule_daemon_start\` if needed.
 Be direct. Execute, don't just describe. Show results, not plans.`,
@@ -452,7 +460,10 @@ export function buildSubagentPrompt(request, cwd, custom, sandboxMode, subagents
         ? [
             "Do not create, modify, or delete files.",
             "Prefer `read_file` and search commands over broad shell exploration.",
-            "Return concise findings for the parent agent.",
+            // RETURN CONTRACT — the parent only ingests your FINAL message (capped at
+            // ~32K, head+tail), never your tool output. Make that message a tight
+            // synthesis so the parent's context stays clean.
+            "End with a tight synthesis FOR THE PARENT AGENT: lead with the answer to the delegated task, ground each claim in a concrete file:line, then note any gaps or the recommended next step. Do NOT narrate your search process or restate these instructions — the parent needs the conclusion, not the journey.",
         ]
         : isVerifyDetect
             ? [
@@ -528,7 +539,10 @@ export function buildSubagentPrompt(request, cwd, custom, sandboxMode, subagents
                             : [
                                 "Work only on the delegated task below.",
                                 "Use tools directly instead of narrating your intent.",
-                                "Return a concise summary for the parent agent with key outcomes and any open risks.",
+                                // RETURN CONTRACT — the parent only ingests your FINAL message
+                                // (capped at ~32K, head+tail), never your tool output. Make that
+                                // message a tight synthesis so the parent's context stays clean.
+                                "End with a tight synthesis FOR THE PARENT AGENT: lead with what you did / what you found, cite the concrete file:line you changed or relied on, then list any open risks, follow-ups, or verification still owed. Do NOT narrate your process or restate these instructions — the parent needs the result, not a transcript.",
                             ];
     const instructionLines = custom?.instruction.trim() ? ["", "SUB-AGENT INSTRUCTIONS:", custom.instruction.trim()] : [];
     return [

package/dist/src/orchestrator/stream-runner.js CHANGED Viewed

@@ -345,7 +345,13 @@ export class StreamRunner {
         // sub-agent has no PIL ctx of its own; the caller already bounded the
         // work via maxSteps. We compose alongside that hard step cap so a
         // wandering sub-agent loop trips whichever fires first (logical OR).
-        const _subCeiling = resolveCeiling("general", "medium");
+        // Explore sub-agents are READ-ONLY research — a codebase investigation
+        // legitimately needs more grep/read steps than the tight general/medium=10
+        // cell allows. Cutting it early (esp. for reasoning models that front-load
+        // tool calls) leaves no budget to reach the synthesis turn (root cause of
+        // empty "Task completed. Last action: grep" returns). Give explore the
+        // analyze/large cell (15); edit-capable agents keep general/medium (10).
+        const _subCeiling = isExplore ? resolveCeiling("analyze", "large") : resolveCeiling("general", "medium");
         const _subCounterKey = `subagent:${subCallId}`;
         const _subStopWhen = (async (state) => {
             if (state.steps.length >= maxSteps)
@@ -603,8 +609,49 @@ export class StreamRunner {
         finally {
             stall.dispose();
         }
-        const output = assistantText.trim() || `Task completed. Last action: ${lastActivity}`;
-        return { output, lastActivity, cancelled: false, assistantText };
+        // Forced final synthesis. When the loop ends on a tool-call / step-ceiling
+        // cut (finishReason="tool-calls"), the model never got a turn to write its
+        // findings, so assistantText is empty and the parent would receive the
+        // useless "Task completed. Last action: <tool>" fallback — then redo the
+        // work itself, defeating the whole point of delegation (live root cause:
+        // grok-build-0.1 emitted 277 reasoning-deltas + 19 tool-calls but ZERO
+        // text-deltas across the 10-step ceiling). Give the sub exactly ONE
+        // tool-free turn to synthesize what it already gathered. Tools are removed
+        // so finishReason cannot be "tool-calls" again — the model MUST emit text.
+        let synthesizedText = "";
+        if (!assistantText.trim() && !signal?.aborted && !stallTriggered) {
+            try {
+                const resp = await result.response;
+                const priorMessages = (resp.messages ?? []);
+                const synthResult = streamText({
+                    model: childRuntime.model,
+                    system: childSystem,
+                    messages: [
+                        ...childMessages,
+                        ...priorMessages,
+                        {
+                            role: "user",
+                            content: "You've reached your investigation budget and have not written your findings yet. Stop now — do NOT call any more tools. Write your final synthesis FOR THE PARENT AGENT: lead with the answer to the delegated task, cite the concrete file:line behind each claim, then note any gaps or the recommended next step. Be concise; the parent only ingests this message.",
+                        },
+                    ],
+                    tools: {},
+                    maxRetries: 0,
+                    abortSignal: signal,
+                    ...(childProviderOptions ? { providerOptions: childProviderOptions } : {}),
+                });
+                for await (const part of synthResult.fullStream) {
+                    if (part.type === "text-delta")
+                        synthesizedText += part.text ?? "";
+                }
+                debugLog(`forced-synthesis: textLen=${synthesizedText.length}`);
+            }
+            catch (err) {
+                debugLog(`forced-synthesis failed: ${err?.message}`);
+            }
+        }
+        const recovered = assistantText.trim() || synthesizedText.trim();
+        const output = recovered || `Task completed. Last action: ${lastActivity}`;
+        return { output, lastActivity, cancelled: false, assistantText: recovered };
     }
     /**
      * Phase 3 — canonical entrypoint. Orchestrates setup + runStream with

package/dist/src/orchestrator/subagent-compactor.d.ts CHANGED Viewed

@@ -114,7 +114,7 @@ export declare const IMPORTANT_TOOL_NAMES: readonly ["read_file", "grep", "lsp",
 /**
  * MCP tool prefixes whose results are an AUTHORITATIVE source the agent is
  * explicitly steered to fetch FIRST and ground on (the ECOSYSTEM_DOCS_NUDGE in
- * src/gsd/directives.ts). Eliding them defeats the nudge — the agent calls the
+ * src/playbook/directives.ts). Eliding them defeats the nudge — the agent calls the
  * ecosystem docs, then compaction discards them and it goes blind on the very
  * source it was told to trust (session 584ba476c07a: mcp_muonroi-docs__setup_guide
  * + bb_recipe_list elided, ee_unavailable, 0 rehydrated → "partially blind").

package/dist/src/orchestrator/subagent-compactor.js CHANGED Viewed

@@ -76,7 +76,7 @@ export const IMPORTANT_TOOL_NAMES = [
 /**
  * MCP tool prefixes whose results are an AUTHORITATIVE source the agent is
  * explicitly steered to fetch FIRST and ground on (the ECOSYSTEM_DOCS_NUDGE in
- * src/gsd/directives.ts). Eliding them defeats the nudge — the agent calls the
+ * src/playbook/directives.ts). Eliding them defeats the nudge — the agent calls the
  * ecosystem docs, then compaction discards them and it goes blind on the very
  * source it was told to trust (session 584ba476c07a: mcp_muonroi-docs__setup_guide
  * + bb_recipe_list elided, ee_unavailable, 0 rehydrated → "partially blind").

package/dist/src/pil/__tests__/layer4-gsd.test.js CHANGED Viewed

@@ -17,7 +17,7 @@ function makeCtx(overrides = {}) {
         ...overrides,
     };
 }
-describe("layer4Gsd (gsd-native)", () => {
+describe("layer4Gsd (playbook)", () => {
     it("skips directive injection when intentKind === 'chitchat'", async () => {
         const before = "hello";
         const result = await layer4Gsd(makeCtx({ raw: before, enriched: before, intentKind: "chitchat" }));
@@ -26,9 +26,9 @@ describe("layer4Gsd (gsd-native)", () => {
         expect(layer.applied).toBe(false);
         expect(layer.delta).toBe("skip:chitchat");
     });
-    it("appends a gsd-native directive and records the layer as applied", async () => {
+    it("appends a playbook directive and records the layer as applied", async () => {
         const result = await layer4Gsd(makeCtx({ raw: "implement the login feature" }));
-        expect(result.enriched).toContain("[gsd-native]");
+        expect(result.enriched).toContain("[playbook]");
         const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
         expect(layer).toBeDefined();
         expect(layer.applied).toBe(true);
@@ -39,18 +39,30 @@ describe("layer4Gsd (gsd-native)", () => {
         const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
         expect(layer.delta).toContain("phase=plan");
     });
-    it("emits a HEAVY directive for wholesale, multi-step prompts", async () => {
-        const heavy = "redo the entire architecture and produce a deep-map across all repos, including business rules";
-        const result = await layer4Gsd(makeCtx({ raw: heavy, tokenBudget: 4000 }));
+    it("emits a HEAVY directive when the model classifies depth=heavy (agent-first, not regex)", async () => {
+        // Depth now comes from ctx.modelDepthTier (the model's 5th classify word),
+        // NOT a regex scan of the raw prompt. A plainly-phrased prompt the model
+        // judged heavy still gets the full discuss → research → check-plan flow.
+        const result = await layer4Gsd(makeCtx({ raw: "rework how auth works", tokenBudget: 8000, modelDepthTier: "heavy" }));
         const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
         expect(layer.delta).toContain("tier=heavy");
-        expect(result.enriched).toMatch(/MANDATORY/);
+        expect(layer.delta).toContain("depth=model");
+        expect(result.enriched).toMatch(/HEAVY task/);
+        expect(result.enriched).toMatch(/DISCUSS/);
         expect(result.enriched).toMatch(/AskUserQuestion/);
+        expect(result.enriched).toMatch(/CHECK-PLAN/);
     });
-    it("emits a QUICK directive for trivial prompts", async () => {
-        const result = await layer4Gsd(makeCtx({ raw: "fix typo in README" }));
+    it("emits a QUICK directive when the model classifies depth=quick", async () => {
+        const result = await layer4Gsd(makeCtx({ raw: "fix typo in README", modelDepthTier: "quick" }));
         const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
         expect(layer.delta).toContain("tier=quick");
+        expect(layer.delta).toContain("depth=model");
+    });
+    it("defaults to STANDARD tier when the model supplied no depth (no regex fallback)", async () => {
+        const result = await layer4Gsd(makeCtx({ raw: "do the thing", modelDepthTier: null }));
+        const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
+        expect(layer.delta).toContain("tier=standard");
+        expect(layer.delta).toContain("depth=default");
     });
     it("still records a layer entry even when no phase is detected", async () => {
         const result = await layer4Gsd(makeCtx({ raw: "hello there", gsdPhase: null }));
@@ -59,21 +71,26 @@ describe("layer4Gsd (gsd-native)", () => {
         expect(layer.applied).toBe(true);
         expect(layer.delta).toMatch(/phase=(none|discuss|plan|execute|verify|review)/);
     });
-    it("respects tokenBudget when truncating the directive", async () => {
-        const result = await layer4Gsd(makeCtx({ raw: "implement this", tokenBudget: 30 }));
-        const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
-        if (layer?.applied && layer.delta) {
-            const charsMatch = layer.delta.match(/chars=(\d+)/);
-            if (charsMatch) {
-                // Directive budget is 25% of tokenBudget * 4 chars/token = 30 chars at budget=30.
-                // truncateToBudget returns chars-based budget — accept up to tokenBudget*4.
-                expect(parseInt(charsMatch[1], 10)).toBeLessThanOrEqual(30 * 4);
-            }
-        }
-    });
-    it("updates gsdPhase on context when keyword detection fires", async () => {
+    it("floors the directive budget so the rubric survives at the default tokenBudget (regression: directive was truncated to ~500 chars)", async () => {
+        // The playbook directive is a critical behavioural instruction and must NOT
+        // be gutted by the tiny pipeline budget. At the production default
+        // tokenBudget=500 the bare 25% fraction was ~500 chars, which cut the HEAVY
+        // rubric after step 1. The floor (DIRECTIVE_MIN_TOKENS) guarantees the full
+        // ~1.7K-char HEAVY rubric reaches the model intact.
+        const result = await layer4Gsd(makeCtx({ raw: "rework auth", tokenBudget: 500, modelDepthTier: "heavy" }));
+        expect(result.enriched).toMatch(/DISCUSS/);
+        expect(result.enriched).toMatch(/CHECK-PLAN/);
+        expect(result.enriched).toMatch(/VERIFY/);
+        expect(result.enriched).toMatch(/todo_write/);
+    });
+    it("does NOT keyword-detect a phase from the raw prompt (agent-first, no regex)", async () => {
+        // Phase keyword detection was removed: a regex scan of the prompt would
+        // mislabel the directive. Phase is sourced only from ctx.gsdPhase (L1
+        // unified) or the EE brain route. With neither, it stays null/undefined.
         const result = await layer4Gsd(makeCtx({ raw: "review the pull request" }));
-        expect(["review", "discuss", "execute"]).toContain(result.gsdPhase);
+        expect(result.gsdPhase ?? null).toBeNull();
+        const layer = result.layers.find((l) => l.name === "gsd-workflow-structuring");
+        expect(layer.delta).toContain("phase=none");
     });
     it("routes a question-shaped analyze/debug prompt to the QUESTION directive (no 'state a plan')", async () => {
         // De-robotizing: a plain question must not get the STANDARD "state a 2-3 line

package/dist/src/pil/__tests__/llm-classify.test.js CHANGED Viewed

@@ -126,17 +126,37 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
         expect(result?.taskType).toBe("debug");
         expect(result?.outputStyle).toBe("concise");
     });
-    it("keeps a tiny output budget for non-reasoning models (24 — four comma words)", async () => {
+    it("keeps a tiny output budget for non-reasoning models (48 — seven comma words)", async () => {
         const handle = installMockModel({ fixture: { stream: textOnlyStream("generate,concise") } });
         cleanup = handle.uninstall;
         const factory = (() => handle.model);
         const classify = createLlmClassifier(factory, "Qwen/Qwen3-8B"); // reasoning:false
         await classify("add a new endpoint");
         const call = handle.calls[0];
-        expect(call.maxOutputTokens).toBe(24);
+        expect(call.maxOutputTokens).toBe(48);
+    });
+    it("parses the sixth + seventh words as agent-first scope and reply-language", async () => {
+        const eco = installMockModel({
+            fixture: { stream: textOnlyStream("analyze,balanced,task,answer,standard,ecosystem,vietnamese") },
+        });
+        cleanup = eco.uninstall;
+        const ecoClassify = createLlmClassifier((() => eco.model), "deepseek-v4-flash");
+        const r = await ecoClassify("hệ sinh thái muonroi gồm những gì");
+        expect(r?.ecosystemScope).toBe(true);
+        expect(r?.replyLanguage).toBe("Vietnamese");
+        eco.uninstall();
+        // English + local → no nudge signals (ecosystemScope false, replyLanguage null).
+        const plain = installMockModel({
+            fixture: { stream: textOnlyStream("debug,concise,task,code,standard,local,english") },
+        });
+        cleanup = plain.uninstall;
+        const plainClassify = createLlmClassifier((() => plain.model), "deepseek-v4-flash");
+        const p = await plainClassify("fix the crash");
+        expect(p?.ecosystemScope).toBe(false);
+        expect(p?.replyLanguage).toBeNull();
     });
     it("parses the fourth word as the output deliverable (Phase 2b)", async () => {
-        const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code") } });
+        const handle = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code,standard") } });
         cleanup = handle.uninstall;
         const factory = (() => handle.model);
         const classify = createLlmClassifier(factory, "deepseek-v4-flash");
@@ -144,6 +164,23 @@ describe("createLlmClassifier (PIL Layer 1 Pass 4)", () => {
         expect(result?.taskType).toBe("debug");
         expect(result?.deliverableKind).toBe("code");
     });
+    it("parses the fifth word as the model-decided work depth (agent-first tier)", async () => {
+        const heavy = installMockModel({ fixture: { stream: textOnlyStream("refactor,concise,task,code,heavy") } });
+        cleanup = heavy.uninstall;
+        const heavyClassify = createLlmClassifier((() => heavy.model), "deepseek-v4-flash");
+        expect((await heavyClassify("rework the auth system"))?.depthTier).toBe("heavy");
+        heavy.uninstall();
+        // Position-independent recovery (taskType still leads; depth appears early).
+        const reordered = installMockModel({ fixture: { stream: textOnlyStream("debug,quick,concise,task,code") } });
+        cleanup = reordered.uninstall;
+        const reorderedClassify = createLlmClassifier((() => reordered.model), "deepseek-v4-flash");
+        expect((await reorderedClassify("fix typo"))?.depthTier).toBe("quick");
+        reordered.uninstall();
+        const noDepth = installMockModel({ fixture: { stream: textOnlyStream("debug,concise,task,code") } });
+        cleanup = noDepth.uninstall;
+        const noDepthClassify = createLlmClassifier((() => noDepth.model), "deepseek-v4-flash");
+        expect((await noDepthClassify("fix the bug"))?.depthTier).toBeNull();
+    });
     it("recovers the deliverable position-independently and defaults to null when absent", async () => {
         const reportHandle = installMockModel({ fixture: { stream: textOnlyStream("analyze,concise,task,report") } });
         cleanup = reportHandle.uninstall;

package/dist/src/pil/layer1-intent.js CHANGED Viewed

@@ -656,6 +656,15 @@ export async function layer1Intent(ctx, opts = {}) {
                     // routing instead of keyword regex. null → those layers fall back to
                     // their legacy regex predicates for this turn.
                     deliverableKind: llmRes.deliverableKind,
+                    // Agent-first work depth: the model decides the GSD tier in the same
+                    // classify call (no extra round-trip). layer4 prefers this over the
+                    // regex scorer. null → layer4 defaults to "standard".
+                    modelDepthTier: llmRes.depthTier,
+                    // Agent-first scope + reply-language (same classify call). Replace the
+                    // ecosystem/diacritic regexes: layer4 reads these instead of scanning
+                    // the raw prompt.
+                    ecosystemScope: llmRes.ecosystemScope,
+                    replyLanguage: llmRes.replyLanguage,
                     // null lets L6 run its cheap style-rescue if outputStyle is still null;
                     // EE retrieval enrichment happens downstream in layer3 as usual.
                     _brainData: null,
@@ -665,7 +674,7 @@ export async function layer1Intent(ctx, opts = {}) {
                         {
                             name: "intent-detection",
                             applied: true,
-                            delta: `taskType=${llmRes.taskType},kind=${intentKind},deliverable=${llmRes.deliverableKind ?? "none"},conf=${llmRes.confidence.toFixed(2)},domain=${domain ?? "none"},style=${outputStyle ?? "none"},source=llm-first`,
+                            delta: `taskType=${llmRes.taskType},kind=${intentKind},deliverable=${llmRes.deliverableKind ?? "none"},depth=${llmRes.depthTier ?? "none"},conf=${llmRes.confidence.toFixed(2)},domain=${domain ?? "none"},style=${outputStyle ?? "none"},source=llm-first`,
                         },
                     ],
                 };

package/dist/src/pil/layer1-intent.test.js CHANGED Viewed

@@ -378,6 +378,9 @@ describe("intentKind guard — a tool/command request must never route as chitch
         confidence: 0.75,
         intentKind: "task",
         deliverableKind: null,
+        depthTier: null,
+        ecosystemScope: null,
+        replyLanguage: null,
     });
     it("flips chitchat → task when the LLM fallback returns 'general' but the prompt is a command request", async () => {
         // Reproduces 817e508f57ee: classify abstains, LLM fallback returns
@@ -546,6 +549,9 @@ describe("Pass 2.6 — social pleasantries route to chitchat (drop the tool-sche
                 confidence: 0.8,
                 intentKind: "task",
                 deliverableKind: "code",
+                depthTier: "standard",
+                ecosystemScope: null,
+                replyLanguage: null,
             }),
         });
         expect(result.intentKind).toBe("task");
@@ -565,6 +571,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
                 confidence: 0.9,
                 intentKind: "task",
                 deliverableKind: "answer",
+                depthTier: null,
+                ecosystemScope: null,
+                replyLanguage: null,
             }),
         });
         expect(result.taskType).toBe("general"); // NOT the regex 'create-file' → generate
@@ -581,6 +590,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
                 confidence: 0.9,
                 intentKind: "chitchat",
                 deliverableKind: "answer",
+                depthTier: null,
+                ecosystemScope: null,
+                replyLanguage: null,
             }),
         });
         expect(result.intentKind).toBe("chitchat");
@@ -593,6 +605,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
                 confidence: 0.9,
                 intentKind: "chitchat",
                 deliverableKind: "answer",
+                depthTier: null,
+                ecosystemScope: null,
+                replyLanguage: null,
             }),
         });
         expect(result.intentKind).toBe("task");
@@ -627,6 +642,9 @@ describe("layer1Intent — model-first gate (MUONROI_LLM_FIRST_CLASSIFY)", () =>
             confidence: 0.9,
             intentKind: "task",
             deliverableKind: null,
+            depthTier: null,
+            ecosystemScope: null,
+            replyLanguage: null,
         }));
         const result = await layer1Intent(makeCtx("fix the failing build"), { llmFallback: llm });
         expect(llm).not.toHaveBeenCalled();

package/dist/src/pil/layer4-gsd.js CHANGED Viewed

@@ -17,10 +17,7 @@
  * the agent, into the user's language.
  */
 import { routeTask } from "../ee/bridge.js";
-import { scoreComplexity } from "../gsd/complexity.js";
-import { buildDirective, mentionsEcosystemScope } from "../gsd/directives.js";
-import { detectGrayAreas } from "../gsd/gray-areas.js";
-import { detectGsdPhase } from "../gsd/types.js";
+import { buildDirective } from "../playbook/directives.js";
 import { classifyEeError, logEeFailure } from "../utils/ee-logger.js";
 import { truncateToBudget } from "./budget.js";
 import { isImplementationIntent, isMetaAnalysisPrompt, isQuestionLike } from "./layer6-output.js";
@@ -37,6 +34,15 @@ function mapRouteToPhase(route) {
     }
 }
 const DIRECTIVE_BUDGET_FRACTION = 0.25;
+// The playbook directive is a CRITICAL behavioural instruction, not enrichment
+// context — it must reach the model INTACT. With the pipeline default
+// tokenBudget=500, a 25% share is only ~125 tokens (~500 chars), which silently
+// truncated the HEAVY rubric after the first step (CHECK-PLAN / IMPLEMENT /
+// VERIFY / the todo_write checklist instruction never reached the model). The
+// full HEAVY directive is ~1.7K chars, so floor the directive's own budget at a
+// value that fits it whole (truncateToBudget multiplies by CHARS_PER_TOKEN=4 →
+// 700 tokens ≈ 2.8K chars). The fraction still wins when tokenBudget is large.
+const DIRECTIVE_MIN_TOKENS = 700;
 // TODO(WhoAmI-L4): when EE v4.0 Who Am I profile is available:
 //   - work_patterns.delegation_style="autonomous" → bias routeTask toward "direct",
 //     skip qc-flow discussion phase for familiar task types
@@ -74,11 +80,24 @@ export async function layer4Gsd(ctx) {
         routeSource = "unified";
     }
     if (!phase) {
-        phase = detectGsdPhase(ctx.raw);
-        routeSource = phase ? "keyword" : "none";
+        // Agent-first: phase is a minor hint sourced from the EE brain only. We do
+        // NOT keyword-regex it from the raw prompt — regex misclassification here
+        // would mislabel the directive (no-regex rule, 2026-06-18). null is fine:
+        // the directive reads cleanly without a phase hint.
+        routeSource = "none";
     }
-    const complexity = scoreComplexity(ctx.raw);
-    const grayAreas = complexity.tier === "heavy" ? detectGrayAreas(ctx.raw).questions : [];
+    // Work depth is decided by the model in layer1's classify call (the 5th
+    // word → ctx.modelDepthTier). The regex `scoreComplexity` scorer has been
+    // removed from this decision path: depth must reflect what the task actually
+    // entails, not which keywords it contains. When the model classifier is
+    // unwired/failed (modelDepthTier null — rare, since the classifier IS the chat model),
+    // default to the safe middle tier; the injected rubric still lets the agent
+    // self-select up or down.
+    const tier = ctx.modelDepthTier ?? "standard";
+    // Gray areas are no longer pre-computed by regex. The HEAVY rubric instructs
+    // the agent to surface its own clarifying questions via AskUserQuestion,
+    // grounded in what it actually finds — far more accurate than keyword guesses.
+    const grayAreas = [];
     // Informational prompts (a question / explanation / self-eval) ask for an
     // ANSWER, not a code change. The implement/verify directive otherwise leaks
     // into the human-facing reply as a "2-3 line plan" + process narration
@@ -101,18 +120,24 @@ export async function layer4Gsd(ctx) {
         : isMetaAnalysisPrompt(ctx.raw) ||
             (ctx.taskType === "general" && ctx.intentKind === "task") ||
             (isQuestionLike(ctx.raw) && !isImplementationIntent(ctx.raw));
-    const ecosystem = mentionsEcosystemScope(ctx.raw);
-    // Heuristic: VN diacritics → user wrote Vietnamese → re-anchor language rule
-    // inside the directive (storyflow_ui session 22661c8de9f2 — base rule
-    // crowded out by brevity/FIX-FIRST directives).
-    const replyLanguage = /[à-ỹÀ-Ỹ]/.test(ctx.raw) ? "Vietnamese" : undefined;
-    const directive = buildDirective({ complexity, phase, grayAreas, informational, ecosystem, replyLanguage });
-    const budgetChars = Math.floor(ctx.tokenBudget * DIRECTIVE_BUDGET_FRACTION);
-    const trimmed = truncateToBudget(directive.text, budgetChars);
+    // Scope + reply-language are now agent-first (model-decided in layer1's
+    // classify call), NOT regex scans of the raw prompt (no-regex rule,
+    // 2026-06-18). The ecosystem docs-first nudge fires only when the model judged
+    // the turn platform-scoped; the language re-anchor fires for any non-English
+    // language the model detected (the old regex only caught Vietnamese).
+    const ecosystem = ctx.ecosystemScope === true;
+    const replyLanguage = ctx.replyLanguage ?? undefined;
+    const directive = buildDirective({ tier, phase, informational, ecosystem, replyLanguage });
+    // truncateToBudget takes a TOKEN budget (×CHARS_PER_TOKEN internally). Floor it
+    // at DIRECTIVE_MIN_TOKENS so the full directive always survives, even at the
+    // default tokenBudget=500 where the bare fraction would gut it.
+    const directiveTokenBudget = Math.max(Math.floor(ctx.tokenBudget * DIRECTIVE_BUDGET_FRACTION), DIRECTIVE_MIN_TOKENS);
+    const trimmed = truncateToBudget(directive.text, directiveTokenBudget);
+    const depthSource = ctx.modelDepthTier ? "model" : "default";
     return {
         ...ctx,
         gsdPhase: phase,
-        complexityTier: complexity.tier,
+        complexityTier: tier,
         grayAreas,
         enriched: `${ctx.enriched}\n${trimmed}`,
         layers: [
@@ -122,10 +147,9 @@ export async function layer4Gsd(ctx) {
                 applied: true,
                 delta: [
                     `tier=${directive.tier}`,
-                    `score=${complexity.score}`,
+                    `depth=${depthSource}`,
                     `phase=${phase ?? "none"}`,
                     `route=${routeSource}`,
-                    `gray=${grayAreas.length}`,
                     `blocking=${directive.blocking}`,
                     `chars=${trimmed.length}`,
                 ].join(" "),

package/dist/src/pil/llm-classify.d.ts CHANGED Viewed

@@ -12,6 +12,18 @@ import type { OutputStyle, TaskType } from "./types.js";
  * legacy regex predicates for that turn (graceful, never a wrong forced route).
  */
 export type DeliverableKind = "answer" | "code" | "report";
+/**
+ * Model-decided WORK DEPTH for the turn — the agent-first replacement for the
+ * old regex `scoreComplexity` tier. Decided by the same single classify call so
+ * it costs no extra round-trip. Drives the GSD rubric injected by Layer 4:
+ *   - "quick"    — trivial single-shot (typo, one-liner, small lookup/answer). No plan.
+ *   - "standard" — ordinary feature/bugfix touching a few files. Short plan + verify.
+ *   - "heavy"    — architectural / multi-file / wide / ambiguous. Full
+ *                  discuss → research → plan → check-plan → implement → verify.
+ * Carried as `DepthTier | null` on the classify result: `null` when the model omits/garbles
+ * the word → Layer 4 defaults to "standard" (the safe middle) and the injected rubric lets the agent self-select.
+ */
+export type DepthTier = "quick" | "standard" | "heavy";
 export interface LlmClassifyResult {
     taskType: TaskType;
     outputStyle: OutputStyle | null;
@@ -30,6 +42,30 @@ export interface LlmClassifyResult {
      * model omitted the word — consumers then fall back to their legacy regex.
      */
     deliverableKind: DeliverableKind | null;
+    /**
+     * Model-decided work depth (quick | standard | heavy). null when the model
+     * omitted/garbled the word — Layer 4 then defaults to "standard". This is the
+     * agent-first replacement for the regex complexity scorer: depth is judged by
+     * what the task actually entails, not by which keywords it happens to contain.
+     */
+    depthTier: DepthTier | null;
+    /**
+     * Model-decided scope: true when the turn is about the Muonroi PLATFORM /
+     * ecosystem (BB/.NET packages, building-block, open-core, rule engine,
+     * platform setup) — where the muonroi-docs MCP is the authoritative source —
+     * as opposed to muonroi-cli's own internals. Agent-first replacement for the
+     * `mentionsEcosystemScope` regex. null when the model omitted the word →
+     * Layer 4 treats it as not-ecosystem (no docs nudge).
+     */
+    ecosystemScope: boolean | null;
+    /**
+     * The language the user wrote in, as a capitalized display name (e.g.
+     * "Vietnamese", "Japanese"), or null when the user wrote in English / the
+     * model omitted it. Drives Layer 4's language re-anchor nudge. Agent-first
+     * replacement for the Vietnamese-only diacritic regex — generalizes to ANY
+     * non-English language.
+     */
+    replyLanguage: string | null;
 }
 export type LlmClassifyFn = (prompt: string, signal?: AbortSignal) => Promise<LlmClassifyResult | null>;
 /**