muonroi-cli 1.6.6 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/dist/src/generated/version.d.ts +1 -1
  2. package/dist/src/generated/version.js +1 -1
  3. package/dist/src/orchestrator/message-processor.js +1 -1
  4. package/dist/src/orchestrator/prompts.js +16 -2
  5. package/dist/src/orchestrator/stream-runner.js +50 -3
  6. package/dist/src/orchestrator/subagent-compactor.d.ts +1 -1
  7. package/dist/src/orchestrator/subagent-compactor.js +1 -1
  8. package/dist/src/pil/__tests__/layer4-gsd.test.js +40 -23
  9. package/dist/src/pil/__tests__/llm-classify.test.js +40 -3
  10. package/dist/src/pil/layer1-intent.js +10 -1
  11. package/dist/src/pil/layer1-intent.test.js +18 -0
  12. package/dist/src/pil/layer4-gsd.js +43 -19
  13. package/dist/src/pil/llm-classify.d.ts +36 -0
  14. package/dist/src/pil/llm-classify.js +84 -18
  15. package/dist/src/pil/types.d.ts +27 -2
  16. package/dist/src/{gsd → playbook}/__tests__/directives.test.js +34 -58
  17. package/dist/src/playbook/complexity.d.ts +17 -0
  18. package/dist/src/playbook/complexity.js +18 -0
  19. package/dist/src/{gsd → playbook}/directives.d.ts +20 -13
  20. package/dist/src/playbook/directives.js +149 -0
  21. package/dist/src/providers/__tests__/reasoning-roundtrip.test.js +70 -1
  22. package/dist/src/providers/strategies/deepseek.strategy.js +5 -22
  23. package/dist/src/providers/strategies/siliconflow.strategy.js +5 -0
  24. package/dist/src/providers/strategies/thinking-mode.d.ts +35 -0
  25. package/dist/src/providers/strategies/thinking-mode.js +73 -0
  26. package/dist/src/tools/registry.js +47 -47
  27. package/package.json +1 -1
  28. package/dist/src/gsd/__tests__/complexity.test.d.ts +0 -1
  29. package/dist/src/gsd/__tests__/complexity.test.js +0 -0
  30. package/dist/src/gsd/complexity.d.ts +0 -28
  31. package/dist/src/gsd/complexity.js +0 -103
  32. package/dist/src/gsd/directives.js +0 -154
  33. package/dist/src/{gsd → playbook}/__tests__/directives.test.d.ts +0 -0
@@ -25,10 +25,10 @@ const LLM_CLASSIFY_TIMEOUT_MS = 2500;
25
25
  // The ceiling is a cap, not padding: the model still stops after two words, so a
26
26
  // generous headroom costs nothing when reasoning is short.
27
27
  const REASONING_CLASSIFY_TIMEOUT_MS = 8000;
28
- // Four comma-separated words now (added <deliverable>) — ~10-14 tokens worst
29
- // case ("documentation,balanced,task,report"). 24 keeps headroom over the
30
- // prior 16-token cap without padding (the model still stops after four words).
31
- const NONREASONING_MAX_OUTPUT_TOKENS = 24;
28
+ // Seven comma-separated words now (added <depth>,<scope>,<lang>) — ~18-26 tokens worst
29
+ // case ("documentation,balanced,task,report,standard,ecosystem,vietnamese").
30
+ // 48 keeps headroom without padding (the model still stops after seven words).
31
+ const NONREASONING_MAX_OUTPUT_TOKENS = 48;
32
32
  const REASONING_MAX_OUTPUT_TOKENS = 2048;
33
33
  /**
34
34
  * Per-namespace shallow merge of providerOptions. The base already carries
@@ -58,7 +58,34 @@ const VALID_TASK_TYPES = new Set([
58
58
  "general",
59
59
  ]);
60
60
  const VALID_STYLES = new Set(["concise", "balanced", "detailed"]);
61
- const SYSTEM_PROMPT = "You classify user prompts for a coding assistant. Reply with ONE line of FOUR lowercase words separated by commas: <taskType>,<style>,<intent>,<deliverable>\n\n" +
61
+ const VALID_DEPTHS = new Set(["quick", "standard", "heavy"]);
62
+ // Every token the classifier can legitimately emit for the first six fields.
63
+ // Used to isolate the 7th field (language), which is open-vocabulary: the lang
64
+ // word is the one alphabetic token that is NOT a known enum value.
65
+ const KNOWN_CLASSIFY_WORDS = new Set([
66
+ "refactor",
67
+ "debug",
68
+ "plan",
69
+ "analyze",
70
+ "documentation",
71
+ "generate",
72
+ "general",
73
+ "concise",
74
+ "balanced",
75
+ "detailed",
76
+ "task",
77
+ "chat",
78
+ "chitchat",
79
+ "answer",
80
+ "code",
81
+ "report",
82
+ "quick",
83
+ "standard",
84
+ "heavy",
85
+ "ecosystem",
86
+ "local",
87
+ ]);
88
+ const SYSTEM_PROMPT = "You classify user prompts for a coding assistant. Reply with ONE line of SEVEN lowercase words separated by commas: <taskType>,<style>,<intent>,<deliverable>,<depth>,<scope>,<lang>\n\n" +
62
89
  "taskType ∈ { refactor | debug | plan | analyze | documentation | generate | general }\n" +
63
90
  "style ∈ { concise | balanced | detailed }\n" +
64
91
  "intent ∈ { task | chat } — 'chat' ONLY for a pure greeting, thanks, or acknowledgement with NO work request (e.g. 'hi', 'cảm ơn nhé', 'ok great'). EVERYTHING else is 'task', including questions about code or the CLI, 'are you done?', and requests to call a tool. When unsure, choose 'task'.\n" +
@@ -66,7 +93,17 @@ const SYSTEM_PROMPT = "You classify user prompts for a coding assistant. Reply w
66
93
  "- code — CREATE or EDIT files: implement, fix, build, scaffold, refactor, wire, rename, apply a patch. The deliverable is changed code.\n" +
67
94
  "- report — a STRUCTURED list / plan / audit / roadmap / checklist is the deliverable (its value IS the structure).\n" +
68
95
  "- answer — everything else: explain, review, investigate, compare, a question about code or the CLI, a yes/no question, a meta/self-eval. The deliverable is a written answer, NO file edits.\n" +
69
- " Pick by the PRIMARY thing the user asked you to produce. A question that merely mentions code is 'answer'. When unsure between answer and report, choose answer.\n\n" +
96
+ " Pick by the PRIMARY thing the user asked you to produce. A question that merely mentions code is 'answer'. When unsure between answer and report, choose answer.\n" +
97
+ "depth ∈ { quick | standard | heavy } — how much work the task ACTUALLY entails (judge the work, NOT the wording; a plainly-phrased request can still be heavy):\n" +
98
+ "- quick — a trivial single-shot change or a small direct answer: typo, rename one symbol, one-line edit, a quick lookup, 'what does X do'. No plan needed.\n" +
99
+ "- standard — ordinary feature or bugfix touching a handful of files/functions; needs a short plan + a verify step, but no upfront research or user discussion.\n" +
100
+ "- heavy — architectural, cross-cutting, multi-file/multi-module, a migration, 'redo/rebuild', a vague 'make it better', or a request with real unresolved design choices. Needs discussion + research + a checked plan before any code.\n" +
101
+ " For a pure question/answer (deliverable=answer), depth reflects how much investigation the answer needs: 'quick' for a simple fact, 'standard' for a normal explanation, 'heavy' for a deep architectural review.\n" +
102
+ " When unsure between quick and standard, choose standard. When the task is genuinely wide or ambiguous, choose heavy.\n" +
103
+ "scope ∈ { ecosystem | local }:\n" +
104
+ "- ecosystem — the turn is about the Muonroi PLATFORM as a whole: the building-block / .NET packages, open-core boundary, the rule engine / decision tables, NuGet packages, or platform setup/install. These are documented in an authoritative docs source.\n" +
105
+ "- local — EVERYTHING else, including questions about this CLI's own internals (even when they mention the word 'muonroi'). When unsure, choose local.\n" +
106
+ "lang — the language the user's message is written in, as ONE lowercase English word: english, vietnamese, japanese, french, etc. Use 'english' for English or when unsure.\n\n" +
70
107
  "Rules (read carefully — Phase 4 4P-2 disambiguation):\n" +
71
108
  "- debug — fix a bug, CI/build/test failure, error, exception, crash, or any 'why is X broken' question.\n" +
72
109
  "- generate — create new code, scaffold, write a new file, add a feature from scratch, ADD A NEW TEST, CHANGE A DEFAULT VALUE, modify configuration, improve coverage.\n" +
@@ -91,17 +128,21 @@ const SYSTEM_PROMPT = "You classify user prompts for a coding assistant. Reply w
91
128
  "- documentation → balanced (examples + explanation)\n" +
92
129
  "- general → concise\n" +
93
130
  "Only output 'detailed' if the user prompt LITERALLY contains words like 'explain in detail', 'thorough analysis', 'walk me through', 'giải thích chi tiết', 'phân tích kỹ'.\n\n" +
94
- "Intent + deliverable examples:\n" +
95
- "- 'hi' → general,concise,chat,answer\n" +
96
- "- 'cảm ơn bạn nhé' → general,concise,chat,answer\n" +
97
- "- 'bạn thử call tool setup_guide xem được không' → general,concise,task,answer (wants info, not file edits)\n" +
98
- "- 'bạn xong chưa' → general,concise,task,answer (a question — NOT chat)\n" +
99
- "- 'fix CI failing on Windows' → debug,concise,task,code\n" +
100
- "- 'rename function shouldInject to needsReminder' → refactor,concise,task,code\n" +
101
- "- 'tại sao bash_output_get trả empty' → analyze,concise,task,answer (investigate → written answer)\n" +
102
- "- 'liệt tất cả env var CLI đọc' → analyze,concise,task,report (structured list)\n" +
103
- "- 'plan the migration to hooks' → plan,balanced,task,report\n\n" +
104
- "Prompts may be Vietnamese, English, or mixed. Reply with exactly four words separated by commas. No other text.";
131
+ "Full examples (taskType,style,intent,deliverable,depth,scope,lang):\n" +
132
+ "- 'hi' → general,concise,chat,answer,quick,local,english\n" +
133
+ "- 'cảm ơn bạn nhé' → general,concise,chat,answer,quick,local,vietnamese\n" +
134
+ "- 'bạn xong chưa' → general,concise,task,answer,quick,local,vietnamese (a question NOT chat)\n" +
135
+ "- 'fix the typo in the README title' → generate,concise,task,code,quick,local,english\n" +
136
+ "- 'fix CI failing on Windows' → debug,concise,task,code,standard,local,english\n" +
137
+ "- 'rename function shouldInject to needsReminder' → refactor,concise,task,code,quick,local,english\n" +
138
+ "- 'thêm caching cho provider layer và update tests' → generate,concise,task,code,standard,local,vietnamese\n" +
139
+ "- 'tại sao bash_output_get trả empty' → analyze,concise,task,answer,standard,local,vietnamese\n" +
140
+ "- 'liệt tất cả env var CLI đọc' → analyze,concise,task,report,standard,local,vietnamese\n" +
141
+ "- 'refactor the entire auth system to use OAuth' → refactor,concise,task,code,heavy,local,english\n" +
142
+ "- 'how does the building-block rule engine work' → analyze,concise,task,answer,standard,ecosystem,english\n" +
143
+ "- 'hệ sinh thái muonroi gồm những gì' → analyze,balanced,task,answer,standard,ecosystem,vietnamese\n" +
144
+ "- 'plan the migration to hooks' → plan,balanced,task,report,heavy,local,english\n\n" +
145
+ "Prompts may be Vietnamese, English, or mixed. Reply with exactly seven words separated by commas. No other text.";
105
146
  function parseResponse(raw) {
106
147
  const cleaned = raw.trim().toLowerCase().replace(/[`*"]/g, "");
107
148
  const firstLine = cleaned.split(/\r?\n/)[0] ?? "";
@@ -126,7 +167,32 @@ function parseResponse(raw) {
126
167
  // their legacy regex predicates for this turn (never a wrong forced route).
127
168
  const deliverableWord = parts.find((p) => p === "answer" || p === "code" || p === "report");
128
169
  const deliverableKind = deliverableWord ?? null;
129
- return { taskType: taskWord, outputStyle: style, confidence: 0.75, intentKind, deliverableKind };
170
+ // Fifth word is the model-decided work depth. Parsed position-independently so
171
+ // a reordered/garbled reply still recovers it; null when absent → Layer 4
172
+ // defaults to "standard" and the injected rubric lets the agent self-select.
173
+ const depthWord = parts.find((p) => VALID_DEPTHS.has(p));
174
+ const depthTier = depthWord ?? null;
175
+ // Sixth word is the scope. "ecosystem" → platform/docs-authoritative turn;
176
+ // anything else (incl. absent) → not ecosystem. Position-independent.
177
+ const scopeWord = parts.find((p) => p === "ecosystem" || p === "local");
178
+ const ecosystemScope = scopeWord ? scopeWord === "ecosystem" : null;
179
+ // Seventh word is the user's language. It is the one alphabetic token that is
180
+ // NOT a known enum value (open vocabulary). null when English / absent so
181
+ // Layer 4 skips the language re-anchor for English turns.
182
+ const langWord = parts.find((p) => /^[a-z][a-z-]+$/.test(p) && !KNOWN_CLASSIFY_WORDS.has(p));
183
+ const replyLanguage = langWord && langWord !== "english" && langWord !== "en"
184
+ ? langWord.charAt(0).toUpperCase() + langWord.slice(1)
185
+ : null;
186
+ return {
187
+ taskType: taskWord,
188
+ outputStyle: style,
189
+ confidence: 0.75,
190
+ intentKind,
191
+ deliverableKind,
192
+ depthTier,
193
+ ecosystemScope,
194
+ replyLanguage,
195
+ };
130
196
  }
131
197
  /**
132
198
  * Build a closure the PIL pipeline can call. Reuses the orchestrator's already-
@@ -3,8 +3,8 @@
3
3
  *
4
4
  * Core type definitions for the Prompt Intelligence Layer (PIL) pipeline.
5
5
  */
6
- import type { ComplexityTier } from "../gsd/complexity.js";
7
6
  import type { GrayAreaQuestion } from "../gsd/gray-areas.js";
7
+ import type { ComplexityTier } from "../playbook/complexity.js";
8
8
  import type { ComplexitySizeResult } from "./layer1_5-complexity-size.js";
9
9
  export type TaskType = "refactor" | "debug" | "plan" | "analyze" | "documentation" | "generate" | "build" | "general";
10
10
  export type OutputStyle = "concise" | "detailed" | "balanced";
@@ -42,8 +42,33 @@ export interface PipelineContext {
42
42
  activeRunId?: string | null;
43
43
  digestAgeMs?: number | null;
44
44
  sessionId?: string | null;
45
- /** GSD-native triage tier (set by layer4). */
45
+ /** GSD-native triage tier (set by layer4 — sourced from modelDepthTier when present). */
46
46
  complexityTier?: ComplexityTier | null;
47
+ /**
48
+ * Model-decided work depth (quick | standard | heavy), set by layer1's
49
+ * model-first classifier (the 5th classify word). This is the agent-first
50
+ * source of truth for the GSD directive tier; layer4 prefers it over the
51
+ * legacy regex `scoreComplexity` (which now only runs as the offline fallback
52
+ * when the model classifier is unwired/failed). null when the model omitted
53
+ * the word OR the legacy cascade ran → layer4 falls back accordingly.
54
+ */
55
+ modelDepthTier?: ComplexityTier | null;
56
+ /**
57
+ * Model-decided scope (agent-first replacement for the `mentionsEcosystemScope`
58
+ * regex): true when the turn is about the Muonroi PLATFORM/ecosystem (BB/.NET,
59
+ * building-block, rule engine, platform setup) where muonroi-docs is
60
+ * authoritative. Set by layer1's classifier; consumed by layer4 to gate the
61
+ * docs-first nudge. null/undefined → treated as not-ecosystem.
62
+ */
63
+ ecosystemScope?: boolean | null;
64
+ /**
65
+ * Model-decided reply language as a display name ("Vietnamese", "Japanese"),
66
+ * or null for English. Agent-first replacement for the Vietnamese-only
67
+ * diacritic regex — generalizes to any language. Set by layer1; consumed by
68
+ * layer4 to re-anchor the "reply in the user's language" rule inside the
69
+ * directive when the user did not write in English.
70
+ */
71
+ replyLanguage?: string | null;
47
72
  /**
48
73
  * Layer 1.5 deterministic complexity-size classification.
49
74
  * Populated immediately after `layer1Intent` in `runLayers()`. Consumers:
@@ -1,41 +1,40 @@
1
1
  import { describe, expect, it } from "vitest";
2
- import { scoreComplexity } from "../complexity.js";
3
2
  import { buildDirective, mentionsEcosystemScope } from "../directives.js";
4
- import { detectGrayAreas } from "../gray-areas.js";
5
3
  describe("buildDirective", () => {
6
- it("emits a blocking heavy directive with mandatory steps", () => {
7
- const prompt = "redo the entire architecture and map everything across all repos";
8
- const complexity = scoreComplexity(prompt);
9
- expect(complexity.tier).toBe("heavy");
10
- const grayAreas = detectGrayAreas(prompt).questions;
11
- const out = buildDirective({ complexity, phase: null, grayAreas });
4
+ it("emits a blocking heavy directive with discuss → research → plan → check-plan → verify", () => {
5
+ const out = buildDirective({ tier: "heavy", phase: null });
12
6
  expect(out.tier).toBe("heavy");
13
7
  expect(out.blocking).toBe(true);
14
- expect(out.text).toContain("MANDATORY");
8
+ expect(out.text).toMatch(/HEAVY task/);
9
+ expect(out.text).toMatch(/DISCUSS/);
10
+ expect(out.text).toMatch(/RESEARCH/);
11
+ expect(out.text).toMatch(/CHECK-PLAN/);
15
12
  expect(out.text).toMatch(/AskUserQuestion/);
16
- expect(out.text).toMatch(/IN PARALLEL/);
17
- expect(out.text).toMatch(/research/i);
18
- expect(out.text).toMatch(/verify/i);
13
+ expect(out.text).toMatch(/VERIFY/);
14
+ // Hybrid: the agent may de-escalate if the task is smaller than it reads.
15
+ expect(out.text).toMatch(/STANDARD flow/);
19
16
  });
20
- it("emits a non-blocking standard directive", () => {
21
- const complexity = scoreComplexity("add a /health endpoint");
22
- const out = buildDirective({ complexity, phase: "execute", grayAreas: [] });
17
+ it("emits a non-blocking standard directive with an explicit plan + check step", () => {
18
+ const out = buildDirective({ tier: "standard", phase: "execute" });
23
19
  expect(out.tier).toBe("standard");
24
20
  expect(out.blocking).toBe(false);
25
- expect(out.text).toMatch(/GSD-quick/i);
21
+ expect(out.text).toMatch(/STANDARD task/);
22
+ expect(out.text).toMatch(/PLAN —/);
23
+ expect(out.text).toMatch(/CHECK —/);
24
+ expect(out.text).toMatch(/VERIFY —/);
25
+ // Hybrid: escalate to HEAVY if it turns out architectural.
26
+ expect(out.text).toMatch(/escalate to the HEAVY flow/);
26
27
  });
27
28
  it("emits a fix-first debug variant when phase is debug (session 7d56a049e1e3 regression)", () => {
28
- const complexity = scoreComplexity("fix CI fail");
29
- const out = buildDirective({ complexity, phase: "debug", grayAreas: [] });
29
+ const out = buildDirective({ tier: "standard", phase: "debug" });
30
30
  expect(out.tier).toBe("standard");
31
31
  expect(out.text).toMatch(/DEBUG task/);
32
32
  expect(out.text).toMatch(/FIX-FIRST/);
33
33
  expect(out.text).toMatch(/≤ 8 read_file/);
34
34
  expect(out.text).toMatch(/edit_file/);
35
35
  });
36
- it("standard non-debug phases use the generic GSD-quick directive (regression: don't apply fix-first cap to plan/execute)", () => {
37
- const complexity = scoreComplexity("add a counter feature");
38
- const out = buildDirective({ complexity, phase: "execute", grayAreas: [] });
36
+ it("standard non-debug phases use the generic plan/check directive (regression: don't apply fix-first cap to plan/execute)", () => {
37
+ const out = buildDirective({ tier: "standard", phase: "execute" });
39
38
  expect(out.text).not.toMatch(/FIX-FIRST/);
40
39
  expect(out.text).not.toMatch(/read_file calls before/);
41
40
  });
@@ -43,40 +42,36 @@ describe("buildDirective", () => {
43
42
  // A self/meta CLI question routed through GSD must NOT get the
44
43
  // implement/verify scaffold — that leaked a "2-3 line plan" preamble +
45
44
  // process narration into the human-facing answer.
46
- const complexity = scoreComplexity("how does this CLI affect you?");
47
- const out = buildDirective({ complexity, phase: null, grayAreas: [], informational: true });
45
+ const out = buildDirective({ tier: "quick", phase: null, informational: true });
48
46
  expect(out.blocking).toBe(false);
49
47
  expect(out.text).toMatch(/QUESTION \/ explanatory/);
50
48
  expect(out.text).toMatch(/written for the HUMAN/);
51
49
  expect(out.text).not.toMatch(/2-3 line plan/);
52
- expect(out.text).not.toMatch(/Implement directly/);
50
+ expect(out.text).not.toMatch(/CHECK-PLAN/);
53
51
  });
54
52
  it("informational overrides even a heavy tier (a question never implements)", () => {
55
- const complexity = scoreComplexity("redo the entire architecture and map everything across all repos");
56
- expect(complexity.tier).toBe("heavy");
57
- const out = buildDirective({ complexity, phase: null, grayAreas: [], informational: true });
53
+ const out = buildDirective({ tier: "heavy", phase: null, informational: true });
58
54
  expect(out.blocking).toBe(false);
59
55
  expect(out.text).toMatch(/QUESTION \/ explanatory/);
60
- expect(out.text).not.toMatch(/MANDATORY/);
56
+ expect(out.text).not.toMatch(/DISCUSS/);
57
+ expect(out.text).not.toMatch(/CHECK-PLAN/);
61
58
  });
62
- it("emits a minimal quick directive", () => {
63
- const complexity = scoreComplexity("fix typo");
64
- const out = buildDirective({ complexity, phase: null, grayAreas: [] });
59
+ it("emits a quick directive that stays short", () => {
60
+ const out = buildDirective({ tier: "quick", phase: null });
65
61
  expect(out.tier).toBe("quick");
66
62
  expect(out.blocking).toBe(false);
67
- expect(out.text.length).toBeLessThan(300);
63
+ expect(out.text).toMatch(/QUICK task/);
64
+ expect(out.text.length).toBeLessThan(600);
68
65
  });
69
66
  it("appends the muonroi-docs nudge for an ecosystem question (session 41ccfeb2ceee turn 1)", () => {
70
- const complexity = scoreComplexity("bạn hiểu thế nào về ecosystem muonroi nói chung");
71
- const out = buildDirective({ complexity, phase: null, grayAreas: [], informational: true, ecosystem: true });
67
+ const out = buildDirective({ tier: "quick", phase: null, informational: true, ecosystem: true });
72
68
  expect(out.text).toMatch(/QUESTION \/ explanatory/); // still the human-facing question directive
73
69
  expect(out.text).toMatch(/ECOSYSTEM SCOPE/);
74
70
  expect(out.text).toMatch(/muonroi-docs MCP is the AUTHORITATIVE source|AUTHORITATIVE source/);
75
71
  expect(out.text).toMatch(/call it FIRST/i);
76
72
  });
77
73
  it("does NOT append the ecosystem nudge for a plain question", () => {
78
- const complexity = scoreComplexity("how does this CLI affect you?");
79
- const out = buildDirective({ complexity, phase: null, grayAreas: [], informational: true });
74
+ const out = buildDirective({ tier: "quick", phase: null, informational: true });
80
75
  expect(out.text).not.toMatch(/ECOSYSTEM SCOPE/);
81
76
  });
82
77
  it("mentionsEcosystemScope is tight: ecosystem/BB wording yes, bare CLI-internals no", () => {
@@ -89,43 +84,24 @@ describe("buildDirective", () => {
89
84
  expect(mentionsEcosystemScope("how does muonroi-cli compaction work")).toBe(false);
90
85
  expect(mentionsEcosystemScope("fix the off-by-one in the router")).toBe(false);
91
86
  });
92
- it("renders the recommended option first in gray-area block", () => {
93
- const prompt = "redo everything from scratch";
94
- const complexity = scoreComplexity(prompt);
95
- const grayAreas = detectGrayAreas(prompt).questions;
96
- const out = buildDirective({ complexity, phase: null, grayAreas });
97
- if (grayAreas.length > 0) {
98
- expect(out.text).toMatch(/\[recommended\]/);
99
- }
100
- });
101
87
  // Language nudge — re-anchors the "reply in user's language" rule INSIDE the
102
88
  // directive so layered brevity / FIX-FIRST directives can't drown it (live
103
89
  // miss: storyflow_ui session 22661c8de9f2).
104
90
  describe("language nudge", () => {
105
91
  it("appends the nudge when replyLanguage is set", () => {
106
- const out = buildDirective({
107
- complexity: scoreComplexity("fix CI fail"),
108
- phase: "debug",
109
- grayAreas: [],
110
- replyLanguage: "Vietnamese",
111
- });
92
+ const out = buildDirective({ tier: "standard", phase: "debug", replyLanguage: "Vietnamese" });
112
93
  expect(out.text).toMatch(/LANGUAGE — the user wrote in Vietnamese/);
113
94
  expect(out.text).toMatch(/Reply in Vietnamese/);
114
95
  expect(out.text).toMatch(/OVERRIDES any brevity/);
115
96
  });
116
97
  it("omits the nudge when replyLanguage is undefined", () => {
117
- const out = buildDirective({
118
- complexity: scoreComplexity("fix CI fail"),
119
- phase: "debug",
120
- grayAreas: [],
121
- });
98
+ const out = buildDirective({ tier: "standard", phase: "debug" });
122
99
  expect(out.text).not.toMatch(/LANGUAGE —/);
123
100
  });
124
101
  it("stacks with the ecosystem nudge when both apply", () => {
125
102
  const out = buildDirective({
126
- complexity: scoreComplexity("how does the muonroi ecosystem work"),
103
+ tier: "heavy",
127
104
  phase: null,
128
- grayAreas: [],
129
105
  ecosystem: true,
130
106
  replyLanguage: "Vietnamese",
131
107
  });
@@ -0,0 +1,17 @@
1
+ /**
2
+ * src/playbook/complexity.ts
3
+ *
4
+ * Work-depth tier used by the [playbook] directive injected per turn.
5
+ *
6
+ * - "quick" → trivial single-shot tasks (typo, rename, read-and-explain).
7
+ * - "standard" → ordinary feature/bugfix work. Short plan → check → impl → verify.
8
+ * - "heavy" → architectural / multi-file / wide / ambiguous. Full
9
+ * discuss → research → plan → check-plan → implement → verify.
10
+ *
11
+ * The depth is decided AGENT-FIRST by the model (the 5th word of the layer1
12
+ * `llm-classify` call → `ctx.modelDepthTier`), NOT by a regex scan of the
13
+ * prompt. The old keyword `scoreComplexity` scorer was removed (2026-06-18,
14
+ * no-regex rule): keyword matching mis-tiered plainly-phrased tasks, which is
15
+ * exactly what made the agent skip the rigor a task needed.
16
+ */
17
+ export type ComplexityTier = "quick" | "standard" | "heavy";
@@ -0,0 +1,18 @@
1
+ /**
2
+ * src/playbook/complexity.ts
3
+ *
4
+ * Work-depth tier used by the [playbook] directive injected per turn.
5
+ *
6
+ * - "quick" → trivial single-shot tasks (typo, rename, read-and-explain).
7
+ * - "standard" → ordinary feature/bugfix work. Short plan → check → impl → verify.
8
+ * - "heavy" → architectural / multi-file / wide / ambiguous. Full
9
+ * discuss → research → plan → check-plan → implement → verify.
10
+ *
11
+ * The depth is decided AGENT-FIRST by the model (the 5th word of the layer1
12
+ * `llm-classify` call → `ctx.modelDepthTier`), NOT by a regex scan of the
13
+ * prompt. The old keyword `scoreComplexity` scorer was removed (2026-06-18,
14
+ * no-regex rule): keyword matching mis-tiered plainly-phrased tasks, which is
15
+ * exactly what made the agent skip the rigor a task needed.
16
+ */
17
+ export {};
18
+ //# sourceMappingURL=complexity.js.map
@@ -1,26 +1,33 @@
1
1
  /**
2
- * src/gsd/directives.ts
2
+ * src/playbook/directives.ts
3
3
  *
4
- * Builds the system-prompt directive block injected by layer4-gsd. The directive
5
- * is what actually changes the agent's behaviour: it lists the GSD-style steps
6
- * the agent must take before touching code.
4
+ * Builds the system-prompt directive block injected per turn by layer4
5
+ * (`src/pil/layer4-gsd.ts`). The directive is what actually changes the agent's
6
+ * behaviour: it injects a HYBRID rubric for the work-depth tier the model chose
7
+ * — the system recommends a depth, the agent declares its path and may
8
+ * escalate/de-escalate. This is the "[playbook]" mindset layer (NOT the real
9
+ * GSD framework / `/gsd:*` skills — it only borrows the discuss→plan→execute
10
+ * mindset).
7
11
  *
8
12
  * Three tiers:
9
- * - heavy: full discuss → research → verify → plan → impl → verify flow,
10
- * with mandatory AskUserQuestion + parallel Agent dispatch.
11
- * - standard: GSD-quick mindset — short plan, then implement, then verify.
13
+ * - heavy: discuss → research → plan → check-plan → implement → verify.
14
+ * - standard: short plan → check → implement → verify.
12
15
  * - quick: minimal hint, run inline.
13
16
  *
14
17
  * All directive text is English. The agent is responsible for translating
15
18
  * user-facing prompts into the user's language at render time.
16
19
  */
17
- import type { ComplexityResult } from "./complexity.js";
18
- import type { GrayAreaQuestion } from "./gray-areas.js";
19
- import type { GsdPhase } from "./types.js";
20
+ import type { GsdPhase } from "../gsd/types.js";
21
+ import type { ComplexityTier } from "./complexity.js";
20
22
  export interface DirectiveInput {
21
- complexity: ComplexityResult;
23
+ /**
24
+ * Model-decided work depth (agent-first — see layer1 `llm-classify`). Drives
25
+ * which rubric is injected. The rubric itself is HYBRID: it states the
26
+ * recommended depth but lets the agent escalate/de-escalate if the task turns
27
+ * out bigger or smaller than it read.
28
+ */
29
+ tier: ComplexityTier;
22
30
  phase: GsdPhase | null;
23
- grayAreas: GrayAreaQuestion[];
24
31
  /**
25
32
  * True when the prompt is informational/explanatory (a question or a
26
33
  * self/meta analysis) rather than a request to change code. The deliverable
@@ -58,7 +65,7 @@ export interface DirectiveInput {
58
65
  }
59
66
  export interface DirectiveOutput {
60
67
  text: string;
61
- tier: ComplexityResult["tier"];
68
+ tier: ComplexityTier;
62
69
  /** True when the directive forbids the agent from acting before clarifying. */
63
70
  blocking: boolean;
64
71
  }
@@ -0,0 +1,149 @@
1
+ /**
2
+ * src/playbook/directives.ts
3
+ *
4
+ * Builds the system-prompt directive block injected per turn by layer4
5
+ * (`src/pil/layer4-gsd.ts`). The directive is what actually changes the agent's
6
+ * behaviour: it injects a HYBRID rubric for the work-depth tier the model chose
7
+ * — the system recommends a depth, the agent declares its path and may
8
+ * escalate/de-escalate. This is the "[playbook]" mindset layer (NOT the real
9
+ * GSD framework / `/gsd:*` skills — it only borrows the discuss→plan→execute
10
+ * mindset).
11
+ *
12
+ * Three tiers:
13
+ * - heavy: discuss → research → plan → check-plan → implement → verify.
14
+ * - standard: short plan → check → implement → verify.
15
+ * - quick: minimal hint, run inline.
16
+ *
17
+ * All directive text is English. The agent is responsible for translating
18
+ * user-facing prompts into the user's language at render time.
19
+ */
20
+ const HEADER = "[playbook]";
21
+ /**
22
+ * High-precision predicate: is this turn about the Muonroi ECOSYSTEM (where the
23
+ * muonroi-docs MCP is the right source), as opposed to muonroi-cli internals?
24
+ * Deliberately TIGHTER than smart-filter's hasEcosystemSignal — that one keeps
25
+ * the server (over-keeping costs only tokens), but a behavioural "call docs
26
+ * FIRST" nudge must not fire on every "muonroi" mention or it misdirects
27
+ * CLI-internals questions toward .NET package docs. EN + VI.
28
+ */
29
+ const ECOSYSTEM_SCOPE_RE = /\becosystem\b|hệ\s*sinh\s*thái|he\s*sinh\s*thai|building[-\s]?block|open[-\s]?core|rule\s*engine|decision\s*table|\bnuget\b/i;
30
+ export function mentionsEcosystemScope(message) {
31
+ return ECOSYSTEM_SCOPE_RE.test(message);
32
+ }
33
+ /**
34
+ * Appended to any directive when the turn is ecosystem-scoped. Phrased
35
+ * conditionally ("if … available") so it is harmless when muonroi-docs is not
36
+ * configured — the model simply finds no such tool and falls back to local files.
37
+ */
38
+ export const ECOSYSTEM_DOCS_NUDGE = [
39
+ `${HEADER} ECOSYSTEM SCOPE — this turn concerns the Muonroi ecosystem (platform overview, BB/.NET packages, building-block, open-core boundary, setup).`,
40
+ "If the muonroi-docs MCP is available, it is the AUTHORITATIVE source — call it FIRST (docs_search / setup_guide / bb_recipe_list / bb_package_describe), THEN ground with local files. Do NOT characterize the ecosystem from local repo files alone.",
41
+ ].join("\n");
42
+ /**
43
+ * Appended to any directive when the user's reply language is non-English.
44
+ * The base system prompt's "reply in user's language" rule normally suffices,
45
+ * but `concise` / `FIX-FIRST` / GSD-debug directive bodies stack on top of it
46
+ * with strong "be terse / code over prose" language that crowds the rule out
47
+ * — observed live (storyflow_ui 22661c8de9f2). This NUDGE re-anchors the rule
48
+ * inside the directive itself so brevity preferences cannot override it.
49
+ */
50
+ export function buildLanguageNudge(lang) {
51
+ return [
52
+ `${HEADER} LANGUAGE — the user wrote in ${lang}. Reply in ${lang}.`,
53
+ "This rule OVERRIDES any brevity / concise / code-over-prose directive: terseness is fine, but the response language stays the user's.",
54
+ ].join("\n");
55
+ }
56
+ // All three rubrics are HYBRID + agent-first: the system recommends a depth
57
+ // based on the model's read of the task, but each rubric ends by empowering the
58
+ // agent to escalate or de-escalate if the task turns out bigger/smaller than it
59
+ // looked. Phrased as guidance, not a rigid template (the user prefers natural,
60
+ // senior-engineer reasoning over labeled scaffolds — feedback 12eceab7).
61
+ function buildHeavy(input) {
62
+ const phaseHint = input.phase ? ` (hint: this reads like a "${input.phase}" task)` : "";
63
+ return [
64
+ `${HEADER} This reads like a HEAVY task${phaseHint} — architectural, cross-cutting, multi-file, or with real unresolved design choices. Don't start editing yet; work through these phases:`,
65
+ " 1. DISCUSS — surface the decisions/ambiguities that actually change the design. For the ones the prompt doesn't already answer, ask up front with AskUserQuestion (put your recommended option first; write the question text in the user's language). Skip questions the prompt already settles — don't interrogate.",
66
+ " 2. RESEARCH — gather the codebase facts the task depends on: read/grep the relevant modules, and dispatch parallel research Agents when the areas are independent. When you delegate, give each sub-agent a NON-overlapping scope and tell it the exact return shape you need (findings as file:line + a one-line conclusion) — only the sub's final synthesis re-enters your context. Ground every later decision in what you actually found, not assumptions.",
67
+ " 3. PLAN — write a concrete, numbered plan: the change per file, the order, and the acceptance criteria (how you'll know it's done). Then record the plan as a todo_write checklist (one item per step) so the user sees a live progress list.",
68
+ " 4. CHECK-PLAN — review your own plan BEFORE executing: does it cover the acceptance criteria, handle the edge cases, and match what the user actually asked? Revise until it does (update the todo_write list if steps change). Confirm with the user only if the plan diverges from their intent.",
69
+ " 5. IMPLEMENT — execute in atomic steps; parallelize independent work. Keep the todo_write list accurate: mark each item in_progress before you start it and completed when it lands (exactly ONE item in_progress at a time). When you're in a git repo, COMMIT each completed chunk before starting the next one (small, logically-scoped commits; message ends with the mandatory attribution line) — do NOT pile the whole task into one commit at the end.",
70
+ " 6. VERIFY — run the relevant tests / lint / type-check and report evidence (command + result) before claiming done.",
71
+ "This depth is a recommendation from how the task reads. If, once you look, it's genuinely smaller than it appears, say so and drop to the STANDARD flow rather than over-processing it.",
72
+ ].join("\n");
73
+ }
74
+ function buildStandard(input) {
75
+ const phaseHint = input.phase ? ` (hint: this reads like a "${input.phase}" task)` : "";
76
+ // Debug-phase variant: tighten exploration budget. Session 7d56a049e1e3
77
+ // ran 109 tool calls (58 bash + 33 read_file + 16 grep + 2 mcp) over 6
78
+ // minutes WITHOUT a single edit_file / write_file — agent over-researched
79
+ // the CI failure instead of attempting a fix. Keep the FIX-FIRST exploration
80
+ // cap, but still require a brief check against reality before editing.
81
+ if (input.phase === "debug") {
82
+ return [
83
+ `${HEADER} This reads like a DEBUG task${phaseHint} — work FIX-FIRST, but think before you edit:`,
84
+ " 1. HYPOTHESIS — state a 2-3 line hypothesis (what's failing + your best guess why) BEFORE reading more than 3 files.",
85
+ " 2. CHECK — confirm the hypothesis against the actual failing code/log (read the key file, re-read the error). Adjust if reality disagrees.",
86
+ " 3. FIX — apply the smallest plausible fix with edit_file / write_file. Commit to a hypothesis and ship the diff; don't keep exploring.",
87
+ " 4. VERIFY — rerun the failing command/test and report evidence. When you're in a git repo and the fix verifies, commit it (message ends with the mandatory attribution line).",
88
+ "Hard limits — exceed only if a tool result genuinely contradicts your hypothesis:",
89
+ " - ≤ 8 read_file calls before first edit_file",
90
+ " - ≤ 5 grep calls before first edit_file",
91
+ " - ≤ 10 bash log-fetching calls (gh run view, cat log, etc.) before first edit_file",
92
+ "If the limits are blown and you still have no fix, STOP and report what you tried + why you're stuck.",
93
+ ].join("\n");
94
+ }
95
+ return [
96
+ `${HEADER} This reads like a STANDARD task${phaseHint} — work like a senior engineer, but keep it lightweight:`,
97
+ " 1. PLAN — state a short, concrete plan: the files/functions you'll touch and in what order. A few bullets in your reply, not an essay. If it breaks into ≥3 steps, also record them with todo_write so the user gets a live checklist.",
98
+ " 2. CHECK — sanity-check that plan against the real code (read the key files you named) and against the user's intent; fix the plan if reality differs. If a genuine ambiguity blocks you, ask ONE focused question via AskUserQuestion instead of guessing.",
99
+ " 3. IMPLEMENT — execute the plan in small steps with the appropriate tools. If you made a todo_write checklist, keep it updated as you go (exactly one item in_progress at a time). When you're in a git repo, COMMIT each cohesive chunk as it lands (small commits; message ends with the mandatory attribution line) rather than batching everything into one final commit.",
100
+ " 4. VERIFY — run the relevant tests / type-check / quick smoke and report evidence before claiming done.",
101
+ "You don't need subagents or a discussion round for this. But if it turns out to be architectural or spans many files, escalate to the HEAVY flow (discuss → research → checked plan) rather than charging ahead.",
102
+ ].join("\n");
103
+ }
104
+ function buildQuestion() {
105
+ // Informational / question / meta-analysis turns. The deliverable is the
106
+ // answer itself — there is no code to implement or test. Keep the agent's
107
+ // process OUT of the reply: a human asked, a human reads the result.
108
+ return [
109
+ `${HEADER} QUESTION / explanatory request — no code change is being asked for.`,
110
+ "Answer it directly and completely, written for the HUMAN who asked:",
111
+ " 1. Investigate only as needed — read/grep the specific files that ground your answer this turn.",
112
+ " 2. Lead with the answer. Use clear prose + structure (headings, bullets). Where a claim rests on the code, cite a concise file:line inline.",
113
+ " 3. Do NOT output an implementation plan, do NOT narrate your own process or restate these instructions, and do NOT name internal layers / contract rules / tools as if the reader were the agent.",
114
+ "There is no implement/verify step — the answer is the deliverable.",
115
+ ].join("\n");
116
+ }
117
+ function buildQuick(input) {
118
+ const phaseHint = input.phase ? ` (hint: "${input.phase}")` : "";
119
+ return [
120
+ `${HEADER} This reads like a QUICK task${phaseHint} — handle it inline. Make the smallest correct change (or give the direct answer) and report what you did. No plan, no subagents.`,
121
+ "If, as you work, it turns out bigger than it looked — multiple files, unclear requirements — say so and switch to the STANDARD flow (short plan → check → implement → verify) instead of forcing it.",
122
+ ].join("\n");
123
+ }
124
+ export function buildDirective(input) {
125
+ // Informational/meta prompts answer a human — never apply the
126
+ // implement/verify scaffold (it agent-ifies the reply), regardless of tier.
127
+ const base = input.informational
128
+ ? { text: buildQuestion(), tier: input.tier, blocking: false }
129
+ : input.tier === "heavy"
130
+ ? { text: buildHeavy(input), tier: "heavy", blocking: true }
131
+ : input.tier === "standard"
132
+ ? { text: buildStandard(input), tier: "standard", blocking: false }
133
+ : { text: buildQuick(input), tier: "quick", blocking: false };
134
+ // Ecosystem-scoped turns get a docs-first nudge regardless of tier (question
135
+ // OR task): muonroi-docs is the authoritative source and must not be skipped
136
+ // in favour of guessing from local files (session 41ccfeb2ceee turn 1).
137
+ let text = base.text;
138
+ if (input.ecosystem) {
139
+ text = `${text}\n${ECOSYSTEM_DOCS_NUDGE}`;
140
+ }
141
+ // Language nudge: re-anchor the "reply in user's language" rule INSIDE the
142
+ // directive when the user wrote in a non-English language, so layered
143
+ // brevity/concise directives can't drown it (storyflow_ui 22661c8de9f2).
144
+ if (input.replyLanguage) {
145
+ text = `${text}\n${buildLanguageNudge(input.replyLanguage)}`;
146
+ }
147
+ return { ...base, text };
148
+ }
149
+ //# sourceMappingURL=directives.js.map