create-byan-agent 2.20.1 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +91 -0
- package/install/templates/.claude/CLAUDE.md +21 -1
- package/install/templates/.claude/rules/byan-agents.md +1 -0
- package/install/templates/.claude/rules/hermes-dispatcher.md +1 -0
- package/install/templates/.claude/rules/team-doctrine.md +102 -0
- package/install/templates/.claude/skills/byan-byan-test/SKILL.md +1 -1
- package/install/templates/.claude/skills/byan-suitability/SKILL.md +71 -0
- package/install/templates/.claude/workflows/create-excalidraw-dataflow.js +2 -2
- package/install/templates/.claude/workflows/create-excalidraw-diagram.js +2 -2
- package/install/templates/.claude/workflows/create-excalidraw-flowchart.js +2 -2
- package/install/templates/.claude/workflows/create-excalidraw-wireframe.js +2 -2
- package/install/templates/.claude/workflows/dev-story.js +1 -1
- package/install/templates/.claude/workflows/document-project.js +3 -1
- package/install/templates/.githooks/pre-commit +20 -2
- package/install/templates/.github/agents/bmad-agent-byan.md +1056 -10
- package/install/templates/.github/agents/bmad-agent-skeptic.md +7 -5
- package/install/templates/_bmad/bmb/agents/agent-builder.md +5 -5
- package/install/templates/_bmad/bmb/agents/byan-test.md +8 -8
- package/install/templates/_bmad/bmb/agents/byan.md +8 -8
- package/install/templates/_bmad/bmb/agents/marc.md +11 -11
- package/install/templates/_bmad/bmb/agents/module-builder.md +6 -6
- package/install/templates/_bmad/bmb/agents/patnote.md +8 -8
- package/install/templates/_bmad/bmb/agents/rachid.md +10 -10
- package/install/templates/_bmad/bmb/agents/workflow-builder.md +7 -7
- package/install/templates/_bmad/bmb/workflows/byan/quick-create-workflow.md +2 -2
- package/install/templates/_bmad/bmb/workflows/byan/templates/base-agent-template.md +1 -1
- package/install/templates/_bmad/bmb/workflows/byan/validate-agent-workflow.md +1 -1
- package/install/templates/_bmad/core/agents/carmack.md +2 -2
- package/install/templates/_byan/_config/agent-manifest.csv +1 -0
- package/install/templates/_byan/agent/agent-builder/agent-builder.md +20 -0
- package/install/templates/_byan/agent/analyst/analyst.md +21 -0
- package/install/templates/_byan/agent/architect/architect.md +21 -0
- package/install/templates/_byan/agent/bmad-master/bmad-master.md +23 -0
- package/install/templates/_byan/agent/brainstorming-coach/brainstorming-coach.md +21 -0
- package/install/templates/_byan/agent/byan/byan.md +24 -0
- package/install/templates/_byan/agent/byan-flat/byan.md +23 -0
- package/install/templates/_byan/agent/byan-test/byan-test.md +19 -0
- package/install/templates/_byan/agent/byan-test-flat/byan-test.md +20 -0
- package/install/templates/_byan/agent/carmack/carmack.md +22 -0
- package/install/templates/_byan/agent/claude/claude.md +21 -0
- package/install/templates/_byan/agent/codex/codex.md +21 -0
- package/install/templates/_byan/agent/creative-problem-solver/creative-problem-solver.md +21 -0
- package/install/templates/_byan/agent/design-thinking-coach/design-thinking-coach.md +21 -0
- package/install/templates/_byan/agent/dev/dev.md +20 -0
- package/install/templates/_byan/agent/drawio/drawio.md +21 -0
- package/install/templates/_byan/agent/expert-merise-agile/expert-merise-agile.md +21 -0
- package/install/templates/_byan/agent/fact-checker/fact-checker.md +21 -0
- package/install/templates/_byan/agent/forgeron/forgeron.md +22 -0
- package/install/templates/_byan/agent/innovation-strategist/innovation-strategist.md +21 -0
- package/install/templates/_byan/agent/jimmy/jimmy.md +23 -0
- package/install/templates/_byan/agent/marc/marc.md +21 -0
- package/install/templates/_byan/agent/marc-flat/marc.md +23 -0
- package/install/templates/_byan/agent/mike/mike.md +23 -0
- package/install/templates/_byan/agent/module-builder/module-builder.md +20 -0
- package/install/templates/_byan/agent/patnote/patnote.md +21 -0
- package/install/templates/_byan/agent/pm/pm.md +21 -0
- package/install/templates/_byan/agent/presentation-master/presentation-master.md +21 -0
- package/install/templates/_byan/agent/quick-flow-solo-dev/quick-flow-solo-dev.md +19 -0
- package/install/templates/_byan/agent/quinn/quinn.md +19 -0
- package/install/templates/_byan/agent/rachid/rachid.md +21 -0
- package/install/templates/_byan/agent/rachid-flat/rachid.md +22 -0
- package/install/templates/_byan/agent/skeptic/skeptic.md +23 -0
- package/install/templates/_byan/agent/sm/sm.md +21 -0
- package/install/templates/_byan/agent/storyteller/storyteller.md +21 -0
- package/install/templates/_byan/agent/tao/tao.md +22 -0
- package/install/templates/_byan/agent/tea/tea.md +23 -0
- package/install/templates/_byan/agent/tech-writer/tech-writer.md +21 -0
- package/install/templates/_byan/agent/test-dynamic/test-dynamic.md +19 -0
- package/install/templates/_byan/agent/turbo-whisper/turbo-whisper.md +22 -0
- package/install/templates/_byan/agent/turbo-whisper-integration/turbo-whisper-integration.md +21 -0
- package/install/templates/_byan/agent/ux-designer/ux-designer.md +20 -0
- package/install/templates/_byan/agent/workflow-builder/workflow-builder.md +20 -0
- package/install/templates/_byan/agent/yanstaller/yanstaller.md +23 -0
- package/install/templates/_byan/bmb/config.yaml +36 -2
- package/install/templates/_byan/config.yaml +28 -0
- package/install/templates/_byan/core/activation/soul-activation.md +35 -0
- package/install/templates/_byan/mcp/byan-mcp-server/bin/byan-suitability.js +50 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/native-tiers.js +112 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/suitability-feeder.js +45 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/suitability-store.js +102 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/suitability.js +234 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/workflows-lint.js +72 -2
- package/install/templates/_byan/mcp/byan-mcp-server/package.json +9 -3
- package/install/templates/_byan/mcp/byan-mcp-server/server.js +58 -0
- package/install/templates/_byan/worker/workers.md +71 -1
- package/install/templates/_byan/workflow/simple/byan/soul-memory-update.md +25 -3
- package/install/templates/docs/native-workflows-contract.md +109 -0
- package/package.json +1 -1
- package/src/byan-v2/dispatcher/complexity-scorer.js +6 -0
- package/src/byan-v2/dispatcher/task-router.js +5 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
// Single source of truth for MODEL ROUTING of native-workflow leaves.
|
|
2
|
+
//
|
|
3
|
+
// Claude Code's in-CLI Workflow tool runs each agent() leaf on the main-loop
|
|
4
|
+
// model unless that call sets opts.model. Ported BYAN workflows never set it,
|
|
5
|
+
// so every leaf ran on the session model (Opus) — the read-the-file leaf paid
|
|
6
|
+
// the same tier as the implement-and-verify leaf. This module is the one place
|
|
7
|
+
// that decides a leaf's model tier, so the rule lives once and the linter
|
|
8
|
+
// (workflows-lint.js) can enforce it.
|
|
9
|
+
//
|
|
10
|
+
// This is a DISTINCT concern from src/byan-v2/dispatcher/complexity-scorer.js,
|
|
11
|
+
// which scores task COMPLEXITY (0-100) to route a whole task to an executor.
|
|
12
|
+
// That scorer answers "how hard is this task"; this module answers "which model
|
|
13
|
+
// tier does this workflow LEAF deserve". They share the same exploration intent
|
|
14
|
+
// but produce different outputs, so they stay separate (clarified, not merged).
|
|
15
|
+
//
|
|
16
|
+
// The sandbox forbids import INSIDE a .claude/workflows/*.js script, so a script
|
|
17
|
+
// cannot require() this file at runtime. The contract it encodes is instead a
|
|
18
|
+
// literal (model: 'haiku') the author writes on exploration leaves, validated
|
|
19
|
+
// against this module by the linter. This module is the canonical reference.
|
|
20
|
+
|
|
21
|
+
// The three-tier vocabulary. cheap/balanced are explicit downgrades; deep is the
|
|
22
|
+
// default and means "inherit the main-loop model".
|
|
23
|
+
// Names of the three model tiers. The object is frozen so callers cannot add
// or rewrite a tier at runtime.
export const TIERS = Object.freeze({
  CHEAP: 'cheap',
  BALANCED: 'balanced',
  DEEP: 'deep',
});
|
|
24
|
+
|
|
25
|
+
// tier -> concrete opts.model value, or null = OMIT opts.model (inherit).
|
|
26
|
+
//
|
|
27
|
+
// deep MUST be null. Omitting opts.model lets the leaf inherit whatever model the
|
|
28
|
+
// session runs (Opus by default, but Sonnet if the user chose Sonnet). We never
|
|
29
|
+
// PIN UP — pinning a leaf to a fixed high tier would override the user's session
|
|
30
|
+
// choice and could silently DOWNGRADE a Sonnet/Opus session's heavy leaf. Only
|
|
31
|
+
// cheap/balanced carry a value, and only exploration leaves ever get one.
|
|
32
|
+
//
|
|
33
|
+
// Values are the harness model-selection aliases (same set as the Agent tool:
|
|
34
|
+
// 'haiku' | 'sonnet' | 'opus'). They are version-independent. If a future
|
|
35
|
+
// runtime needs full model ids, this map is the ONLY edit — the linter then
|
|
36
|
+
// flags every script literal that drifts from it, so the fan-out stays bounded.
|
|
37
|
+
// Maps each tier name to the opts.model literal written on a leaf. The deep
// tier maps to null, which stands for "write no opts.model at all".
export const TIER_MODEL = Object.freeze({
  cheap: 'haiku',
  balanced: 'sonnet',
  deep: null,
});
|
|
38
|
+
|
|
39
|
+
// Leaf task-type taxonomy. EXPLORATION is the only downgrade-safe class; the
|
|
40
|
+
// other three are protected (never downgraded).
|
|
41
|
+
// The four task types a leaf can be classified as (frozen).
export const LEAF_TYPES = Object.freeze({
  // The only type tierFor() maps to the cheap tier.
  EXPLORATION: 'exploration',
  // Protected types: tierFor() keeps all of these on the deep tier.
  IMPLEMENTATION: 'implementation',
  VERIFICATION: 'verification',
  ANALYSIS: 'analysis',
});
|
|
47
|
+
|
|
48
|
+
// Label keyword sets, matched as substrings on the leaf LABEL (not the prompt —
|
|
49
|
+
// see classifyLeaf). Protected sets are checked first so any protected signal
|
|
50
|
+
// beats an exploration signal (conservative: when in doubt, do not downgrade).
|
|
51
|
+
// Note: 'test' is deliberately ABSENT. It collides both ways — 'discover-tests'
|
|
52
|
+
// is exploration (find the test files) while 'test-design' is analysis — so the
|
|
53
|
+
// bare token decides nothing. Real verification leaves carry verify/validate/
|
|
54
|
+
// check/review/gate/audit/assert/lint; a leaf that runs tests is labelled
|
|
55
|
+
// 'verify-*' in practice.
|
|
56
|
+
// Lower-case fragments looked up as SUBSTRINGS of a leaf label by classifyLeaf.
// One list per leaf type; the order inside a list has no effect on the result.
const VERIFICATION_KEYWORDS = [
  'verify',
  'validate',
  'check',
  'assert',
  'gate',
  'lint',
  'audit',
  'review',
];

const ANALYSIS_KEYWORDS = [
  'analy',
  'design',
  'architect',
  'assess',
  'evaluate',
  'strategy',
  'risk',
  'nfr',
  'recommend',
  'judge',
  'score',
  'coverage',
];

const IMPLEMENTATION_KEYWORDS = [
  'implement',
  'build',
  'write',
  'generate',
  'create',
  'dev',
  'rgr',
  'refactor',
  'fix',
  'scaffold',
  'save',
  'optimize',
  'aggregate',
  'report',
  'present',
  'plan',
  'map',
  'select',
  'subprocess',
  'sub-',
];

const EXPLORATION_KEYWORDS = [
  'load',
  'read',
  'scan',
  'list',
  'parse',
  'detect',
  'discover',
  'fetch',
  'lookup',
  'source-tree',
  'mode-detection',
];
|
|
60
|
+
|
|
61
|
+
// True when `text` contains at least one of `keywords` as a substring.
function matchesAny(text, keywords) {
  for (const keyword of keywords) {
    if (text.includes(keyword)) return true;
  }
  return false;
}
|
|
64
|
+
|
|
65
|
+
// classifyLeaf({ label }) -> a LEAF_TYPES value.
|
|
66
|
+
//
|
|
67
|
+
// Keys off the LABEL, deliberately NOT the prompt. A leaf's prompt is noisy: an
|
|
68
|
+
// exploration leaf like load-story says "Read... Parse... Report the story key",
|
|
69
|
+
// and 'report' would wrongly pull it to implementation. The label is the curated,
|
|
70
|
+
// stable signal the author controls. Priority is protect-first: VERIFICATION,
|
|
71
|
+
// then ANALYSIS, then IMPLEMENTATION, then EXPLORATION. Anything unmatched
|
|
72
|
+
// defaults to IMPLEMENTATION (deep), so an unknown leaf is never downgraded.
|
|
73
|
+
// Classifies a leaf by its label only and returns a LEAF_TYPES value.
//
// leaf: object with an optional `label`; a missing leaf or empty label is
//       treated as "no label".
//
// The keyword lists are consulted in a fixed, protect-first order, so a label
// that matches both a protected list and the exploration list gets the
// protected type. A label that matches nothing falls back to IMPLEMENTATION.
export function classifyLeaf(leaf) {
  const rawLabel = leaf && leaf.label;
  const label = rawLabel ? String(rawLabel).toLowerCase() : '';
  if (!label) return LEAF_TYPES.IMPLEMENTATION;

  // First match wins: verification, analysis, implementation, exploration.
  const priority = [
    [VERIFICATION_KEYWORDS, LEAF_TYPES.VERIFICATION],
    [ANALYSIS_KEYWORDS, LEAF_TYPES.ANALYSIS],
    [IMPLEMENTATION_KEYWORDS, LEAF_TYPES.IMPLEMENTATION],
    [EXPLORATION_KEYWORDS, LEAF_TYPES.EXPLORATION],
  ];
  for (const [keywords, leafType] of priority) {
    if (matchesAny(label, keywords)) return leafType;
  }
  return LEAF_TYPES.IMPLEMENTATION;
}
|
|
82
|
+
|
|
83
|
+
// tierFor(taskType) -> a TIERS value. Conservative auto-routing: only EXPLORATION
|
|
84
|
+
// is downgraded (cheap); every other type stays deep. BALANCED is part of the
|
|
85
|
+
// vocabulary but is never auto-assigned — it exists for an explicit, manual
|
|
86
|
+
// opt-in on a leaf an author judges mid-weight. Automation only ever picks
|
|
87
|
+
// cheap or deep.
|
|
88
|
+
// Maps a leaf task type to a tier: EXPLORATION -> cheap, anything else -> deep.
// The balanced tier is never returned from here.
export function tierFor(taskType) {
  if (taskType === LEAF_TYPES.EXPLORATION) {
    return TIERS.CHEAP;
  }
  return TIERS.DEEP;
}
|
|
91
|
+
|
|
92
|
+
// modelForLeaf({ label }) -> the opts.model value to write (a string) or null
|
|
93
|
+
// (omit opts.model). This is what F2 stamps onto exploration leaves.
|
|
94
|
+
// Resolves a leaf to its opts.model value by chaining the three steps:
// label -> leaf type -> tier -> TIER_MODEL entry. Returns a model string, or
// null when the tier carries no model literal.
export function modelForLeaf(leaf) {
  const leafType = classifyLeaf(leaf);
  const tier = tierFor(leafType);
  return TIER_MODEL[tier];
}
|
|
97
|
+
|
|
98
|
+
// isKnownTierModel(modelId) -> true if modelId is one of the concrete downgrade
|
|
99
|
+
// models (cheap/balanced). Used by the linter to reject an opts.model literal
|
|
100
|
+
// that is not a recognised tier. null/'' are not "known" (deep = omission, not a
|
|
101
|
+
// literal). 'opus' is intentionally NOT known — we never pin up.
|
|
102
|
+
// True when `modelId` equals one of the non-empty values of TIER_MODEL.
// Falsy ids (null, undefined, '') are never "known".
export function isKnownTierModel(modelId) {
  if (!modelId) return false;
  // Falsy map values (the null of the deep tier) are skipped.
  return Object.values(TIER_MODEL).some((value) => Boolean(value) && value === modelId);
}
|
|
106
|
+
|
|
107
|
+
// isDowngradeModel(modelId) -> true if modelId pins a leaf BELOW the inherited
|
|
108
|
+
// tier (cheap or balanced). The linter's anti-downgrade rule uses this: a
|
|
109
|
+
// protected leaf must never carry a downgrade model.
|
|
110
|
+
// True when `modelId` is the model literal of the cheap or the balanced tier.
export function isDowngradeModel(modelId) {
  if (modelId === TIER_MODEL.cheap) return true;
  if (modelId === TIER_MODEL.balanced) return true;
  return false;
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
// Feeder B — adversarial-pass verdict -> ledger outcome (F3).
|
|
2
|
+
//
|
|
3
|
+
// The adversarial VALIDATE pass runs N skeptics against ONE downgraded leaf,
|
|
4
|
+
// each trying to REFUTE that the cheap model is adequate there. The leaf is
|
|
5
|
+
// "flagged" (cheap inadequate) when at least half the skeptics refute it. This
|
|
6
|
+
// module maps that vote into the ledger's binary outcome:
|
|
7
|
+
//
|
|
8
|
+
// success = the cheap model SURVIVED the panel (refuters fell short of half).
|
|
9
|
+
//
|
|
10
|
+
// It is PURE and DETERMINISTIC and does no I/O. The actual byan_suitability_record
|
|
11
|
+
// call happens in the orchestrating skill on a main-thread turn — a workflow
|
|
12
|
+
// script cannot call MCP tools or write state (sandbox/state-coupling rule), so
|
|
13
|
+
// the script returns the verdicts as DATA and the skill records them. This
|
|
14
|
+
// module is the shared shaping step both sides agree on.
|
|
15
|
+
|
|
16
|
+
// At least half the panel refuting flags the leaf. Ties resolve AGAINST the
|
|
17
|
+
// cheap model — the conservative bias for an anti-downgrade rail. The adversarial
|
|
18
|
+
// pass uses an odd panel (3) so ties do not arise in practice; the rule is
|
|
19
|
+
// defined for any n so an even panel still degrades safely.
|
|
20
|
+
// Half-vote rule: the leaf is flagged when refuters make up at least half of
// the panel. A tie (exactly half) counts as flagged.
function isFlagged(refutedVotes, totalVotes) {
  return refutedVotes * 2 >= totalVotes;
}

// Normalises one vote count. Only real numbers and non-blank numeric strings
// are accepted; everything else becomes NaN so the integer checks reject it.
// A bare Number() would turn null, '', [] and false into 0 and true into 1,
// which lets a MISSING refutedVotes be recorded as "nobody refuted".
function toVoteCount(value) {
  if (typeof value === 'number') return value;
  if (typeof value === 'string' && value.trim() !== '') return Number(value);
  return Number.NaN;
}

// verdictToOutcome({ model, leafId, refutedVotes, totalVotes }) ->
//   { model, leafId, success }
//
// success is true when the refuters fell short of half the panel.
//
// Throws Error when:
//   - model or leafId is missing/falsy;
//   - totalVotes is not a positive integer;
//   - refutedVotes is not an integer in 0..totalVotes.
// Vote counts may be numbers or numeric strings; null, booleans, arrays and
// blank strings are rejected rather than coerced.
export function verdictToOutcome({ model, leafId, refutedVotes, totalVotes } = {}) {
  if (!model || !leafId) throw new Error('verdictToOutcome requires model and leafId');
  const total = toVoteCount(totalVotes);
  const refuted = toVoteCount(refutedVotes);
  if (!Number.isInteger(total) || total <= 0) {
    throw new Error('totalVotes must be a positive integer');
  }
  if (!Number.isInteger(refuted) || refuted < 0 || refuted > total) {
    throw new Error('refutedVotes must be an integer in 0..totalVotes');
  }
  return { model, leafId, success: !isFlagged(refuted, total) };
}
|
|
39
|
+
|
|
40
|
+
// verdictsToOutcomes([verdict, ...]) -> [outcome, ...]. The skill iterates this
|
|
41
|
+
// and calls byan_suitability_record once per outcome.
|
|
42
|
+
// Converts a list of verdicts into a list of outcomes, one per verdict, in the
// same order. Throws if the argument is not an array; any error raised by
// verdictToOutcome for an individual verdict propagates unchanged.
export function verdictsToOutcomes(verdicts = []) {
  if (!Array.isArray(verdicts)) {
    throw new Error('verdictsToOutcomes expects an array');
  }
  return verdicts.map((verdict) => verdictToOutcome(verdict));
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
// Model-suitability ledger — persistence + MCP-facing surface (F2).
|
|
2
|
+
//
|
|
3
|
+
// The pure math lives in suitability.js (no I/O). This module is the ONLY place
|
|
4
|
+
// that writes the ledger to disk, which is what makes the sandbox/state-coupling
|
|
5
|
+
// rule hold: a .claude/workflows/*.js script cannot import this file (the sandbox
|
|
6
|
+
// forbids it) and therefore cannot write ledger state. State changes flow only
|
|
7
|
+
// through the MCP tools (byan_suitability_record / _report), which call into
|
|
8
|
+
// here. The workflow feeds the tool; the tool owns the write.
|
|
9
|
+
//
|
|
10
|
+
// Best-effort contract (mirrors strict-sync.js): record() NEVER throws. Bad
|
|
11
|
+
// input or a failed write degrades to { recorded: false, reason } and leaves the
|
|
12
|
+
// on-disk ledger untouched. A telemetry write must never block or corrupt the
|
|
13
|
+
// real work — losing one outcome is acceptable; crashing the caller is not.
|
|
14
|
+
|
|
15
|
+
import fs from 'node:fs';
|
|
16
|
+
import path from 'node:path';
|
|
17
|
+
import { recordOutcome as pureRecord, rating, report } from './suitability.js';
|
|
18
|
+
|
|
19
|
+
// Picks the project root: the explicit argument when truthy, else the
// CLAUDE_PROJECT_DIR environment variable when set and non-empty, else the
// current working directory.
export function resolveRoot(projectRoot) {
  if (projectRoot) return projectRoot;
  const fromEnv = process.env.CLAUDE_PROJECT_DIR;
  if (fromEnv) return fromEnv;
  return process.cwd();
}
|
|
22
|
+
|
|
23
|
+
// The ledger lives beside the FD state, under the gitignored _byan-output/.
|
|
24
|
+
// Absolute-or-relative path of the ledger file for a project:
// <resolved root>/_byan-output/suitability-ledger.json
export function ledgerPath(projectRoot) {
  const root = resolveRoot(projectRoot);
  const segments = ['_byan-output', 'suitability-ledger.json'];
  return path.join(root, ...segments);
}
|
|
27
|
+
|
|
28
|
+
// readLedger never throws: a missing, corrupt, or non-object file reads as {}.
|
|
29
|
+
// A consumer should always get a usable ledger, even degraded to empty.
|
|
30
|
+
// Reads the ledger from disk and returns it as a plain object.
//
// projectRoot: optional root override (see resolveRoot).
// io:          fs-like object providing existsSync / readFileSync (injectable).
//
// Never throws: a missing file, unreadable file, invalid JSON, a JSON value
// that is not a plain object, or a projectRoot that cannot be turned into a
// path all read as {}.
export function readLedger({ projectRoot, io = fs } = {}) {
  try {
    // Path resolution stays INSIDE the try: path.join throws a TypeError for a
    // non-string root, and that must degrade to an empty ledger like any other
    // read failure instead of escaping to the caller.
    const p = ledgerPath(projectRoot);
    if (!io.existsSync(p)) return {};
    const parsed = JSON.parse(io.readFileSync(p, 'utf8'));
    const isPlainRecord = Boolean(parsed) && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainRecord ? parsed : {};
  } catch {
    return {};
  }
}
|
|
40
|
+
|
|
41
|
+
// Serialises `ledger` as pretty-printed JSON and writes it to the ledger path.
//
// The JSON is first written to "<ledger path>.tmp" in the same directory and
// then renamed over the real file, so the real file is only ever replaced by a
// completely written one. If serialising, writing or renaming throws, the
// staged file is removed on a best-effort basis and the ORIGINAL error is
// rethrown. Returns the ledger path on success.
function writeLedger(ledger, { projectRoot, io = fs } = {}) {
  const target = ledgerPath(projectRoot);
  const staged = `${target}.tmp`;
  io.mkdirSync(path.dirname(target), { recursive: true });
  try {
    const serialized = JSON.stringify(ledger, null, 2);
    io.writeFileSync(staged, serialized);
    io.renameSync(staged, target);
    return target;
  } catch (failure) {
    try {
      io.unlinkSync(staged);
    } catch {
      // Cleanup is best-effort: there may be no staged file, or the injected
      // io may not support unlink. The original failure is what matters.
    }
    throw failure;
  }
}
|
|
64
|
+
|
|
65
|
+
// record one adequacy outcome. Returns { recorded, reason, rating, source }.
|
|
66
|
+
// recorded:false with reason 'invalid_input' (bad args) or 'persist_failed'
|
|
67
|
+
// (write threw). On a persist failure the rating reflects the PRE-write ledger,
|
|
68
|
+
// so the caller never sees a phantom update. Never throws.
|
|
69
|
+
// Records one adequacy outcome for a (model, leafId) pair and persists it.
//
// model, leafId, success: the outcome (validated by the pure recordOutcome).
// source:      free-form origin tag, echoed back (null when falsy).
// projectRoot: optional root override.
// io:          fs-like object used for all disk access (injectable).
//
// Returns, without ever throwing:
//   { recorded: true,  reason: null,             rating, source }  on success
//   { recorded: false, reason: 'persist_failed', rating, source }  write failed;
//       rating is computed from the ledger as it was BEFORE the attempt
//   { recorded: false, reason: 'invalid_input',  error,  source }  bad arguments
export function record({ model, leafId, success, source, projectRoot, io = fs } = {}) {
  const origin = source || null;

  let before;
  let after;
  try {
    // The read is guarded together with the validation: arguments that make
    // the ledger unreadable (for example a projectRoot that is not a usable
    // path) are bad input too and must not escape as an exception.
    before = readLedger({ projectRoot, io });
    after = pureRecord(before, { model, leafId, success });
  } catch (err) {
    return { recorded: false, reason: 'invalid_input', error: err.message, source: origin };
  }

  let recorded = true;
  let reason = null;
  try {
    writeLedger(after, { projectRoot, io });
  } catch {
    // Swallowed by contract: this outcome is lost, the caller carries on.
    recorded = false;
    reason = 'persist_failed';
  }

  return {
    recorded,
    reason,
    rating: rating(recorded ? after : before, { model, leafId }),
    source: origin,
  };
}
|
|
96
|
+
|
|
97
|
+
// reportLedger -> advisory ratings (most-actionable first), each carrying the
|
|
98
|
+
// credible lower bound and n. Optional model filter. Read-only.
|
|
99
|
+
// Reads the ledger and returns its ratings in the order produced by report().
// When `model` is truthy only the rows for that model are returned. Performs
// no writes.
export function reportLedger({ model, projectRoot, io = fs } = {}) {
  const ledger = readLedger({ projectRoot, io });
  const rows = report(ledger);
  if (!model) return rows;
  return rows.filter((row) => row.model === model);
}
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
// Model-suitability ledger — the math core (design D1, advisory only).
|
|
2
|
+
//
|
|
3
|
+
// It answers ONE question per (model x leaf) pair: from the binary adequacy
|
|
4
|
+
// outcomes we have seen, is this CHEAP model safe to keep on this leaf? The
|
|
5
|
+
// answer is conservative by construction — it commits to "keep-cheap" only when
|
|
6
|
+
// the evidence is both good AND plentiful, and to "demote" only when the
|
|
7
|
+
// evidence is clearly bad. Everything in between is "watch": not enough proof to
|
|
8
|
+
// move, so the safe default (deep / no downgrade) stands.
|
|
9
|
+
//
|
|
10
|
+
// This module is PURE and DETERMINISTIC. No Date, no Math.random, no I/O. The
|
|
11
|
+
// ledger is a plain object the caller owns; every update returns a NEW ledger.
|
|
12
|
+
// Persistence lives behind the MCP tools (F2) so the sandbox/state-coupling rule
|
|
13
|
+
// holds: a workflow script never writes ledger state, the MCP tool does. The
|
|
14
|
+
// statistics here are the part a downgraded model would get subtly wrong, which
|
|
15
|
+
// is exactly why this leaf was kept on the strong model.
|
|
16
|
+
|
|
17
|
+
// Defaults. The math does not hard-depend on these — they are the policy knobs,
|
|
18
|
+
// passed through opts and overridable per call.
|
|
19
|
+
//
|
|
20
|
+
// Prior Beta(1,1) is uniform/neutral: the conservatism comes from the credible
|
|
21
|
+
// INTERVAL (a thin sample yields a wide interval and therefore a low floor),
|
|
22
|
+
// not from a stacked prior. keepThreshold > demoteThreshold leaves a deliberate
|
|
23
|
+
// "watch" band between them that a straddling interval falls into.
|
|
24
|
+
// Default policy knobs (frozen). Every function that takes `opts` merges them
// over these values.
export const DEFAULTS = Object.freeze({
  // Beta prior pseudo-counts added to the raw successes / failures.
  priorAlpha: 1,
  priorBeta: 1,
  // Probability mass of the equal-tailed credible interval.
  credibleLevel: 0.95,
  // Lower credible bound at or above this -> 'keep-cheap'.
  keepThreshold: 0.85,
  // Upper credible bound at or below this -> 'demote'.
  demoteThreshold: 0.70,
});
|
|
31
|
+
|
|
32
|
+
// --- Statistics: regularized incomplete beta and its inverse ---------------
|
|
33
|
+
//
|
|
34
|
+
// We need P(p <= x) for a Beta(a,b) posterior (the regularized incomplete beta
|
|
35
|
+
// I_x(a,b)) and its inverse (the quantile) to read off a credible interval.
|
|
36
|
+
// Implemented from first principles (Lanczos log-gamma + Numerical-Recipes
|
|
37
|
+
// continued fraction + bisection) so there is no dependency and the result is
|
|
38
|
+
// reproducible to ~1e-10.
|
|
39
|
+
|
|
40
|
+
// Lanczos approximation parameters: shift g and the matching nine-term
// coefficient series (index 0 is the constant term).
const LANCZOS_G = 7;
const LANCZOS_C = [
  0.99999999999980993,
  676.5203681218851,
  -1259.1392167224028,
  771.32342877765313,
  -176.61502916214059,
  12.507343278686905,
  -0.13857109526572012,
  9.9843695780195716e-6,
  1.5056327351493116e-7,
];

// Natural logarithm of the Gamma function.
//
// For z >= 0.5 the Lanczos series is evaluated directly; for z < 0.5 the
// reflection formula log(pi / sin(pi z)) - lgamma(1 - z) is used instead.
export function lgamma(z) {
  if (z < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lgamma(1 - z);
  }
  const shifted = z - 1;
  let series = LANCZOS_C[0];
  // Terms are accumulated in index order 1..g+1.
  for (let k = 1; k <= LANCZOS_G + 1; k += 1) {
    series += LANCZOS_C[k] / (shifted + k);
  }
  const t = shifted + LANCZOS_G + 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(t) - t + Math.log(series);
}
|
|
64
|
+
|
|
65
|
+
// Continued fraction for the incomplete beta (Lentz's method). The 300-iteration
|
|
66
|
+
// cap is a backstop, not a working limit: betai only ever calls this on the
|
|
67
|
+
// fast-converging side (x < (a+1)/(a+b+2), enforced by its reflection), where
|
|
68
|
+
// Lentz reaches the 1e-12 tolerance in tens of steps for any realistic posterior.
|
|
69
|
+
// Continued-fraction factor of the incomplete beta function, evaluated with
// the modified Lentz recurrence.
//
// Each pass applies an even step and an odd step. The loop stops when the
// odd-step multiplier is within 1e-12 of 1, or after 300 passes. Denominators
// whose magnitude drops below 1e-300 are replaced by 1e-300 to avoid dividing
// by zero.
function betacf(x, a, b) {
  const TINY = 1e-300;
  const TOLERANCE = 1e-12;
  const MAX_PASSES = 300;
  const guard = (value) => (Math.abs(value) < TINY ? TINY : value);

  const sumAB = a + b;
  const aPlus = a + 1;
  const aMinus = a - 1;

  let c = 1;
  let d = 1 / guard(1 - (sumAB * x) / aPlus);
  let h = d;

  for (let m = 1; m <= MAX_PASSES; m += 1) {
    const twoM = 2 * m;

    // Even step.
    const even = (m * (b - m) * x) / ((aMinus + twoM) * (a + twoM));
    d = 1 / guard(1 + even * d);
    c = guard(1 + even / c);
    h *= d * c;

    // Odd step.
    const odd = (-(a + m) * (sumAB + m) * x) / ((a + twoM) * (aPlus + twoM));
    d = 1 / guard(1 + odd * d);
    c = guard(1 + odd / c);
    const delta = d * c;
    h *= delta;

    if (Math.abs(delta - 1) < TOLERANCE) break;
  }
  return h;
}
|
|
100
|
+
|
|
101
|
+
// Regularized incomplete beta I_x(a,b) = P(X <= x) for X ~ Beta(a,b).
|
|
102
|
+
// Regularized incomplete beta function I_x(a, b).
//
// Returns 0 for x <= 0 and 1 for x >= 1. In between, the continued fraction is
// evaluated at x when x < (a + 1) / (a + b + 2); otherwise it is evaluated at
// 1 - x with a and b swapped and the result is subtracted from 1.
export function betai(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;
  const lnNorm = lgamma(a + b) - lgamma(a) - lgamma(b);
  const prefactor = Math.exp(lnNorm + a * Math.log(x) + b * Math.log(1 - x));
  const useDirectSide = x < (a + 1) / (a + b + 2);
  return useDirectSide
    ? (prefactor * betacf(x, a, b)) / a
    : 1 - (prefactor * betacf(1 - x, b, a)) / b;
}
|
|
113
|
+
|
|
114
|
+
// Inverse CDF: smallest x with I_x(a,b) = p. Bisection — monotone, dependency
|
|
115
|
+
// free, deterministic. 100 halvings drive the bracket below 1e-30, far tighter
|
|
116
|
+
// than the betai accuracy, so the quantile is exact for our purposes.
|
|
117
|
+
// Quantile of Beta(a, b): the x at which betai(x, a, b) reaches p.
//
// Returns 0 for p <= 0 and 1 for p >= 1. Otherwise runs exactly 100 bisection
// steps on [0, 1] and returns the midpoint of the final bracket.
export function betaQuantile(p, a, b) {
  if (p <= 0) return 0;
  if (p >= 1) return 1;
  let low = 0;
  let high = 1;
  let remaining = 100;
  while (remaining > 0) {
    const probe = (low + high) / 2;
    if (betai(probe, a, b) < p) {
      low = probe;
    } else {
      high = probe;
    }
    remaining -= 1;
  }
  return (low + high) / 2;
}
|
|
129
|
+
|
|
130
|
+
// --- Ledger ----------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
// Key for a (model x leaf) pair. '::' is reserved as the separator; model and
|
|
133
|
+
// leafId are also stored on the entry so the report never has to parse keys.
|
|
134
|
+
// Builds the ledger key for a (model, leafId) pair: "<model>::<leafId>".
export function leafKey(model, leafId) {
  return ''.concat(model, '::', leafId);
}
|
|
137
|
+
|
|
138
|
+
// A fresh ledger entry for a pair that has no recorded outcomes yet.
function emptyEntry(model, leafId) {
  const entry = { model, leafId };
  entry.successes = 0;
  entry.failures = 0;
  return entry;
}
|
|
141
|
+
|
|
142
|
+
// recordOutcome(ledger, { model, leafId, success }) -> a NEW ledger.
|
|
143
|
+
//
|
|
144
|
+
// success === true : the cheap model was adequate on this leaf this time.
|
|
145
|
+
// success === false : it was not (the adversarial pass refuted it).
|
|
146
|
+
// Stores RAW counts (prior-independent) so the prior stays a read-time policy.
|
|
147
|
+
// Throws on malformed input — that is a programmer error, surfaced loudly, not
|
|
148
|
+
// silently swallowed. (The MCP tool's no-op-on-failure contract is about
|
|
149
|
+
// transport/persistence, not input validation.)
|
|
150
|
+
// recordOutcome(ledger, { model, leafId, success }) -> a NEW ledger object.
//
// Adds one success or one failure to the entry of the (model, leafId) pair and
// returns a shallow copy of the ledger with that entry replaced. The input
// ledger is not modified.
//
// Throws Error when model or leafId is missing/falsy, or when success is not a
// boolean.
//
// Stored state is not trusted: a ledger that is not an object is treated as
// empty, and a stored count that is not a non-negative integer is treated as
// 0. Without this, a hand-edited entry such as successes: "3" would become
// "31" through string concatenation, and a missing count would become NaN.
export function recordOutcome(ledger, { model, leafId, success } = {}) {
  if (!model || !leafId) throw new Error('recordOutcome requires model and leafId');
  if (typeof success !== 'boolean') throw new Error('recordOutcome requires success:boolean');

  const base = ledger && typeof ledger === 'object' ? ledger : {};
  const key = leafKey(model, leafId);
  const stored = base[key];
  const cur = stored && typeof stored === 'object' ? stored : emptyEntry(model, leafId);
  const toCount = (value) => (Number.isInteger(value) && value >= 0 ? value : 0);

  const next = {
    model,
    leafId,
    successes: toCount(cur.successes) + (success ? 1 : 0),
    failures: toCount(cur.failures) + (success ? 0 : 1),
  };
  return { ...base, [key]: next };
}
|
|
163
|
+
|
|
164
|
+
// posterior(entry, opts) -> { alpha, beta }. Applies the prior to raw counts.
|
|
165
|
+
// posterior(entry, opts) -> { alpha, beta }
//
// Adds the prior pseudo-counts (opts merged over DEFAULTS) to the entry's raw
// successes and failures. A missing entry counts as zero observations.
export function posterior(entry, opts) {
  const { priorAlpha, priorBeta } = { ...DEFAULTS, ...opts };
  const successes = entry ? entry.successes : 0;
  const failures = entry ? entry.failures : 0;
  return {
    alpha: priorAlpha + successes,
    beta: priorBeta + failures,
  };
}
|
|
172
|
+
|
|
173
|
+
// verdictFromBounds(lower, upper, opts) -> 'keep-cheap' | 'demote' | 'watch'.
|
|
174
|
+
// keep-cheap and demote are mutually exclusive (lower <= upper), so the order
|
|
175
|
+
// of the two tests does not matter; "watch" is everything the evidence has not
|
|
176
|
+
// settled. This is the whole safety policy in three lines.
|
|
177
|
+
// verdictFromBounds(lower, upper, opts) -> 'keep-cheap' | 'demote' | 'watch'
//
// 'keep-cheap' when the lower bound reaches keepThreshold, otherwise 'demote'
// when the upper bound is at or below demoteThreshold, otherwise 'watch'.
// Thresholds come from opts merged over DEFAULTS.
export function verdictFromBounds(lower, upper, opts) {
  const { keepThreshold, demoteThreshold } = { ...DEFAULTS, ...opts };
  const provenSafe = lower >= keepThreshold;
  if (provenSafe) return 'keep-cheap';
  const provenUnsafe = upper <= demoteThreshold;
  return provenUnsafe ? 'demote' : 'watch';
}
|
|
183
|
+
|
|
184
|
+
// rating(ledger, { model, leafId }, opts) -> the full advisory record. ALWAYS
|
|
185
|
+
// carries the credible lower bound and n; a consumer that shows only `mean`
|
|
186
|
+
// would be discarding the very signal that makes a thin sample untrustworthy.
|
|
187
|
+
// rating(ledger, { model, leafId }, opts) -> advisory record for one pair.
//
// Looks the pair up in the ledger (an unknown pair is rated as zero
// observations), derives the Beta posterior, and reads the equal-tailed
// credible interval at opts.credibleLevel from it.
//
// Returns { model, leafId, n, successes, failures, mean, lower, upper,
// credibleLevel, verdict }, where n is successes + failures and mean is the
// posterior mean alpha / (alpha + beta).
export function rating(ledger, { model, leafId }, opts) {
  const policy = { ...DEFAULTS, ...opts };
  const entry = ledger[leafKey(model, leafId)] || emptyEntry(model, leafId);
  const { successes, failures } = entry;
  const { alpha, beta } = posterior(entry, policy);

  // Half of the excluded probability mass sits in each tail.
  const tailMass = (1 - policy.credibleLevel) / 2;
  const lower = betaQuantile(tailMass, alpha, beta);
  const upper = betaQuantile(1 - tailMass, alpha, beta);

  return {
    model,
    leafId,
    n: successes + failures,
    successes,
    failures,
    mean: alpha / (alpha + beta),
    lower,
    upper,
    credibleLevel: policy.credibleLevel,
    verdict: verdictFromBounds(lower, upper, policy),
  };
}
|
|
209
|
+
|
|
210
|
+
// Sort rank per verdict, most actionable first: demote, then watch, then
// keep-cheap.
const SEVERITY = { demote: 0, watch: 1, 'keep-cheap': 2 };

// report(ledger, opts) -> one rating per ledger entry, ordered by severity.
// Ties are broken by leafId and then by model (both via localeCompare) so the
// output order does not depend on ledger insertion order. A null or undefined
// ledger yields an empty list.
export function report(ledger, opts) {
  const ratings = [];
  for (const { model, leafId } of Object.values(ledger || {})) {
    ratings.push(rating(ledger, { model, leafId }, opts));
  }

  const bySeverityThenName = (a, b) => {
    const severityGap = SEVERITY[a.verdict] - SEVERITY[b.verdict];
    if (severityGap) return severityGap;
    const leafOrder = a.leafId.localeCompare(b.leafId);
    if (leafOrder) return leafOrder;
    return a.model.localeCompare(b.model);
  };

  return ratings.sort(bySeverityThenName);
}
|
|
225
|
+
|
|
226
|
+
// formatRating(rating) -> one advisory line, e.g.
//   "haiku x load-config: lower95=81.2% (mean 90.0%, n=12) -> watch"
// It never prints a bare point estimate: the credible lower bound and n are
// always present, because "92%" over 3 samples and "92%" over 300 are not the
// same claim, and only the second should ever move a human to drop a downgrade.
//
// The level in the "lowerNN" tag keeps up to two decimals instead of being
// rounded to an integer. Rounding turned a 99.5% or 99.9% level into
// "lower100", which overstates the bound. Integer levels (90, 95, 99) print
// exactly as before.
export function formatRating(r) {
  const pct = (x) => (x * 100).toFixed(1);
  // toFixed(2) absorbs binary floating-point noise from the multiplication;
  // Number() then drops trailing zeros so 95.00 is printed as 95.
  const lvl = Number((r.credibleLevel * 100).toFixed(2));
  return `${r.model} x ${r.leafId}: lower${lvl}=${pct(r.lower)}% (mean ${pct(r.mean)}%, n=${r.n}) -> ${r.verdict}`;
}
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
|
|
13
13
|
import fs from 'node:fs';
|
|
14
14
|
import path from 'node:path';
|
|
15
|
+
import { isKnownTierModel, isDowngradeModel, classifyLeaf, LEAF_TYPES } from './native-tiers.js';
|
|
15
16
|
|
|
16
17
|
// Strip /* block */ and // line comments. Preserve "://" inside strings (URLs)
|
|
17
18
|
// by only treating // as a comment when not preceded by a colon.
|
|
@@ -87,10 +88,79 @@ export function metaLiteralViolations(src) {
|
|
|
87
88
|
}];
|
|
88
89
|
}
|
|
89
90
|
|
|
91
|
+
// Model-routing anti-downgrade guard (enforcement-bridge F3).
|
|
92
|
+
//
|
|
93
|
+
// A native leaf may pin a CHEAPER model (opts.model) ONLY when it is an
|
|
94
|
+
// EXPLORATION leaf (read/load/parse/detect). Implement, verify and analysis
|
|
95
|
+
// leaves must inherit the session model (no opts.model). This is the structural
|
|
96
|
+
// net that stops a cheap model from silently landing on a heavy leaf — the exact
|
|
97
|
+
// STRICT-2 (No Downgrade) line. The source of truth for tiers and leaf
|
|
98
|
+
// classification is native-tiers.js; this rule only enforces it.
|
|
99
|
+
//
|
|
100
|
+
// Parsing is comment-stripped (a model: token in a comment is not a real call).
|
|
101
|
+
// Each model: is keyed to the nearest preceding label: within the SAME opts
|
|
102
|
+
// object (no intervening }). Downgraded leaves carry static-string labels by
|
|
103
|
+
// convention, so a quoted-literal match is sufficient.
|
|
104
|
+
// Quoted-literal `model:` / `label:` properties. Group 1 is the opening quote
// (single, double or backtick), group 2 the text up to the matching quote.
const MODEL_RE = /\bmodel:\s*(['"`])([^'"`]*)\1/g;
const LABEL_RE = /\blabel:\s*(['"`])([^'"`]*)\1/g;

// nearestLabelBefore(code, modelIndex) -> label text | null.
// Returns the value of the last quoted `label:` that appears before
// `modelIndex`, or null when there is none. Also returns null when a `}` sits
// between that label and the model, since the label then belongs to an
// earlier, already-closed object.
function nearestLabelBefore(code, modelIndex) {
  const prefix = code.slice(0, modelIndex);

  // LABEL_RE is a shared /g regex: start every scan from the beginning.
  LABEL_RE.lastIndex = 0;
  let closest = null;
  for (const hit of prefix.matchAll(LABEL_RE)) {
    closest = hit;
  }
  if (closest === null) return null;

  const gap = prefix.slice(closest.index + closest[0].length);
  return gap.includes('}') ? null : closest[2];
}
|
|
121
|
+
|
|
122
|
+
// modelRoutingViolations(src) -> [{ id, msg }].
// Scans the comment-stripped source for quoted `model:` literals and reports,
// for each one, the first rule it breaks:
//   unknown-tier-model        - isKnownTierModel rejects the model name
//   downgrade-without-label   - no usable label precedes it in the same object
//   protected-leaf-downgraded - a downgrade model on a leaf that classifyLeaf
//                               does not rate as EXPLORATION
export function modelRoutingViolations(src) {
  const code = stripComments(src);
  const violations = [];

  // MODEL_RE is a shared /g regex: start every scan from the beginning.
  MODEL_RE.lastIndex = 0;
  for (const hit of code.matchAll(MODEL_RE)) {
    const model = hit[2];

    if (!isKnownTierModel(model)) {
      violations.push({
        id: 'unknown-tier-model',
        msg: `opts.model '${model}' is not a known downgrade tier (cheap/balanced); never pin up — omit opts.model to inherit the session model on deep leaves`,
      });
      continue;
    }

    const label = nearestLabelBefore(code, hit.index);
    if (!label) {
      // An empty-string label is treated the same as no label at all.
      violations.push({
        id: 'downgrade-without-label',
        msg: `a model downgrade ('${model}') must sit on a labelled exploration leaf; no label found in this opts object`,
      });
      continue;
    }

    if (!isDowngradeModel(model)) continue;
    if (classifyLeaf({ label }) === LEAF_TYPES.EXPLORATION) continue;

    violations.push({
      id: 'protected-leaf-downgraded',
      msg: `leaf '${label}' is not exploration but carries downgrade model '${model}'; only read/load/parse/detect leaves may downgrade (STRICT-2 No Downgrade)`,
    });
  }

  return violations;
}
|
|
153
|
+
|
|
90
154
|
// validateContract(src) -> combined [{ id, msg }] violations.
// Runs every native-workflow contract check against the same source and
// concatenates the results in a fixed order: state-coupling lint, clock/RNG,
// meta-literal-first, then model-routing anti-downgrade.
export function validateContract(src) {
  const checks = [
    lintSource,
    clockRngViolations,
    metaLiteralViolations,
    modelRoutingViolations,
  ];
  const violations = [];
  for (const check of checks) {
    for (const violation of check(src)) {
      violations.push(violation);
    }
  }
  return violations;
}
|
|
95
165
|
|
|
96
166
|
// Lint every *.js in a directory (non-recursive; native workflows are flat)
|
|
@@ -5,18 +5,24 @@
|
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"bin": {
|
|
8
|
-
"byan-mcp": "./server.js"
|
|
8
|
+
"byan-mcp": "./server.js",
|
|
9
|
+
"byan-sync-rules": "./bin/byan-sync-rules.js"
|
|
9
10
|
},
|
|
10
11
|
"scripts": {
|
|
11
12
|
"start": "node server.js",
|
|
12
13
|
"test": "node --test test/*.test.js"
|
|
13
14
|
},
|
|
14
15
|
"dependencies": {
|
|
15
|
-
"@modelcontextprotocol/sdk": "^1.29.0"
|
|
16
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
17
|
+
"js-yaml": "^4.1.1"
|
|
16
18
|
},
|
|
17
19
|
"engines": {
|
|
18
20
|
"node": ">=18.0.0"
|
|
19
21
|
},
|
|
20
|
-
"keywords": [
|
|
22
|
+
"keywords": [
|
|
23
|
+
"mcp",
|
|
24
|
+
"byan",
|
|
25
|
+
"claude-code"
|
|
26
|
+
],
|
|
21
27
|
"license": "MIT"
|
|
22
28
|
}
|