create-byan-agent 2.23.0 → 2.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +75 -0
- package/install/src/byan-v2/index.js +1 -1
- package/install/templates/.claude/hooks/drain-advisory.js +85 -0
- package/install/templates/.claude/hooks/lib/failure-detector.js +18 -4
- package/install/templates/.claude/settings.json +4 -0
- package/install/templates/.claude/skills/byan-insight/SKILL.md +56 -0
- package/install/templates/.claude/workflows/check-implementation-readiness.js +1 -1
- package/install/templates/.github/agents/bmad-agent-byan.md +3 -3
- package/install/templates/_byan/core/activation/soul-activation.md +3 -3
- package/install/templates/_byan/mcp/byan-mcp-server/bin/byan-insight-digest.js +31 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/advisory-autofeed.js +83 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/insight-harvest.js +220 -0
- package/install/templates/_byan/mcp/byan-mcp-server/lib/outcome-buffer.js +64 -0
- package/install/templates/_byan/mcp/byan-mcp-server/server.js +58 -0
- package/install/templates/_byan/worker/workers.md +8 -7
- package/install/templates/_byan/workflow/simple/byan/feature-workflow.md +2 -2
- package/package.json +1 -1
- package/src/loadbalancer/loadbalancer.js +1 -1
- package/src/core/dispatcher/execution-router.js +0 -66
package/CHANGELOG.md
CHANGED
|
@@ -9,6 +9,81 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
9
9
|
|
|
10
10
|
## [Unreleased]
|
|
11
11
|
|
|
12
|
+
## [2.25.0] - 2026-06-09
|
|
13
|
+
|
|
14
|
+
### Added - Advisory auto-feed (BYAN learns from each session, automatically)
|
|
15
|
+
|
|
16
|
+
The insight loop observed and proposed; the missing half was the LEARNING. BYAN's
|
|
17
|
+
advisory ledgers (ELO trust, the suitability ledger) updated only when the agent
|
|
18
|
+
remembered to call a record tool. This wires the automatic half — outcomes are
|
|
19
|
+
recorded at end of turn, with no agent action — while behavior surfaces stay
|
|
20
|
+
human-gated.
|
|
21
|
+
|
|
22
|
+
- **Capture.** The `byan_outcome_log` MCP tool appends one validated advisory
|
|
23
|
+
outcome to a buffer (cheap; it does not write a ledger directly). kind=elo logs
|
|
24
|
+
`{domain, result}`; kind=suitability logs `{model, leafId, success}`.
|
|
25
|
+
- **Drain.** `.claude/hooks/drain-advisory.js` is a Stop hook that, at end of each
|
|
26
|
+
turn, records the buffered outcomes into the ELO ledger (full Glicko update) and
|
|
27
|
+
the suitability ledger, advancing a line cursor for idempotency. It is strictly
|
|
28
|
+
non-blocking (all work in try/catch, emits `{continue:true}` and exit 0 on every
|
|
29
|
+
path) and crosses the ESM/CJS boundary (the CJS ELO engine via require, the ESM
|
|
30
|
+
suitability store via dynamic import).
|
|
31
|
+
- **Advisory-only.** The loop writes only the buffer and the two advisory ledgers.
|
|
32
|
+
Behavior surfaces (routing, personas, mantra thresholds) are left untouched —
|
|
33
|
+
those stay a human decision, consistent with the insight loop's gated philosophy.
|
|
34
|
+
- 71 tests (the pure planners, the buffer, and a drain-hook e2e with ledger
|
|
35
|
+
snapshot/restore) plus a live smoke test recording a real Glicko update. The tool
|
|
36
|
+
and hook ship in the template; the hook registers alongside the existing Stop
|
|
37
|
+
hooks.
|
|
38
|
+
- Explicit follow-ups (out of this scope): the adversarial verdict panel that would
|
|
39
|
+
feed suitability without a manual log, and a fact-graph-derived ELO source.
|
|
40
|
+
|
|
41
|
+
## [2.24.0] - 2026-06-09
|
|
42
|
+
|
|
43
|
+
### Added - Session insight loop (gated self-improvement)
|
|
44
|
+
|
|
45
|
+
BYAN already has advisory learning surfaces (ELO trust, the suitability ledger)
|
|
46
|
+
and the native Claude Code hooks already leave outcome trails on disk, but the
|
|
47
|
+
loop was open: the agent had to read and act on them by hand. This closes it,
|
|
48
|
+
under a strict gated philosophy.
|
|
49
|
+
|
|
50
|
+
- **Harvester** `_byan/mcp/byan-mcp-server/lib/insight-harvest.js` +
|
|
51
|
+
`bin/byan-insight-digest.js` + the `byan_insight_digest` MCP tool: read the
|
|
52
|
+
native trails (`tool-log.jsonl` health, strict `audit.log` recurring gaps,
|
|
53
|
+
the suitability ledger routing outcomes, the ELO profile trends) and aggregate
|
|
54
|
+
them into a digest with conservative, GATED proposals. Pure aggregation +
|
|
55
|
+
IO-isolated reader, mirroring the template-fidelity pattern.
|
|
56
|
+
- **Gated by design.** The harvester only READS; it writes nothing to a behavior
|
|
57
|
+
surface (routing, personas, mantra thresholds). Every proposal carries
|
|
58
|
+
`gated: true` and is surfaced for a human to ratify — an agent that rewrote its
|
|
59
|
+
own routing on a heuristic would be the silent-downgrade BYAN exists to prevent.
|
|
60
|
+
- **Skill** `byan-insight` presents the digest as a gated improvement proposal
|
|
61
|
+
(observe, propose, human ratifies), consistent with the advisory ELO /
|
|
62
|
+
suitability doctrine.
|
|
63
|
+
- **Guard false-positive fix.** `tool-failure-guard` flagged any tool whose result
|
|
64
|
+
echoed the literal phrase "internal error" as a failure, exempting only
|
|
65
|
+
Write/Edit/Read. Bash (diagnostic stdout) and MCP tools (echoed stored data) now
|
|
66
|
+
join the echo-heavy set: their `is_error` flag is trusted, content patterns are
|
|
67
|
+
not. A genuine failure still sets `is_error`. Caught live (a Bash log-grep
|
|
68
|
+
blocked the session twice) and covered by unit + e2e tests.
|
|
69
|
+
- 43 harvester unit tests + the detector tests; the e2e guard tests moved their
|
|
70
|
+
content-pattern cases onto a non-echo tool. The tool and skill ship in the
|
|
71
|
+
template.
|
|
72
|
+
|
|
73
|
+
### Changed - Closed the fused-route and output-folder legacy debts
|
|
74
|
+
|
|
75
|
+
- Removed the dead parallel router `src/core/dispatcher/execution-router.js` (zero
|
|
76
|
+
live consumers) and its test; the routing docs (`workers.md`,
|
|
77
|
+
`feature-workflow.md`) and the loadbalancer architecture comment now point only
|
|
78
|
+
to `byan_dispatch` and its two-axis model (strategy from score, model tier from
|
|
79
|
+
nature).
|
|
80
|
+
- Standardized the documented output folder from the legacy `_bmad-output/` to the
|
|
81
|
+
runtime's `_byan-output/` across the agent and platform docs plus an inert config
|
|
82
|
+
default. Left untouched on purpose: the deliberate back-compat read in
|
|
83
|
+
`agent-packager.js` (recovers agent creations from older installs under
|
|
84
|
+
`_bmad-output/bmb-creations`), the migration guides, and the anti-regression
|
|
85
|
+
tests that assert the old name is gone.
|
|
86
|
+
|
|
12
87
|
## [2.23.0] - 2026-06-09
|
|
13
88
|
|
|
14
89
|
### Added - Stub path normalizer + a 5th pre-commit gate (no _bmad/@bmad drift)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// drain-advisory.js — Stop hook. At the end of each assistant turn, drain the
|
|
3
|
+
// outcome buffer into BYAN's ADVISORY ledgers (ELO trust, suitability). This is the
|
|
4
|
+
// automatic half of the closed learning loop: outcomes logged during the turn (via
|
|
5
|
+
// byan_outcome_log) are recorded with NO agent action. Behavior surfaces (routing /
|
|
6
|
+
// personas / mantras) are never touched — only advisory data is written.
|
|
7
|
+
//
|
|
8
|
+
// STRICTLY non-blocking. All work is wrapped in try/catch; the hook ALWAYS emits
|
|
9
|
+
// {continue:true} and exits 0, and never throws or exits 2. An advisory feed must
|
|
10
|
+
// never break a turn (the stage-to-byan.js contract: "staging must never break the
|
|
11
|
+
// session"). Idempotent via a line cursor, so a re-fired Stop (stop_hook_active)
|
|
12
|
+
// records nothing new.
|
|
13
|
+
//
|
|
14
|
+
// ESM/CJS: this hook is CommonJS. The ELO engine is CJS (require). The pure libs and
|
|
15
|
+
// the suitability store are ESM under a type:module package, reached via dynamic
|
|
16
|
+
// import() with a file:// URL.
|
|
17
|
+
|
|
18
|
+
const path = require('path');
|
|
19
|
+
const { pathToFileURL } = require('url');
|
|
20
|
+
|
|
21
|
+
/**
 * Read everything piped to stdin as UTF-8 text.
 *
 * Resolves with '' right away when stdin is a TTY (nothing is piped in).
 * Never rejects: a stream error resolves with whatever was read so far.
 *
 * @returns {Promise<string>} the text received on stdin
 */
function readStdin() {
  return new Promise((resolve) => {
    if (process.stdin.isTTY) return resolve('');
    // Decode at the stream level so a multi-byte character split across two
    // chunks is reassembled instead of being decoded chunk by chunk.
    process.stdin.setEncoding('utf8');
    let data = '';
    process.stdin.on('data', (chunk) => {
      data += chunk;
    });
    process.stdin.on('end', () => resolve(data));
    process.stdin.on('error', () => resolve(data));
  });
}
|
|
30
|
+
|
|
31
|
+
// Write the {continue:true} payload to stdout, then exit with status 0.
function done() {
  const verdict = { continue: true };
  process.stdout.write(JSON.stringify(verdict));
  process.exit(0);
}
|
|
35
|
+
|
|
36
|
+
// Entry point: drain the buffered outcomes into the advisory ledgers, advance
// the cursor, then emit {continue:true}. Every failure is swallowed so this
// hook can never block the turn.
(async () => {
  try {
    // The Stop payload itself is unused; stdin is only consumed.
    await readStdin();

    const projectRoot = process.env.CLAUDE_PROJECT_DIR || process.cwd();
    // ESM modules are reached from this CommonJS hook via import() on a file:// URL.
    const loadEsm = (relPath) => {
      const absPath = path.join(projectRoot, relPath);
      return import(pathToFileURL(absPath).href);
    };

    const autofeed = await loadEsm('_byan/mcp/byan-mcp-server/lib/advisory-autofeed.js');
    const buffer = await loadEsm('_byan/mcp/byan-mcp-server/lib/outcome-buffer.js');

    const bufferText = buffer.readBuffer({ rootDir: projectRoot });
    const outcomes = autofeed.parseOutcomes(bufferText);
    const cursor = buffer.readCursor({ rootDir: projectRoot });
    const plan = autofeed.planDrain(outcomes, cursor);
    if (!plan.pending.length) return done();

    // Both ledger backends are loaded lazily, on the first record that needs them.
    let eloEngine = null;
    let suitabilityStore = null;

    for (const outcome of plan.pending) {
      const intent = autofeed.classifyOutcome(outcome);
      if (!intent) continue;
      try {
        switch (intent.kind) {
          case 'elo': {
            if (!eloEngine) {
              const EloEngine = require(path.join(projectRoot, 'src', 'byan-v2', 'elo', 'index.js'));
              eloEngine = new EloEngine({
                storagePath: path.join(projectRoot, '_byan', 'memoire', 'elo-profile.json'),
              });
            }
            eloEngine.recordResult(intent.domain, intent.result);
            break;
          }
          case 'suitability': {
            if (!suitabilityStore) {
              suitabilityStore = await loadEsm('_byan/mcp/byan-mcp-server/lib/suitability-store.js');
            }
            suitabilityStore.record({
              model: intent.model,
              leafId: intent.leafId,
              success: intent.success,
              source: 'autofeed',
              projectRoot,
            });
            break;
          }
          default:
            break;
        }
      } catch {
        // A single failing record is skipped; the remaining ones are still drained.
      }
    }

    // The cursor advances past every planned outcome, including skipped ones.
    buffer.writeCursor(plan.newCursor, { rootDir: projectRoot });
  } catch {
    // Any failure degrades silently: the feed is housekeeping, never a blocker.
  }
  done();
})();
|
|
@@ -10,14 +10,27 @@ const ERROR_PATTERNS = [
|
|
|
10
10
|
/tool_use_error/i,
|
|
11
11
|
];
|
|
12
12
|
|
|
13
|
-
// Tools whose response echoes user-authored or
|
|
13
|
+
// Tools whose response echoes user-authored or stored content (Write/Edit
|
|
14
14
|
// return file paths + content fragments, Read echoes file content
|
|
15
15
|
// verbatim). Pattern match on their response fires false positives when
|
|
16
|
-
// the
|
|
16
|
+
// the content itself contains the literal phrase "internal error"
|
|
17
17
|
// (e.g. a doc about errors, a test fixture, a hook that detects errors).
|
|
18
18
|
// For these, only trust the explicit is_error flag.
|
|
19
19
|
const ECHO_TOOLS = new Set(['Write', 'Edit', 'NotebookEdit', 'Read']);
|
|
20
20
|
|
|
21
|
+
// Two more tool classes echo DATA (not a stderr stream), so content-pattern
|
|
22
|
+
// matching on their response is noise. A genuine failure of either sets is_error
|
|
23
|
+
// (checked before this guard), so we lose no real-failure detection:
|
|
24
|
+
// - MCP tools (mcp__server__tool): byan_fd_* echoes the FD state (which can
|
|
25
|
+
// hold user-authored raw_ideas / notes containing the literal phrase),
|
|
26
|
+
// byan_*_status echoes ledger content, etc.
|
|
27
|
+
// - Bash: its response is command stdout - diagnostics, log greps, test output
|
|
28
|
+
// that legitimately surface error-words. A real Bash failure exits non-zero,
|
|
29
|
+
// which the harness marks as is_error.
|
|
30
|
+
/**
 * Tell whether content-pattern matching should be skipped for a tool.
 *
 * True for the ECHO_TOOLS set, for 'Bash', and for any name starting with
 * 'mcp__'. A missing or non-string name is treated as not echo-heavy instead
 * of throwing on `.startsWith`.
 *
 * @param {unknown} toolName - tool name taken from the hook payload
 * @returns {boolean} true when only the explicit error flag should be trusted
 */
function isEchoHeavy(toolName) {
  if (ECHO_TOOLS.has(toolName)) return true;
  // Guard: a payload without a usable tool name must not crash the detector.
  if (typeof toolName !== 'string') return false;
  return toolName === 'Bash' || toolName.startsWith('mcp__');
}
|
|
33
|
+
|
|
21
34
|
function detectFailure(payload) {
|
|
22
35
|
if (!payload || typeof payload !== 'object') return null;
|
|
23
36
|
|
|
@@ -30,8 +43,9 @@ function detectFailure(payload) {
|
|
|
30
43
|
}
|
|
31
44
|
}
|
|
32
45
|
|
|
33
|
-
// Do not pattern-match on echo-heavy tools
|
|
34
|
-
|
|
46
|
+
// Do not pattern-match on echo-heavy tools (file-echo, Bash stdout, MCP data) — only
|
|
47
|
+
// trust the is_error flag, checked above.
|
|
48
|
+
if (isEchoHeavy(toolName)) {
|
|
35
49
|
return null;
|
|
36
50
|
}
|
|
37
51
|
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: byan-insight
|
|
3
|
+
description: Harvest the native Claude Code outcome trails (tool-log, strict-audit gaps, suitability ledger, ELO) into a GATED self-improvement digest for BYAN. Invoke when the user asks "what did this session teach BYAN", "insight digest", "self-improvement", "qu'est-ce que BYAN a appris", or wants to review recurring gaps / routing outcomes / tool health before deciding what to improve. Observe and propose; the human ratifies each change.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# BYAN Insight Loop (gated self-improvement)
|
|
7
|
+
|
|
8
|
+
BYAN already has advisory learning surfaces (ELO trust, the suitability ledger,
|
|
9
|
+
soul-memory) and the native Claude Code hooks already leave outcome trails on
|
|
10
|
+
disk. This skill closes the loop: it READS those trails, aggregates them into a
|
|
11
|
+
digest, and surfaces GATED proposals. It does not modify a behavior surface.
|
|
12
|
+
|
|
13
|
+
## The one hard rule: observe and propose, do not silently self-modify
|
|
14
|
+
|
|
15
|
+
An agent that rewrote its own routing, personas, or mantra thresholds on a
|
|
16
|
+
heuristic would be the exact silent-downgrade BYAN exists to prevent. So this
|
|
17
|
+
loop stops at a PROPOSAL. Applying a change (a routing tweak, a new checklist
|
|
18
|
+
item, a persona edit) stays a human decision — ideally run as its own FD. The
|
|
19
|
+
advisory data (ELO, suitability) is read-only here; behavior surfaces are left
|
|
20
|
+
to the human gate.
|
|
21
|
+
|
|
22
|
+
## Protocol
|
|
23
|
+
|
|
24
|
+
1. **Harvest.** Call the MCP tool `byan_insight_digest` (read-only, no args). It
|
|
25
|
+
returns `{ gated: true, digest, render }` where `digest` is
|
|
26
|
+
`{ toolHealth, recurringGaps, routingOutcomes, eloTrends, proposals }`.
|
|
27
|
+
- `toolHealth` : call count, failure rate, top failing tools, output-token cost
|
|
28
|
+
(from `_byan-output/tool-log.jsonl`).
|
|
29
|
+
- `recurringGaps` : clustered self-verify gap themes with counts (from
|
|
30
|
+
`.byan-strict/audit.log`) — what BYAN keeps missing.
|
|
31
|
+
- `routingOutcomes` : per cheap-model x leaf keep-rate (from the suitability
|
|
32
|
+
ledger) — where a downgrade is proven good or bad.
|
|
33
|
+
- `eloTrends` : per-domain trust rating.
|
|
34
|
+
- `proposals` : conservative, GATED suggestions (each `gated: true`).
|
|
35
|
+
2. **Present.** Show the `render` text, then the proposals as a numbered list.
|
|
36
|
+
Make explicit that nothing has been applied.
|
|
37
|
+
3. **Gate.** For each proposal the user accepts, run the change as its own scoped
|
|
38
|
+
work (a short FD for a behavior change; a direct edit for a doc/checklist).
|
|
39
|
+
Do not auto-apply a proposal.
|
|
40
|
+
|
|
41
|
+
## CLI equivalent
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
node _byan/mcp/byan-mcp-server/bin/byan-insight-digest.js [--root <dir>] [--json]
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Prints the human-readable digest, or the raw JSON with `--json`. Self-disables
|
|
48
|
+
(empty digest) when the trails are absent, so a fresh checkout is not an error.
|
|
49
|
+
|
|
50
|
+
## What it deliberately leaves alone
|
|
51
|
+
|
|
52
|
+
- It does not call `byan_elo_record` / `byan_suitability_record` for you (those
|
|
53
|
+
stay where the outcome actually happens, e.g. a VALIDATE pass).
|
|
54
|
+
- It does not edit `lib/dispatch.js`, `native-tiers.js`, a persona, or the mantra
|
|
55
|
+
thresholds. Those are behavior surfaces; a proposal names them, a human
|
|
56
|
+
changes them.
|
|
@@ -18,7 +18,7 @@ export const meta = {
|
|
|
18
18
|
// this script). No wall-clock, no randomness: any date/id is passed via args
|
|
19
19
|
// so the runtime can resume deterministically.
|
|
20
20
|
|
|
21
|
-
const planningArtifacts = (args && args.planningArtifacts) || '
|
|
21
|
+
const planningArtifacts = (args && args.planningArtifacts) || '_byan-output/planning-artifacts'
|
|
22
22
|
const reportDate = (args && args.date) || 'unspecified'
|
|
23
23
|
const role = 'an expert Product Manager and Scrum Master specialized in requirements traceability and spotting gaps in planning artifacts. Be adversarial: your job is to find the failures others missed, not to reassure.'
|
|
24
24
|
|
|
@@ -1037,9 +1037,9 @@ L'agent peut invoquer n'importe quel agent specialise :
|
|
|
1037
1037
|
|
|
1038
1038
|
Variables de session disponibles apres chargement config :
|
|
1039
1039
|
- `{project-root}` : Racine du repository
|
|
1040
|
-
- `{output_folder}` : Dossier de sortie (`
|
|
1041
|
-
- `{planning_artifacts}` : `
|
|
1042
|
-
- `{implementation_artifacts}` : `
|
|
1040
|
+
- `{output_folder}` : Dossier de sortie (`_byan-output/`)
|
|
1041
|
+
- `{planning_artifacts}` : `_byan-output/planning-artifacts/`
|
|
1042
|
+
- `{implementation_artifacts}` : `_byan-output/implementation-artifacts/`
|
|
1043
1043
|
- `{user_name}`, `{communication_language}` : Depuis config.yaml
|
|
1044
1044
|
|
|
1045
1045
|
### Orchestration Multi-Agent
|
|
@@ -158,9 +158,9 @@ L'agent peut invoquer n'importe quel agent specialise :
|
|
|
158
158
|
|
|
159
159
|
Variables de session disponibles apres chargement config :
|
|
160
160
|
- `{project-root}` : Racine du repository
|
|
161
|
-
- `{output_folder}` : Dossier de sortie (`
|
|
162
|
-
- `{planning_artifacts}` : `
|
|
163
|
-
- `{implementation_artifacts}` : `
|
|
161
|
+
- `{output_folder}` : Dossier de sortie (`_byan-output/`)
|
|
162
|
+
- `{planning_artifacts}` : `_byan-output/planning-artifacts/`
|
|
163
|
+
- `{implementation_artifacts}` : `_byan-output/implementation-artifacts/`
|
|
164
164
|
- `{user_name}`, `{communication_language}` : Depuis config.yaml
|
|
165
165
|
|
|
166
166
|
### Orchestration Multi-Agent
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { harvest, renderDigest } from '../lib/insight-harvest.js';
|
|
3
|
+
|
|
4
|
+
// Aggregate native Claude Code outcome trails into a GATED improvement digest.
|
|
5
|
+
// Reads: _byan-output/tool-log.jsonl, .byan-strict/audit.log,
|
|
6
|
+
// _byan-output/suitability-ledger.json, _byan/memoire/elo-profile.json
|
|
7
|
+
// Missing trail -> empty; digest self-disables gracefully.
|
|
8
|
+
//
|
|
9
|
+
// Usage: node bin/byan-insight-digest.js [--root <dir>] [--json]
|
|
10
|
+
|
|
11
|
+
/**
 * Parse the CLI flags: `--json` (boolean) and `--root <dir>` (takes the next
 * token as its value). Scanning starts at index 2; unknown tokens are ignored.
 *
 * @param {string[]} argv - full argument vector (as in process.argv)
 * @returns {{json: boolean, root?: string}} the recognised options
 */
function parseArgs(argv) {
  const parsed = { json: false };
  let index = 2;
  while (index < argv.length) {
    const token = argv[index];
    index += 1;
    switch (token) {
      case '--json':
        parsed.json = true;
        break;
      case '--root':
        // The value is whatever token follows, consumed unconditionally.
        parsed.root = argv[index];
        index += 1;
        break;
      default:
        break;
    }
  }
  return parsed;
}
|
|
19
|
+
|
|
20
|
+
// Root resolution order: explicit --root, then CLAUDE_PROJECT_DIR, then the cwd.
const cliOptions = parseArgs(process.argv);
const projectRoot = cliOptions.root || process.env.CLAUDE_PROJECT_DIR || process.cwd();

const report = harvest({ rootDir: projectRoot });

// --json prints the raw digest (2-space indented); otherwise the rendered text.
const output = cliOptions.json ? JSON.stringify(report, null, 2) : renderDigest(report);
process.stdout.write(output + '\n');

process.exit(0);
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
// Advisory auto-feed — the pure planning half of the closed learning loop.
|
|
2
|
+
//
|
|
3
|
+
// BYAN's advisory ledgers (ELO trust, the suitability ledger) only INFORM future
|
|
4
|
+
// decisions; they never override behavior. The open gap was that nothing fed them
|
|
5
|
+
// automatically: the agent had to remember to call a record tool. This loop closes
|
|
6
|
+
// that — outcomes are LOGGED to a buffer during a turn (cheaply, via byan_outcome_log),
|
|
7
|
+
// and a Stop hook DRAINS the buffer into the ledgers at end of turn, with no agent
|
|
8
|
+
// action. Behavior surfaces (routing / personas / mantras) are out of scope: this
|
|
9
|
+
// only writes advisory data.
|
|
10
|
+
//
|
|
11
|
+
// This module is the PURE half (no I/O), so it is exhaustively unit-testable; the
|
|
12
|
+
// Stop hook supplies the buffer text + a cursor and applies the records.
|
|
13
|
+
//
|
|
14
|
+
// Buffer line shapes (jsonl, one outcome per line):
|
|
15
|
+
// { kind: 'elo', domain, result } result: VALIDATED|PARTIAL|BLOCKED
|
|
16
|
+
// { kind: 'suitability', model, leafId, success } success: boolean
|
|
17
|
+
// A line missing required fields or with a bad type is dropped (classifyOutcome -> null),
|
|
18
|
+
// never throwing — a malformed log line must not break the drain.
|
|
19
|
+
|
|
20
|
+
/**
 * Turn a jsonl buffer into a list of parsed values.
 *
 * Blank lines, lines that are not valid JSON, and lines that parse to a falsy
 * value are skipped; nothing is thrown for a malformed line.
 *
 * @param {string} text - raw buffer contents (falsy input yields [])
 * @returns {Array} parsed values in buffer order
 */
export function parseOutcomes(text) {
  if (!text) return [];
  const outcomes = [];
  for (const line of text.split('\n')) {
    if (!line.trim()) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    if (parsed) outcomes.push(parsed);
  }
  return outcomes;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Plan a drain from a cursor: entries at index >= cursor are pending and the
 * new cursor is the total number of outcomes. A cursor that is not a
 * non-negative integer counts as 0; one past the end yields nothing pending.
 *
 * @param {Array} outcomes - parsed outcomes, in buffer order
 * @param {number} [cursor=0] - number of outcomes already drained
 * @returns {{pending: Array, newCursor: number}} the drain plan
 */
export function planDrain(outcomes, cursor = 0) {
  const total = outcomes.length;
  let start = 0;
  if (Number.isInteger(cursor) && cursor >= 0) {
    start = cursor > total ? total : cursor;
  }
  return { pending: outcomes.slice(start), newCursor: total };
}
|
|
44
|
+
|
|
45
|
+
// Result values accepted for an ELO record after normalization.
const ELO_RESULTS = new Set(['VALIDATED', 'PARTIALLY_VALID', 'BLOCKED']);

// Map the short form 'PARTIAL' to 'PARTIALLY_VALID'; any other value passes through.
function normalizeEloResult(r) {
  return r === 'PARTIAL' ? 'PARTIALLY_VALID' : r;
}

/**
 * Validate and normalize one buffered outcome into a record intent.
 *
 *   kind 'elo'         -> { kind, domain, result }          result must be in ELO_RESULTS
 *   kind 'suitability' -> { kind, model, leafId, success }  success must be a boolean
 *
 * String fields are trimmed and must be non-empty; extra fields are dropped.
 *
 * @param {*} o - candidate outcome
 * @returns {object|null} the normalized intent, or null when the input is invalid
 */
export function classifyOutcome(o) {
  if (!o || typeof o !== 'object') return null;
  const trimmed = (value) => (typeof value === 'string' ? value.trim() : '');

  switch (o.kind) {
    case 'elo': {
      const domain = trimmed(o.domain);
      const result = normalizeEloResult(o.result);
      if (!domain || !ELO_RESULTS.has(result)) return null;
      return { kind: 'elo', domain, result };
    }
    case 'suitability': {
      const model = trimmed(o.model);
      const leafId = trimmed(o.leafId);
      if (!model || !leafId || typeof o.success !== 'boolean') return null;
      return { kind: 'suitability', model, leafId, success: o.success };
    }
    default:
      return null;
  }
}
|
|
72
|
+
|
|
73
|
+
/**
 * Validate an outcome before it is appended to the buffer, applying the same
 * rules as classifyOutcome.
 *
 * @param {*} input - candidate outcome
 * @returns {object|null} the canonical line object to write, or null if invalid
 */
export function validateForLog(input) {
  const intent = classifyOutcome(input);
  if (!intent) return null;
  if (intent.kind === 'elo') {
    const { domain, result } = intent;
    return { kind: 'elo', domain, result };
  }
  const { model, leafId, success } = intent;
  return { kind: 'suitability', model, leafId, success };
}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
// Session insight harvester — read native Claude Code outcome trails and
|
|
2
|
+
// aggregate them into a GATED improvement digest for BYAN.
|
|
3
|
+
//
|
|
4
|
+
// Philosophy (the whole point): OBSERVE and PROPOSE, never silently self-modify.
|
|
5
|
+
// BYAN already has advisory learning surfaces (ELO trust, the suitability
|
|
6
|
+
// ledger) the agent updates by hand; the native hooks already leave outcome
|
|
7
|
+
// trails on disk. This module closes the loop by READING those trails and
|
|
8
|
+
// surfacing a digest with GATED proposals. It writes nothing back to a behavior
|
|
9
|
+
// surface (routing / personas / mantras): applying any change stays a human
|
|
10
|
+
// decision. An agent that rewrote its own routing on a heuristic would be the
|
|
11
|
+
// exact silent-downgrade BYAN exists to prevent.
|
|
12
|
+
//
|
|
13
|
+
// The aggregation is PURE (no I/O) so it is exhaustively unit-testable; the I/O
|
|
14
|
+
// entry takes an injected reader, mirroring template-sync.js / stub-sync.js.
|
|
15
|
+
//
|
|
16
|
+
// Trails consumed (shapes verified against the live repo):
|
|
17
|
+
// _byan-output/tool-log.jsonl post line {phase:'post', tool, ok, est_output_tokens?}
|
|
18
|
+
// .byan-strict/audit.log {event:'self_verify', verdict:'gap', findings:[]}
|
|
19
|
+
// _byan-output/suitability-ledger.json { "model::leaf": {model, leafId, successes, failures} }
|
|
20
|
+
// _byan/memoire/elo-profile.json { domains: { <domain>: {rating, blocked_streak, ...} } }
|
|
21
|
+
|
|
22
|
+
import fs from 'node:fs';
|
|
23
|
+
import path from 'node:path';
|
|
24
|
+
|
|
25
|
+
/**
 * Parse a JSONL blob into an array of values. Empty lines, lines that fail
 * JSON.parse, and lines that parse to a falsy value are skipped.
 *
 * @param {string} text - raw JSONL contents (falsy input yields [])
 * @returns {Array} parsed values in file order
 */
export function parseJsonl(text) {
  if (!text) return [];
  const rows = [];
  for (const line of text.split('\n')) {
    if (!line) continue;
    let value = null;
    try {
      value = JSON.parse(line);
    } catch {
      value = null;
    }
    if (value) rows.push(value);
  }
  return rows;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Summarize tool health from tool-log entries.
 *
 * Only entries with phase === 'post' are counted; a failure is a post entry
 * whose `ok` is exactly false. Entries without a tool name are tallied under
 * 'unknown'.
 *
 * @param {Array<object>} [toolLogEntries] - parsed tool-log lines
 * @returns {{calls: number, failures: number, failureRate: number,
 *   topFailing: Array<{tool: string, count: number}>, estOutputTokens: number}}
 *   failureRate is rounded to 3 decimals; topFailing holds at most 5 tools,
 *   most failures first.
 */
export function harvestToolHealth(toolLogEntries) {
  const post = (toolLogEntries || []).filter((e) => e && e.phase === 'post');
  const failures = post.filter((e) => e.ok === false);

  // A Map keeps the tally correct for tool names that collide with
  // Object.prototype members (e.g. 'constructor').
  const failuresByTool = new Map();
  for (const f of failures) {
    const tool = f.tool || 'unknown';
    failuresByTool.set(tool, (failuresByTool.get(tool) || 0) + 1);
  }
  const topFailing = [...failuresByTool]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5)
    .map(([tool, count]) => ({ tool, count }));

  // est_output_tokens may be absent; coerce to a number so a string value
  // cannot turn the sum into string concatenation, and skip anything non-finite.
  const estOutputTokens = post.reduce((sum, e) => {
    const tokens = Number(e.est_output_tokens);
    return Number.isFinite(tokens) ? sum + tokens : sum;
  }, 0);

  return {
    calls: post.length,
    failures: failures.length,
    failureRate: post.length ? +(failures.length / post.length).toFixed(3) : 0,
    topFailing,
    estOutputTokens,
  };
}
|
|
62
|
+
|
|
63
|
+
// Map one gap finding to a coarse theme key. Checks run in order and the first
// match wins; anything unmatched is counted under 'other'.
function normalizeGap(finding) {
  const s = String(finding).toLowerCase();
  // Words starting with "test" (tests, testing), "coverage" anywhere, and the
  // whole words "spec" / "specs". Each alternative carries its own anchors: in
  // the ungrouped /\btest|coverage|spec\b/ the trailing \b bound only to "spec",
  // so plural "specs" was missed.
  if (/\btest|coverage|\bspecs?\b/.test(s)) return 'tests/coverage';
  if (/error|edge|exception|fail|throw/.test(s)) return 'error/edge handling';
  if (/doc|comment|changelog|readme/.test(s)) return 'documentation';
  if (/template|fidelity|sync|twin/.test(s)) return 'template fidelity';
  if (/emoji/.test(s)) return 'emoji';
  if (/scope|downgrade|cut|stub|mvp/.test(s)) return 'scope/downgrade';
  return 'other';
}

/**
 * Cluster self-verify gap findings from the audit log into themes.
 *
 * Only entries with event === 'self_verify', verdict === 'gap' and an array
 * `findings` contribute. A theme is reported as recurring at count >= 2.
 *
 * @param {Array<object>} [auditEntries] - parsed audit-log lines
 * @returns {{totalGapFindings: number,
 *   recurring: Array<{theme: string, count: number, samples: string[]}>}}
 *   recurring is sorted by count, highest first; each theme keeps at most two
 *   samples, each truncated to 100 characters.
 */
export function harvestStrictGaps(auditEntries) {
  const findings = [];
  for (const e of auditEntries || []) {
    if (e && e.event === 'self_verify' && e.verdict === 'gap' && Array.isArray(e.findings)) {
      findings.push(...e.findings);
    }
  }

  const themes = {};
  for (const f of findings) {
    const key = normalizeGap(f);
    if (!themes[key]) themes[key] = { theme: key, count: 0, samples: [] };
    themes[key].count++;
    if (themes[key].samples.length < 2) themes[key].samples.push(String(f).slice(0, 100));
  }

  const recurring = Object.values(themes)
    .filter((t) => t.count >= 2)
    .sort((a, b) => b.count - a.count);
  return { totalGapFindings: findings.length, recurring };
}
|
|
98
|
+
|
|
99
|
+
/**
 * Turn the suitability ledger into keep-rate rows, busiest first.
 *
 * Each ledger value contributes one row unless it is not an object or has no
 * recorded outcomes. Model and leaf come from the value, falling back to the
 * two halves of the "model::leaf" key (and to the whole key for the leaf).
 * keepRate = successes / (successes + failures), rounded to 2 decimals.
 *
 * @param {object} ledger - suitability ledger keyed by "model::leaf"
 * @returns {Array<{model: string, leaf: string, successes: number,
 *   failures: number, n: number, keepRate: number}>} rows sorted by n, descending
 */
export function harvestRouting(ledger) {
  if (!ledger || typeof ledger !== 'object') return [];

  const rows = Object.entries(ledger).flatMap(([key, v]) => {
    if (!v || typeof v !== 'object') return [];
    const successes = Number(v.successes || 0);
    const failures = Number(v.failures || 0);
    const n = successes + failures;
    if (!n) return [];
    const [keyModel, keyLeaf] = key.split('::');
    const model = v.model || keyModel;
    const leaf = v.leafId || keyLeaf || key;
    return [{ model, leaf, successes, failures, n, keepRate: +(successes / n).toFixed(2) }];
  });

  return rows.sort((a, b) => b.n - a.n);
}
|
|
116
|
+
|
|
117
|
+
/**
 * List per-domain trust from the ELO profile, highest rating first.
 *
 * Domains whose entry is not an object or lacks a numeric `rating` are
 * skipped; a missing `blocked_streak` is reported as 0.
 *
 * @param {object} eloProfile - profile with a `domains` map
 * @returns {Array<{domain: string, rating: number, blockedStreak: number}>}
 */
export function harvestEloTrends(eloProfile) {
  const domains = (eloProfile && eloProfile.domains) || {};
  return Object.entries(domains)
    .filter(([, d]) => d && typeof d === 'object' && typeof d.rating === 'number')
    .map(([domain, d]) => ({ domain, rating: d.rating, blockedStreak: d.blocked_streak || 0 }))
    .sort((a, b) => b.rating - a.rating);
}
|
|
127
|
+
|
|
128
|
+
/**
 * Assemble the digest and derive its proposals. Every proposal carries
 * gated: true; this function only builds data and applies nothing.
 *
 * Proposal thresholds:
 *   - tool-reliability: failureRate > 0.1 and at least one failing tool
 *   - recurring-gap:    a recurring theme with count >= 3
 *   - routing:          a row with n >= 5 and keepRate < 0.5
 *
 * @param {object} [parts]
 * @param {object} [parts.toolHealth] - tool health summary
 * @param {object} [parts.gaps] - strict-gap summary with a `recurring` list
 * @param {Array<object>} [parts.routing] - routing keep-rate rows
 * @param {Array<object>} [parts.elo] - ELO trend rows
 * @returns {{toolHealth: object|null, recurringGaps: object,
 *   routingOutcomes: Array, eloTrends: Array, proposals: Array}}
 */
export function buildDigest({ toolHealth, gaps, routing, elo } = {}) {
  const proposals = [];
  const propose = (kind, suggestion) => {
    proposals.push({ kind, gated: true, suggestion });
  };

  if (toolHealth && toolHealth.failureRate > 0.1 && toolHealth.topFailing.length) {
    const worst = toolHealth.topFailing[0];
    propose(
      'tool-reliability',
      `Tool failure rate ${toolHealth.failureRate}; top offender ${worst.tool} (${worst.count}). Investigate before relying on it.`
    );
  }

  const recurringThemes = (gaps && gaps.recurring) || [];
  for (const theme of recurringThemes) {
    if (theme.count < 3) continue;
    propose(
      'recurring-gap',
      `Recurring self-verify gap "${theme.theme}" (${theme.count}x). Consider a pre-build checklist item.`
    );
  }

  for (const row of routing || []) {
    if (!(row.n >= 5 && row.keepRate < 0.5)) continue;
    propose(
      'routing',
      `Cheap model ${row.model} underperforms on "${row.leaf}" (keepRate ${row.keepRate}, n=${row.n}). Consider keeping that leaf deep.`
    );
  }

  return {
    toolHealth: toolHealth || null,
    recurringGaps: gaps || { totalGapFindings: 0, recurring: [] },
    routingOutcomes: routing || [],
    eloTrends: elo || [],
    proposals,
  };
}
|
|
169
|
+
|
|
170
|
+
// Human-readable render of a digest (for the CLI and the skill).
|
|
171
|
+
/**
 * Render a digest as plain text for the CLI and the skill.
 *
 * Sections with no data are omitted, except "Recurring gaps", which prints
 * `none`. At most 8 routing rows and 6 ELO rows are shown. A partial digest
 * (missing lists, or a `recurringGaps` object without `recurring`) renders as
 * if those lists were empty rather than throwing.
 *
 * @param {object} d - Digest shaped like `{ toolHealth, recurringGaps,
 *   routingOutcomes, eloTrends, proposals }`.
 * @returns {string} Newline-joined report.
 */
export function renderDigest(d) {
  const digest = d || {};
  const listOf = (value) => (Array.isArray(value) ? value : []);

  const lines = ['BYAN session insight digest', ''];

  const health = digest.toolHealth;
  if (health) {
    lines.push(
      `Tool health: ${health.calls} calls, ${health.failures} failures (rate ${health.failureRate}), ~${health.estOutputTokens} output tokens.`
    );
    const topFailing = listOf(health.topFailing);
    if (topFailing.length) {
      lines.push(` Top failing: ${topFailing.map((t) => `${t.tool}(${t.count})`).join(', ')}`);
    }
  }

  const recurring = listOf(digest.recurringGaps && digest.recurringGaps.recurring);
  lines.push(`Recurring gaps: ${recurring.map((g) => `${g.theme}(${g.count})`).join(', ') || 'none'}`);

  const routingOutcomes = listOf(digest.routingOutcomes);
  if (routingOutcomes.length) {
    lines.push('Routing outcomes (cheap-model keep-rate):');
    for (const r of routingOutcomes.slice(0, 8)) {
      lines.push(` ${r.model}::${r.leaf} -> keep ${r.keepRate} (n=${r.n})`);
    }
  }

  const eloTrends = listOf(digest.eloTrends);
  if (eloTrends.length) {
    lines.push(`ELO trends: ${eloTrends.slice(0, 6).map((e) => `${e.domain}=${e.rating}`).join(', ')}`);
  }

  const proposals = listOf(digest.proposals);
  lines.push('', `Proposals (GATED — human ratifies, nothing auto-applied): ${proposals.length}`);
  for (const p of proposals) lines.push(` [${p.kind}] ${p.suggestion}`);

  return lines.join('\n');
}
|
|
195
|
+
|
|
196
|
+
// I/O entry: read the trails under rootDir (missing trail -> empty, so the digest
|
|
197
|
+
// self-disables gracefully on a fresh checkout) and build the digest.
|
|
198
|
+
/**
 * I/O entry point: read the four outcome trails under `rootDir` and build the
 * digest from them.
 *
 * A trail that cannot be read contributes an empty string, and a JSON trail
 * that is empty or unparsable contributes `null`, so a checkout without any
 * trail still produces a digest.
 *
 * @param {object} [options]
 * @param {string} [options.rootDir] - Directory the trail paths are resolved against.
 * @param {{readFileSync: Function}} [options.io] - File-system facade; defaults
 *   to `node:fs`, injectable for tests.
 * @returns {object} Whatever `buildDigest` returns for the harvested inputs.
 */
export function harvest({ rootDir, io = fs } = {}) {
  // Text of a trail relative to rootDir; any read error degrades to ''.
  function textOf(relativePath) {
    let content = '';
    try {
      content = io.readFileSync(path.join(rootDir, relativePath), 'utf8');
    } catch {
      content = '';
    }
    return content;
  }

  // Parsed JSON of a trail; empty or malformed content degrades to null.
  function jsonOf(relativePath) {
    const raw = textOf(relativePath);
    if (raw) {
      try {
        return JSON.parse(raw);
      } catch {
        // fall through to the null result below
      }
    }
    return null;
  }

  // Trails are read and harvested one after another, in this fixed order.
  const toolHealth = harvestToolHealth(parseJsonl(textOf('_byan-output/tool-log.jsonl')));
  const gaps = harvestStrictGaps(parseJsonl(textOf('.byan-strict/audit.log')));
  const routing = harvestRouting(jsonOf('_byan-output/suitability-ledger.json'));
  const elo = harvestEloTrends(jsonOf('_byan/memoire/elo-profile.json'));

  return buildDigest({ toolHealth, gaps, routing, elo });
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
// Outcome buffer — the append-only capture file the advisory auto-feed drains.
|
|
2
|
+
//
|
|
3
|
+
// byan_outcome_log appends one validated outcome per line here during a turn; the
|
|
4
|
+
// drain-advisory Stop hook reads it at end of turn and records each new line into
|
|
5
|
+
// the advisory ledgers, advancing a line cursor for idempotency. Both sides take an
|
|
6
|
+
// injected `io` so the logic is testable without touching the real filesystem, and
|
|
7
|
+
// every operation is best-effort: a capture buffer must never break a turn.
|
|
8
|
+
|
|
9
|
+
import fs from 'node:fs';
|
|
10
|
+
import path from 'node:path';
|
|
11
|
+
|
|
12
|
+
// Both capture files live in the same output directory under the project root.
const OUTPUT_DIR = '_byan-output';

/** Root-relative path of the append-only outcome buffer. */
export const BUFFER_REL = path.join(OUTPUT_DIR, 'pending-outcomes.jsonl');

/** Root-relative path of the drain cursor file. */
export const CURSOR_REL = path.join(OUTPUT_DIR, '.advisory-cursor.json');

// Anchor a root-relative path under rootDir.
function underRoot(rootDir, relativePath) {
  return path.join(rootDir, relativePath);
}

// Location of the outcome buffer for the given root.
function bufferPath(rootDir) {
  return underRoot(rootDir, BUFFER_REL);
}

// Location of the drain cursor for the given root.
function cursorPath(rootDir) {
  return underRoot(rootDir, CURSOR_REL);
}
|
|
21
|
+
|
|
22
|
+
// Append one outcome object as a jsonl line. Best-effort: returns true on write,
|
|
23
|
+
// false if the write threw (the caller stays safe).
|
|
24
|
+
/**
 * Append one outcome to the buffer as a single JSON line. Best-effort: it
 * never throws.
 *
 * @param {*} outcome - Value to serialize with `JSON.stringify`.
 * @param {object} [options]
 * @param {string} [options.rootDir] - Project root the buffer path is built from.
 * @param {{mkdirSync: Function, appendFileSync: Function}} [options.io] -
 *   File-system facade; defaults to `node:fs`.
 * @returns {boolean} true when a line was written; false when the outcome has
 *   no JSON representation or any step threw.
 */
export function appendOutcome(outcome, { rootDir, io = fs } = {}) {
  try {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols; concatenating that would append the literal line "undefined"
    // and corrupt the jsonl buffer, so refuse before touching the disk.
    const serialized = JSON.stringify(outcome);
    if (typeof serialized !== 'string') return false;

    const p = bufferPath(rootDir);
    io.mkdirSync(path.dirname(p), { recursive: true });
    io.appendFileSync(p, serialized + '\n');
    return true;
  } catch {
    return false;
  }
}
|
|
34
|
+
|
|
35
|
+
// Read the raw buffer text, or '' if absent/unreadable.
|
|
36
|
+
/**
 * Fetch the raw text of the outcome buffer.
 *
 * @param {object} [options]
 * @param {string} [options.rootDir] - Project root the buffer path is built from.
 * @param {{readFileSync: Function}} [options.io] - File-system facade; defaults
 *   to `node:fs`.
 * @returns {string} The buffer content read as utf8, or '' when reading throws.
 */
export function readBuffer({ rootDir, io = fs } = {}) {
  let text;
  try {
    text = io.readFileSync(bufferPath(rootDir), 'utf8');
  } catch {
    // Absent or unreadable buffer: report it as empty.
    text = '';
  }
  return text;
}
|
|
43
|
+
|
|
44
|
+
// Read the drain cursor (number of buffer lines already recorded), or 0.
|
|
45
|
+
/**
 * Load the drain cursor: how many buffer lines have already been recorded.
 *
 * @param {object} [options]
 * @param {string} [options.rootDir] - Project root the cursor path is built from.
 * @param {{readFileSync: Function}} [options.io] - File-system facade; defaults
 *   to `node:fs`.
 * @returns {number} The stored `drained` value when it is a non-negative
 *   integer; 0 when the file is missing, unparsable, or holds anything else.
 */
export function readCursor({ rootDir, io = fs } = {}) {
  let stored;
  try {
    stored = JSON.parse(io.readFileSync(cursorPath(rootDir), 'utf8'));
  } catch {
    return 0;
  }

  // Only an object can carry the counter; primitives and null have none.
  const drained = stored && typeof stored === 'object' ? stored.drained : undefined;
  if (Number.isInteger(drained) && drained >= 0) {
    return drained;
  }
  return 0;
}
|
|
53
|
+
|
|
54
|
+
// Persist the drain cursor. Best-effort.
|
|
55
|
+
/**
 * Store the drain cursor as `{"drained": <value>}` followed by a newline.
 * Best-effort: it never throws.
 *
 * @param {*} drained - Value written under the `drained` key.
 * @param {object} [options]
 * @param {string} [options.rootDir] - Project root the cursor path is built from.
 * @param {{mkdirSync: Function, writeFileSync: Function}} [options.io] -
 *   File-system facade; defaults to `node:fs`.
 * @returns {boolean} true when the write completed, false when any step threw.
 */
export function writeCursor(drained, { rootDir, io = fs } = {}) {
  let written = false;
  try {
    const target = cursorPath(rootDir);
    // Make sure the containing directory exists before writing.
    io.mkdirSync(path.dirname(target), { recursive: true });
    const payload = JSON.stringify({ drained }) + '\n';
    io.writeFileSync(target, payload);
    written = true;
  } catch {
    written = false;
  }
  return written;
}
|
|
@@ -9,6 +9,9 @@ import {
|
|
|
9
9
|
ListToolsRequestSchema,
|
|
10
10
|
} from '@modelcontextprotocol/sdk/types.js';
|
|
11
11
|
import { dispatch } from './lib/dispatch.js';
|
|
12
|
+
import { harvest as harvestInsights, renderDigest as renderInsightDigest } from './lib/insight-harvest.js';
|
|
13
|
+
import { appendOutcome } from './lib/outcome-buffer.js';
|
|
14
|
+
import { validateForLog } from './lib/advisory-autofeed.js';
|
|
12
15
|
import { readSoul, appendSoulMemory } from './lib/soul.js';
|
|
13
16
|
import { listSessions, readSessionEvents, searchSessions } from './lib/copilot.js';
|
|
14
17
|
import {
|
|
@@ -545,6 +548,34 @@ const tools = [
|
|
|
545
548
|
additionalProperties: false,
|
|
546
549
|
},
|
|
547
550
|
},
|
|
551
|
+
{
|
|
552
|
+
name: 'byan_insight_digest',
|
|
553
|
+
description:
|
|
554
|
+
'Harvest native Claude Code outcome trails (tool-log, strict-audit gaps, the suitability ledger, ELO) into a GATED improvement digest for BYAN. Read-only: it OBSERVES and PROPOSES; every proposal is gated for a human to ratify, nothing is auto-applied to routing / personas / mantras. Returns { toolHealth, recurringGaps, routingOutcomes, eloTrends, proposals }.',
|
|
555
|
+
inputSchema: {
|
|
556
|
+
type: 'object',
|
|
557
|
+
properties: {},
|
|
558
|
+
additionalProperties: false,
|
|
559
|
+
},
|
|
560
|
+
},
|
|
561
|
+
{
|
|
562
|
+
name: 'byan_outcome_log',
|
|
563
|
+
description:
|
|
564
|
+
'Log one ADVISORY outcome to the auto-feed buffer (cheap append; it never writes a ledger directly). The drain-advisory Stop hook records buffered outcomes into the ELO / suitability ledgers at end of turn, so BYAN auto-learns without the agent recording by hand. kind=elo needs { domain, result: VALIDATED|PARTIAL|BLOCKED }; kind=suitability needs { model, leafId, success }. Advisory-only: behavior surfaces (routing / personas / mantras) are never written.',
|
|
565
|
+
inputSchema: {
|
|
566
|
+
type: 'object',
|
|
567
|
+
properties: {
|
|
568
|
+
kind: { type: 'string', enum: ['elo', 'suitability'] },
|
|
569
|
+
domain: { type: 'string', description: 'elo: the technical domain of the claim' },
|
|
570
|
+
result: { type: 'string', enum: ['VALIDATED', 'PARTIAL', 'BLOCKED'], description: 'elo: the claim verdict' },
|
|
571
|
+
model: { type: 'string', description: 'suitability: the cheap model tier/id' },
|
|
572
|
+
leafId: { type: 'string', description: 'suitability: the workflow leaf' },
|
|
573
|
+
success: { type: 'boolean', description: 'suitability: did the cheap model survive adversarial review' },
|
|
574
|
+
},
|
|
575
|
+
required: ['kind'],
|
|
576
|
+
additionalProperties: false,
|
|
577
|
+
},
|
|
578
|
+
},
|
|
548
579
|
{
|
|
549
580
|
name: 'byan_strict_lock_scope',
|
|
550
581
|
description:
|
|
@@ -1383,6 +1414,33 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1383
1414
|
};
|
|
1384
1415
|
}
|
|
1385
1416
|
|
|
1417
|
+
if (name === 'byan_insight_digest') {
|
|
1418
|
+
const rootDir = process.env.CLAUDE_PROJECT_DIR || process.cwd();
|
|
1419
|
+
const digest = harvestInsights({ rootDir });
|
|
1420
|
+
return {
|
|
1421
|
+
content: [
|
|
1422
|
+
{
|
|
1423
|
+
type: 'text',
|
|
1424
|
+
text: JSON.stringify({ gated: true, digest, render: renderInsightDigest(digest) }, null, 2),
|
|
1425
|
+
},
|
|
1426
|
+
],
|
|
1427
|
+
};
|
|
1428
|
+
}
|
|
1429
|
+
|
|
1430
|
+
if (name === 'byan_outcome_log') {
|
|
1431
|
+
const line = validateForLog(args);
|
|
1432
|
+
if (!line) {
|
|
1433
|
+
return {
|
|
1434
|
+
content: [{ type: 'text', text: JSON.stringify({ logged: false, reason: 'invalid_outcome' }) }],
|
|
1435
|
+
};
|
|
1436
|
+
}
|
|
1437
|
+
const rootDir = process.env.CLAUDE_PROJECT_DIR || process.cwd();
|
|
1438
|
+
const ok = appendOutcome(line, { rootDir });
|
|
1439
|
+
return {
|
|
1440
|
+
content: [{ type: 'text', text: JSON.stringify({ logged: ok, outcome: line }) }],
|
|
1441
|
+
};
|
|
1442
|
+
}
|
|
1443
|
+
|
|
1386
1444
|
if (name === 'byan_strict_lock_scope') {
|
|
1387
1445
|
const r = strictLockScope({
|
|
1388
1446
|
scopeText: args.scopeText,
|
|
@@ -287,24 +287,25 @@ very different optimal targets depending on whether they run **alongside
|
|
|
287
287
|
siblings** (parallel) or **in sequence**. The v2 router adds a
|
|
288
288
|
`parallelizable` axis and emits an **execution strategy**, not a model.
|
|
289
289
|
|
|
290
|
-
Implementation :
|
|
291
|
-
|
|
290
|
+
Implementation : the MCP tool `byan_dispatch`
|
|
291
|
+
(`_byan/mcp/byan-mcp-server/lib/dispatch.js`), the single source of truth. The
|
|
292
|
+
strategy comes from the score + `parallelizable` ; the model tier is a separate
|
|
293
|
+
axis, derived from the task NATURE via `native-tiers.js`.
|
|
292
294
|
|
|
293
295
|
```
|
|
294
296
|
score < 15 → main-thread
|
|
295
297
|
score 15-39 + parallelizable: true → agent-subagent-worktree
|
|
296
|
-
score 15-39 + parallelizable: false → mcp-worker
|
|
297
|
-
score >= 40 → main-thread
|
|
298
|
+
score 15-39 + parallelizable: false → mcp-worker
|
|
299
|
+
score >= 40 → main-thread (heavy)
|
|
298
300
|
```
|
|
299
301
|
|
|
300
302
|
Rationale :
|
|
301
303
|
|
|
302
304
|
| Strategy | When | Why |
|
|
303
305
|
|---|---|---|
|
|
304
|
-
| `main-thread` | Trivial task | Spawning
|
|
306
|
+
| `main-thread` | Trivial or heavy task | Spawning costs more than solving inline (trivial), or the work is heavy and stays in the main thread. |
|
|
305
307
|
| `agent-subagent-worktree` | Medium parallel | Claude Code Agent tool with `isolation: "worktree"` amortizes boot cost across the wall-clock savings. |
|
|
306
|
-
| `mcp-worker
|
|
307
|
-
| `main-thread-opus` | Complex | Reasoning depth needed; subagent boot + context handoff would waste more than the delegation saves. |
|
|
308
|
+
| `mcp-worker` | Medium sequential | Delegate to a worker via MCP tool — no subagent boot, cheaper than the main thread. The model tier is set separately, by nature. |
|
|
308
309
|
|
|
309
310
|
The score threshold of 15 is where Claude Code `Agent` tool boot overhead
|
|
310
311
|
(~5-10k tokens for system prompt + tools) stops being worth it for
|
|
@@ -111,8 +111,8 @@ INIT
|
|
|
111
111
|
|------------------|-------|-----------|
|
|
112
112
|
| < 15 | `main-thread` | Inline dans le contexte courant, zéro overhead de délégation |
|
|
113
113
|
| < 40 + parallélisable | `agent-subagent-worktree` | Agent tool Claude Code avec isolation worktree |
|
|
114
|
-
| < 40 séquentiel | `mcp-worker
|
|
115
|
-
| ≥ 40 | `main-thread
|
|
114
|
+
| < 40 séquentiel | `mcp-worker` | Worker léger via MCP (le tier de modèle vient de la nature, pas de la taille) |
|
|
115
|
+
| ≥ 40 | `main-thread` | Garde en main thread (lourd) ; modèle hérité de la session |
|
|
116
116
|
|
|
117
117
|
> Le score (0-100) est estimé depuis la complexité de la tâche (longueur si absent). Appeler `byan_dispatch` pour le calcul — ne pas réinventer les seuils ici.
|
|
118
118
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "create-byan-agent",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.25.0",
|
|
4
4
|
"description": "BYAN v2.8 - Intelligent AI agent creator with ELO trust system + scientific fact-check + Hermes universal dispatcher + native Claude Code integration (hooks, skills, MCP server). Multi-platform (Copilot CLI, Claude Code, Codex). Merise Agile + TDD + 71 Mantras. ~54% LLM cost savings.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* - Integration with RateLimitTracker + SharedStateStore + SessionBridge
|
|
9
9
|
*
|
|
10
10
|
* Sits ABOVE existing BYAN routers:
|
|
11
|
-
* LoadBalancer (picks PLATFORM) →
|
|
11
|
+
* LoadBalancer (picks PLATFORM) → byan_dispatch (picks STRATEGY + model TIER)
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
const { EventEmitter } = require('events');
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Execution strategy router.
|
|
3
|
-
*
|
|
4
|
-
* Decides WHERE a task runs (not which model). Four strategies, routed by
|
|
5
|
-
* complexity score and whether the task is parallelizable with siblings:
|
|
6
|
-
*
|
|
7
|
-
* main-thread score < 15
|
|
8
|
-
* agent-subagent-worktree score 15-39 + parallelizable = true
|
|
9
|
-
* mcp-worker-haiku score 15-39 + sequential
|
|
10
|
-
* main-thread-opus score >= 40
|
|
11
|
-
*
|
|
12
|
-
* Complementary to EconomicDispatcher (which picks the model).
|
|
13
|
-
*/
|
|
14
|
-
|
|
15
|
-
class ExecutionRouter {
  /**
   * Pick an execution strategy from a complexity score and the
   * `parallelizable` flag.
   *
   * The score is `complexity` when it is a finite number; otherwise it is
   * estimated from the task text as one point per 10 characters, capped at
   * 100. A NaN or infinite `complexity` is treated as absent, because it would
   * fail every threshold comparison and fall through to the heaviest strategy.
   *
   * @param {{task?: string, complexity?: number, parallelizable?: boolean}} input
   *   Routing request; `null` is treated like an empty request.
   * @returns {{score: number, strategy: string, reasoning: string, parallelizable: boolean}}
   */
  route(input = {}) {
    // A default parameter only covers undefined; `?? {}` also covers null.
    const { task, complexity, parallelizable } = input ?? {};

    const score = Number.isFinite(complexity)
      ? complexity
      : Math.min(100, Math.floor((task?.length || 0) / 10));

    // Only a literal `true` opts in to parallel execution.
    const isPar = parallelizable === true;

    if (score < 15) {
      return {
        score,
        strategy: 'main-thread',
        reasoning: `Score ${score} < 15. Inline in current context, no delegation overhead.`,
        parallelizable: isPar,
      };
    }

    if (score < 40 && isPar) {
      return {
        score,
        strategy: 'agent-subagent-worktree',
        reasoning: `Score ${score} + parallelizable. Spawn Claude Code Agent tool with worktree isolation.`,
        parallelizable: isPar,
      };
    }

    if (score < 40) {
      return {
        score,
        strategy: 'mcp-worker-haiku',
        reasoning: `Score ${score}, sequential. Delegate to lightweight Haiku worker via MCP.`,
        parallelizable: isPar,
      };
    }

    return {
      score,
      strategy: 'main-thread-opus',
      reasoning: `Score ${score} >= 40. Complex task, keep in main thread with Opus reasoning.`,
      parallelizable: isPar,
    };
  }
}
|
|
65
|
-
|
|
66
|
-
module.exports = ExecutionRouter;
|