@fiale-plus/pi-rogue 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/node_modules/@fiale-plus/pi-rogue-advisor/README.md +1 -0
- package/node_modules/@fiale-plus/pi-rogue-advisor/src/binary-gate-features.test.ts +8 -0
- package/node_modules/@fiale-plus/pi-rogue-advisor/src/binary-gate-features.ts +7 -0
- package/node_modules/@fiale-plus/pi-rogue-advisor/src/router.test.ts +26 -0
- package/node_modules/@fiale-plus/pi-rogue-advisor/src/router.ts +10 -1
- package/node_modules/@fiale-plus/pi-rogue-orchestration/README.md +3 -3
- package/node_modules/@fiale-plus/pi-rogue-orchestration/package.json +3 -0
- package/node_modules/@fiale-plus/pi-rogue-orchestration/skills/orchestration/SKILL.md +3 -2
- package/node_modules/@fiale-plus/pi-rogue-orchestration/src/goal.test.ts +65 -2
- package/node_modules/@fiale-plus/pi-rogue-orchestration/src/goal.ts +84 -4
- package/node_modules/@fiale-plus/pi-rogue-orchestration/src/loop.ts +3 -0
- package/node_modules/@fiale-plus/pi-rogue-orchestration/src/novelty-guard.test.ts +43 -0
- package/node_modules/@fiale-plus/pi-rogue-orchestration/src/novelty-guard.ts +96 -11
- package/node_modules/@fiale-plus/pi-rogue-router/README.md +45 -6
- package/node_modules/@fiale-plus/pi-rogue-router/src/binary-gate.test.ts +88 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/binary-gate.ts +232 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/cli.ts +123 -9
- package/node_modules/@fiale-plus/pi-rogue-router/src/completions.ts +39 -16
- package/node_modules/@fiale-plus/pi-rogue-router/src/config-extension.test.ts +111 -4
- package/node_modules/@fiale-plus/pi-rogue-router/src/config.ts +17 -2
- package/node_modules/@fiale-plus/pi-rogue-router/src/extension.ts +67 -7
- package/node_modules/@fiale-plus/pi-rogue-router/src/index.ts +4 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/observe.ts +76 -5
- package/node_modules/@fiale-plus/pi-rogue-router/src/outcomes.ts +130 -6
- package/node_modules/@fiale-plus/pi-rogue-router/src/reports.test.ts +92 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/reports.ts +116 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/sharpening.test.ts +223 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/sharpening.ts +344 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/teacher-runner.test.ts +126 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/teacher-runner.ts +238 -0
- package/node_modules/@fiale-plus/pi-rogue-router/src/v1-telemetry.test.ts +54 -1
- package/package.json +1 -1
|
@@ -11,24 +11,63 @@ npm run router:rebuild -- --session ./current-session.jsonl --workspace-diff --o
|
|
|
11
11
|
npm run router:decide -- --checkpoint-file .pi/router/checkpoints.jsonl --ledger .pi/router/events.jsonl
|
|
12
12
|
npm run router:cards -- --events .pi/router/events.jsonl --output .pi/router/model-cards.jsonl
|
|
13
13
|
npm run router:outcomes -- --checkpoint-file .pi/router/checkpoints.jsonl --events .pi/router/events.jsonl --output .pi/router/outcomes.jsonl
|
|
14
|
+
npm run router:outcome-enrich -- --outcomes .pi/router/outcomes.jsonl --checkpoint-file .pi/router/checkpoints.jsonl --events .pi/router/events.jsonl --output .pi/router/outcomes.enriched.jsonl
|
|
14
15
|
npm run router:teacher-requests -- --checkpoint-file .pi/router/checkpoints.jsonl --output .pi/router/teacher-requests.jsonl --teacher openai-codex/gpt-5.5
|
|
15
|
-
npm run router:
|
|
16
|
+
npm run router:teacher-label -- --requests .pi/router/teacher-requests.jsonl --teacher-output .pi/router/teacher-decisions.jsonl --labels .pi/router/labels/teacher-labels.jsonl --teacher openai-codex/gpt-5.5
|
|
17
|
+
npm run router:reflect -- --checkpoint-file .pi/router/checkpoints.jsonl --labels .pi/router/labels/teacher-labels.jsonl --reflection .pi/router/reflections/session.md --teacher openai-codex/gpt-5.5 --teacher-output .pi/router/teacher-decisions.jsonl
|
|
16
18
|
npm run router:dataset -- --checkpoint-file .pi/router/checkpoints.jsonl --events .pi/router/events.jsonl --outcomes .pi/router/outcomes.jsonl --labels .pi/router/labels/teacher-labels.jsonl --output .pi/router/training.jsonl
|
|
19
|
+
npm run router:gate-train -- --dataset .pi/router/training.train.jsonl --eval-dataset .pi/router/training.eval.jsonl --artifact .pi/router/binary-gate.json --report .pi/router/binary-gate-report.json
|
|
20
|
+
npm run router:report -- --events .pi/router/events.jsonl --outcomes .pi/router/outcomes.jsonl --dataset .pi/router/training.eval.jsonl --gate-report .pi/router/binary-gate-report.json --output .pi/router/report.json --markdown .pi/router/report.md
|
|
21
|
+
npm run router:sharpen -- --events .pi/router/events.jsonl --outcomes .pi/router/outcomes.jsonl --cards .pi/router/model-cards.jsonl --output .pi/router/sharpening-hints.json
|
|
17
22
|
npm run router:shadow -- --checkpoint-file .pi/router/checkpoints.jsonl --ledger .pi/router/events.jsonl --output .pi/router/shadow-report.json
|
|
18
23
|
|
|
19
|
-
# Live
|
|
20
|
-
# /router on|off|
|
|
24
|
+
# Live router extension commands:
|
|
25
|
+
# /router status|help|on|off|mode|profile|print|profiles|models|configure|cycle
|
|
26
|
+
# /router mode observe # default: recommendations only
|
|
27
|
+
# /router mode auto_model # explicit: apply model switches only
|
|
28
|
+
# /router profile spark-smart
|
|
29
|
+
# /router print mismatch_only|all|off
|
|
21
30
|
# ctrl+alt+p cycles router profiles (Ctrl-P is reserved by Pi model cycling).
|
|
22
31
|
```
|
|
23
32
|
|
|
24
33
|
## V1 telemetry notes
|
|
25
34
|
|
|
26
|
-
Router v1
|
|
35
|
+
Router v1 defaults to observe-only. It adds outcome skeletons, stronger diff/error fingerprints, teacher-label request export, binary gate dataset export, and subagent-aware telemetry schemas. It does not spawn agents/subagents or promote policies automatically. The explicit `auto_model` mode may only switch the active model for future turns.
|
|
27
36
|
|
|
28
|
-
Live config is repo-global at `.pi/router/config.json`, while mutable live state and route ledgers are isolated per Pi session under `.pi/router/sessions/<session-key>/state.json` and `events.jsonl`.
|
|
37
|
+
Live config is repo-global at `.pi/router/config.json`, while mutable live state and route ledgers are isolated per Pi session under `.pi/router/sessions/<session-key>/state.json` and `events.jsonl`. The default `mode` is `observe`; `auto_model` must be explicitly selected and does not alter agents, subagents, tools, or execution paths.
|
|
29
38
|
|
|
30
39
|
- Diff telemetry stores counts and hashes from `git diff`, not raw patches. Offline rebuilds remain deterministic by default; use `--workspace-diff` only with one current live session/worktree snapshot.
|
|
40
|
+
- `router:outcome-enrich` upgrades conservative outcome skeletons with checkpoint/event-derived verifier, rework, interruption, override, and accepted-diff signals.
|
|
31
41
|
- Error fingerprints normalize paths, line numbers, timestamps, UUIDs, ports, and object ids before hashing.
|
|
32
|
-
- `router:teacher-requests` writes local JSONL requests for an explicit teacher model;
|
|
42
|
+
- `router:teacher-requests` writes local JSONL requests for an explicit teacher model; `router:teacher-label` calls the explicitly configured teacher and writes decision/label JSONL artifacts.
|
|
33
43
|
- `router:dataset` excludes `local-rule` labels by default so a future model does not merely imitate the current rules.
|
|
44
|
+
- `router:gate-train` trains a local binary continue-vs-intervene gate and evaluates it on a distinct labeled eval dataset; local-rule labels are rejected as training/eval truth and promotion remains manual/eval-gated.
|
|
45
|
+
- `router:report` writes JSON plus optional Markdown summaries across route ledgers, enriched outcomes, dataset labels, and gate evaluation reports.
|
|
46
|
+
- `router:sharpen` writes local-only `pi-router.sharpening-hints.v1` recommendations from route ledgers, optional outcomes, and optional capability cards. Hints include sample-size/confidence/auto-use guardrails, repo-local learning policy, and provenance, but never mutate config or promote policy automatically.
|
|
47
|
+
|
|
48
|
+
### Automated, upgrade-safe sharpening persistence
|
|
49
|
+
|
|
50
|
+
Use this one-shot command for cron/background automation:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
npm run router:sharpen:auto -- --workspace .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
By default it stores artifacts at:
|
|
57
|
+
|
|
58
|
+
- Linux/BSD: `<XDG_DATA_HOME || ~/.local/share>/pi-rogue-router/learning/<repo-name>-<hash>/`
|
|
59
|
+
- macOS: `~/Library/Application Support/pi-rogue-router/learning/<repo-name>-<hash>/`
|
|
60
|
+
|
|
61
|
+
The script:
|
|
62
|
+
- writes `latest.json` and `history/*.json` artifacts;
|
|
63
|
+
- writes `manifest.json` with source fingerprints for change detection;
|
|
64
|
+
- skips recomputation when the inputs are unchanged (unless `--force` is passed);
|
|
65
|
+
- migrates legacy `.pi/router/sharpening-hints.json` into the stable learning directory when present.
|
|
66
|
+
|
|
67
|
+
Cron example:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
*/30 * * * * cd /path/to/pi-rogue && npm run router:sharpen:auto -- --workspace /path/to/pi-rogue
|
|
71
|
+
```
|
|
72
|
+
|
|
34
73
|
- Subagent route/ledger schemas describe parent-child evidence flow, but live autonomous spawning remains out of scope.
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { mkdtempSync, readFileSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { describe, expect, it } from "vitest";
|
|
5
|
+
import { trainBinaryGate, writeBinaryGateTraining } from "./binary-gate.js";
|
|
6
|
+
import type { RouterTrainingRow } from "./dataset.js";
|
|
7
|
+
|
|
8
|
+
/** Builds a path for `name` inside a newly created temporary directory (one directory per call). */
function tempFile(name: string): string {
  const scratchDir = mkdtempSync(join(tmpdir(), "pi-router-gate-"));
  return join(scratchDir, name);
}
|
|
11
|
+
|
|
12
|
+
/**
 * Test fixture: builds a complete training row for the given checkpoint id and gate label.
 *
 * @param id Checkpoint id stored on the row.
 * @param label Binary gate label; also drives the route action, confidence and local rule action.
 * @param overrides Feature values that replace the low-risk defaults below.
 * @param source Label source; defaults to "unknown" for unknown labels and "teacher" otherwise.
 */
function row(id: string, label: "continue" | "intervene" | "unknown", overrides: Partial<RouterTrainingRow["features"]> = {}, source: RouterTrainingRow["labels"]["source"] = label === "unknown" ? "unknown" : "teacher"): RouterTrainingRow {
  const routeActionByLabel = { intervene: "run_verifier", continue: "continue_current", unknown: null } as const;
  const isUnknown = label === "unknown";

  // Defaults describe a healthy, verified, low-churn implementation step.
  const features: RouterTrainingRow["features"] = {
    phase: "implementation",
    activeModel: "qwen",
    provider: "local",
    contextTokensApprox: 1000,
    sameCommandRepeatedCount: 1,
    sameErrorRepeatedCount: 0,
    loopScore: 0.1,
    progressScore: 0.9,
    verifierUsed: true,
    noVerifierUsed: false,
    diffLines: 10,
    diffFilesChanged: 1,
    diffChurnScore: 0.01,
    filesTouched: 1,
    ...overrides,
  };

  return {
    schema: "pi-router.training-row.v1",
    checkpointId: id,
    sessionId: "session-1",
    rawSessionRef: { schema: "pi-router.raw-session-ref.v1", path: "/tmp/session.jsonl", fromEvent: 0, toEvent: 1, fromByte: 0, toByte: 1, contentHash: "hash" },
    features,
    labels: { routeAction: routeActionByLabel[label], binaryGate: label, source, confidence: isUnknown ? null : 0.8 },
    outcome: { taskStatus: "unknown", testsPassedAfter: null, acceptedDiff: null, userOverrodeDecision: null, reworkTurns: null },
    // Anything that is not "intervene" (including "unknown") maps to the continue rule action.
    provenance: { localRuleAction: label === "intervene" ? "run_verifier" : "continue_current", excludedLocalRuleAsTruth: false },
  };
}
|
|
40
|
+
|
|
41
|
+
// Covers in-memory training, the file-writing wrapper, and rejection of unusable label sets.
describe("router binary gate training", () => {
  it("trains a threshold artifact and reports candidate vs rule baseline", () => {
    const riskyTrainFeatures = { phase: "debug", loopScore: 0.8, progressScore: 0.2, sameErrorRepeatedCount: 3, noVerifierUsed: true, verifierUsed: false } as const;
    const riskyEvalFeatures = { phase: "debug", loopScore: 0.7, progressScore: 0.3, sameErrorRepeatedCount: 2 } as const;
    // The unknown row counts toward `rows` but not toward `labeledRows`.
    const trainingSet = [row("continue-1", "continue"), row("intervene-1", "intervene", riskyTrainFeatures), row("unknown-1", "unknown")];
    const holdoutSet = [row("eval-continue-1", "continue"), row("eval-intervene-1", "intervene", riskyEvalFeatures)];

    const trained = trainBinaryGate(trainingSet, holdoutSet, "2026-06-14T00:00:00.000Z");

    expect(trained.artifact).toMatchObject({ schema: "pi-router.binary-gate-artifact.v1", manualPromotionRequired: true, training: { rows: 3, labeledRows: 2 }, evaluation: { rows: 2, labeledRows: 2 } });
    expect(trained.report).toMatchObject({ schema: "pi-router.binary-gate-eval.v1", trainRows: 3, trainLabeledRows: 2, evalRows: 2, evalLabeledRows: 2, manualPromotionRequired: true });
    expect(trained.report.thresholdSweep.length).toBeGreaterThan(1);
    // Every labeled eval row must land in exactly one confusion-matrix cell.
    const { truePositive, trueNegative, falsePositive, falseNegative } = trained.report.candidate;
    expect(truePositive + trueNegative + falsePositive + falseNegative).toBe(2);
  });

  it("writes gate artifact and eval report", () => {
    const toJsonl = (items: RouterTrainingRow[]): string => `${items.map((item) => JSON.stringify(item)).join("\n")}\n`;
    const input = tempFile("training.jsonl");
    const evalInput = tempFile("eval.jsonl");
    const artifact = tempFile("gate.json");
    const report = tempFile("report.json");
    writeFileSync(input, toJsonl([
      row("continue-1", "continue"),
      row("intervene-1", "intervene", { loopScore: 0.9, progressScore: 0.1, noVerifierUsed: true }),
    ]));
    writeFileSync(evalInput, toJsonl([
      row("eval-continue-1", "continue"),
      row("eval-intervene-1", "intervene", { loopScore: 0.8, progressScore: 0.2, sameErrorRepeatedCount: 3 }),
    ]));

    const summary = writeBinaryGateTraining({ trainingRowsPath: input, evalRowsPath: evalInput, artifactPath: artifact, reportPath: report });

    expect(summary).toMatchObject({ schema: "pi-router.binary-gate-train-summary.v1", trainRows: 2, trainLabeledRows: 2, evalRows: 2, evalLabeledRows: 2 });
    const writtenArtifact = JSON.parse(readFileSync(artifact, "utf8"));
    const writtenReport = JSON.parse(readFileSync(report, "utf8"));
    expect(writtenArtifact.schema).toBe("pi-router.binary-gate-artifact.v1");
    expect(writtenReport.schema).toBe("pi-router.binary-gate-eval.v1");

    // Reusing the training file as the eval file must be refused.
    const sameFileRun = () => writeBinaryGateTraining({ trainingRowsPath: input, evalRowsPath: input, artifactPath: tempFile("bad-gate.json"), reportPath: tempFile("bad-report.json") });
    expect(sameFileRun).toThrow(/distinct --eval-dataset/);
  });

  it("rejects unusable labels", () => {
    const balancedEval = () => [row("eval-1", "continue"), row("eval-2", "intervene")];
    const localRuleOnly = [row("rule-1", "continue", {}, "local-rule"), row("rule-2", "intervene", {}, "local-rule")];
    expect(() => trainBinaryGate(localRuleOnly, balancedEval())).toThrow(/no usable/);
    expect(() => trainBinaryGate([row("only-continue", "continue")], balancedEval())).toThrow(/both continue and intervene/);
  });
});
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { dirname, resolve } from "node:path";
|
|
3
|
+
import type { BinaryGateLabel, RouterTrainingRow } from "./dataset.js";
|
|
4
|
+
|
|
5
|
+
/** Schema tag written into every gate artifact produced by this module. */
export const BINARY_GATE_ARTIFACT_SCHEMA = "pi-router.binary-gate-artifact.v1" as const;

/** Schema tag written into every gate evaluation report produced by this module. */
export const BINARY_GATE_EVAL_SCHEMA = "pi-router.binary-gate-eval.v1" as const;
|
|
7
|
+
|
|
8
|
+
/**
 * Serialized outcome of a gate training run: the selected threshold, the
 * scoring weights, and label counts for the training and evaluation datasets.
 */
export interface BinaryGateArtifact {
  schema: typeof BINARY_GATE_ARTIFACT_SCHEMA;
  /** Timestamp string recorded by the trainer. */
  generatedAt: string;
  /** Version string that embeds the selected threshold. */
  policyVersion: string;
  model: {
    kind: "linear-threshold";
    /** Scores at or above this value are classified as "intervene". */
    threshold: number;
    /** Per-feature weights keyed by feature name. */
    weights: Record<string, number>;
  };
  /** Row and label counts for the training dataset. */
  training: {
    rows: number;
    labeledRows: number;
    positiveIntervene: number;
    negativeContinue: number;
  };
  /** Row and label counts for the evaluation dataset. */
  evaluation: {
    rows: number;
    labeledRows: number;
    positiveIntervene: number;
    negativeContinue: number;
  };
  /** Always `true`: artifacts are never promoted automatically. */
  manualPromotionRequired: true;
}
|
|
31
|
+
|
|
32
|
+
/** Binary confusion counts where "intervene" is the positive class and "continue" the negative one. */
export interface ConfusionMatrix {
  /** Labeled intervene, predicted intervene. */
  truePositive: number;
  /** Labeled continue, predicted continue. */
  trueNegative: number;
  /** Labeled continue, predicted intervene. */
  falsePositive: number;
  /** Labeled intervene, predicted continue. */
  falseNegative: number;
}
|
|
38
|
+
|
|
39
|
+
/** Evaluation report comparing the trained threshold gate with the local rule baseline. */
export interface BinaryGateEvalReport {
  schema: typeof BINARY_GATE_EVAL_SCHEMA;
  /** Timestamp string recorded by the trainer. */
  generatedAt: string;
  /** Version string that embeds the selected threshold. */
  policyVersion: string;
  /** Total training rows, including rows without a usable label. */
  trainRows: number;
  /** Training rows that carried a usable label. */
  trainLabeledRows: number;
  /** Total evaluation rows, including rows without a usable label. */
  evalRows: number;
  /** Evaluation rows that carried a usable label. */
  evalLabeledRows: number;
  /** Metrics of the selected threshold on the labeled evaluation rows. */
  candidate: ConfusionMatrix & { accuracy: number; precision: number; recall: number; f1: number };
  /** Metrics of the recorded local rule action on the same evaluation rows. */
  ruleBaseline: ConfusionMatrix & { accuracy: number; precision: number; recall: number; f1: number };
  /** Train and eval accuracy/F1 for each candidate threshold. */
  thresholdSweep: Array<{ threshold: number; trainAccuracy: number; trainF1: number; evalAccuracy: number; evalF1: number }>;
  /** Always `true`: reports never trigger automatic promotion. */
  manualPromotionRequired: true;
}
|
|
52
|
+
|
|
53
|
+
/** Compact summary returned after the artifact and report files have been written. */
export interface GateTrainSummary {
  schema: "pi-router.binary-gate-train-summary.v1";
  /** Resolved path of the written artifact file. */
  artifact: string;
  /** Resolved path of the written evaluation report file. */
  report: string;
  trainRows: number;
  trainLabeledRows: number;
  evalRows: number;
  evalLabeledRows: number;
  /** Threshold stored in the written artifact. */
  threshold: number;
}
|
|
63
|
+
|
|
64
|
+
/** Rounds `value` to at most four decimal places by way of its fixed-point string form. */
function round(value: number): number {
  const fixedPoint = value.toFixed(4);
  return Number(fixedPoint);
}
|
|
67
|
+
|
|
68
|
+
/**
 * Reads a JSONL file of training rows, skipping blank lines.
 *
 * Errors report the physical line number in the file (blank lines included),
 * so the offending row can be located directly in an editor.
 *
 * @param path File to read; resolved against the current working directory.
 * @returns The parsed rows in file order.
 * @throws Error when the file is missing, a line is not valid JSON, or a row
 *   does not carry the `pi-router.training-row.v1` schema tag.
 */
export function readTrainingRows(path: string): RouterTrainingRow[] {
  const resolved = resolve(path);
  if (!existsSync(resolved)) throw new Error(`training rows file not found: ${path}`);

  const rows: RouterTrainingRow[] = [];
  const lines = readFileSync(resolved, "utf8").split("\n");
  for (const [index, line] of lines.entries()) {
    if (!line.trim()) continue;
    // Number lines before dropping blanks so the position matches the file.
    const location = `${path}:${index + 1}`;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`invalid training row JSON at ${location}: ${reason}`);
    }

    // Guard against non-object JSON (null, numbers, strings) before reading the schema tag.
    const schema = typeof parsed === "object" && parsed !== null ? (parsed as { schema?: unknown }).schema : undefined;
    if (schema !== "pi-router.training-row.v1") throw new Error(`invalid training row schema at ${location}`);

    rows.push(parsed as RouterTrainingRow);
  }
  return rows;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Scores a training row's features as an intervention risk clamped to [0, 1].
 *
 * Count-like features are normalised against a fixed ceiling and capped at 1
 * before being weighted; debug and review phases add a flat 0.15.
 */
function scoreRow(row: RouterTrainingRow): number {
  const { features } = row;
  const w = DEFAULT_WEIGHTS;
  const capped = (ratio: number): number => Math.min(ratio, 1);

  // A missing or zero token estimate contributes no context pressure.
  const contextPressure = features.contextTokensApprox ? capped(features.contextTokensApprox / 100_000) : 0;
  const diffSize = capped(features.diffLines / 600);
  const errorRepeats = capped(features.sameErrorRepeatedCount / 4);
  const commandRepeats = capped(features.sameCommandRepeatedCount / 4);
  const unverified = features.noVerifierUsed ? 1 : 0;
  const stalled = 1 - features.progressScore;
  const phaseRisk = features.phase === "debug" || features.phase === "review" ? 0.15 : 0;

  // Terms are added in a fixed order so floating-point results stay reproducible.
  const raw =
    w.bias
    + w.loopScore * features.loopScore
    + w.lowProgress * stalled
    + w.sameErrorRepeatedCount * errorRepeats
    + w.sameCommandRepeatedCount * commandRepeats
    + w.noVerifierUsed * unverified
    + w.diffLines * diffSize
    + w.contextPressure * contextPressure
    + phaseRisk;

  return Math.max(0, Math.min(1, raw));
}
|
|
102
|
+
|
|
103
|
+
/**
 * Fixed weights for the linear gate score. Training only selects the
 * threshold; these values are not fitted.
 */
const DEFAULT_WEIGHTS: Record<string, number> = {
  // Constant offset applied before any feature term.
  bias: -0.08,
  // Strongest signal: the loop score feature.
  loopScore: 0.38,
  // Applied to (1 - progressScore).
  lowProgress: 0.22,
  // Applied to the repeat counts after normalisation to [0, 1].
  sameErrorRepeatedCount: 0.18,
  sameCommandRepeatedCount: 0.08,
  // Applied when no verifier was used.
  noVerifierUsed: 0.16,
  // Applied to the normalised diff size and context-token pressure.
  diffLines: 0.08,
  contextPressure: 0.12,
};
|
|
113
|
+
|
|
114
|
+
/** Converts a gate label to a boolean truth value; any label other than "intervene"/"continue" yields `null`. */
function isIntervene(label: BinaryGateLabel): boolean | null {
  switch (label) {
    case "intervene":
      return true;
    case "continue":
      return false;
    default:
      return null;
  }
}
|
|
119
|
+
|
|
120
|
+
/** Baseline prediction: the two "continue" rule actions map to "continue"; every other action maps to "intervene". */
function ruleGate(row: RouterTrainingRow): BinaryGateLabel {
  const action = row.provenance.localRuleAction;
  const keepsGoing = action === "continue_current" || action === "continue_local";
  return keepsGoing ? "continue" : "intervene";
}
|
|
123
|
+
|
|
124
|
+
/**
 * Extends a confusion matrix with accuracy, precision, recall and F1, each
 * rounded to four decimals. Any ratio with a zero denominator is reported as 0.
 */
function metrics(matrix: ConfusionMatrix): ConfusionMatrix & { accuracy: number; precision: number; recall: number; f1: number } {
  const { truePositive, trueNegative, falsePositive, falseNegative } = matrix;
  const total = truePositive + trueNegative + falsePositive + falseNegative;
  const predictedPositive = truePositive + falsePositive;
  const actualPositive = truePositive + falseNegative;

  const precision = predictedPositive ? truePositive / predictedPositive : 0;
  const recall = actualPositive ? truePositive / actualPositive : 0;
  const harmonicBase = precision + recall;
  const f1 = harmonicBase ? (2 * precision * recall) / harmonicBase : 0;
  const accuracy = total ? round((truePositive + trueNegative) / total) : 0;

  return { ...matrix, accuracy, precision: round(precision), recall: round(recall), f1: round(f1) };
}
|
|
137
|
+
|
|
138
|
+
/**
 * Tallies predictions against row labels. Rows whose label is neither
 * "intervene" nor "continue" are skipped, and `predict` is not called for them.
 */
function confusion(rows: RouterTrainingRow[], predict: (row: RouterTrainingRow) => BinaryGateLabel): ConfusionMatrix {
  const tally: ConfusionMatrix = { truePositive: 0, trueNegative: 0, falsePositive: 0, falseNegative: 0 };
  for (const row of rows) {
    const shouldIntervene = isIntervene(row.labels.binaryGate);
    if (shouldIntervene === null) continue;
    const saysIntervene = predict(row) === "intervene";
    if (shouldIntervene) {
      if (saysIntervene) tally.truePositive += 1;
      else tally.falseNegative += 1;
    } else if (saysIntervene) {
      tally.falsePositive += 1;
    } else {
      tally.trueNegative += 1;
    }
  }
  return tally;
}
|
|
151
|
+
|
|
152
|
+
/** Candidate thresholds for the sweep: 19 values from 0.05 to 0.95 in steps of 0.05. */
function thresholdValues(): number[] {
  const thresholds: number[] = [];
  for (let step = 0; step < 19; step += 1) {
    // Rounding removes floating-point drift such as 0.15000000000000002.
    thresholds.push(round(0.05 + step * 0.05));
  }
  return thresholds;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Keeps rows whose gate label is not "unknown" and whose label source is not
 * the local rule, then checks that both classes are present.
 *
 * @param rows Candidate rows.
 * @param label Dataset name used as the prefix of error messages.
 * @throws Error when no row survives the filter, or when only one class is present.
 */
function usableLabeledRows(rows: RouterTrainingRow[], label: string): RouterTrainingRow[] {
  const usable: RouterTrainingRow[] = [];
  let interveneCount = 0;
  let continueCount = 0;

  for (const row of rows) {
    const gate = row.labels.binaryGate;
    if (gate === "unknown" || row.labels.source === "local-rule") continue;
    usable.push(row);
    if (gate === "intervene") interveneCount += 1;
    else if (gate === "continue") continueCount += 1;
  }

  if (usable.length === 0) throw new Error(`${label} dataset has no usable teacher/human labeled rows`);
  if (interveneCount === 0 || continueCount === 0) throw new Error(`${label} dataset must contain both continue and intervene labels`);
  return usable;
}
|
|
164
|
+
|
|
165
|
+
/** Counts how many rows are labeled "intervene" (positive) and "continue" (negative); other labels are ignored. */
function labelCounts(rows: RouterTrainingRow[]): { positiveIntervene: number; negativeContinue: number } {
  const counts = { positiveIntervene: 0, negativeContinue: 0 };
  for (const { labels } of rows) {
    if (labels.binaryGate === "intervene") counts.positiveIntervene += 1;
    else if (labels.binaryGate === "continue") counts.negativeContinue += 1;
  }
  return counts;
}
|
|
171
|
+
|
|
172
|
+
/**
 * Selects the decision threshold for the fixed-weight linear gate and
 * evaluates it on a separate labeled dataset.
 *
 * The threshold with the best training F1 wins; ties are broken by training
 * accuracy, and remaining ties keep the lowest threshold.
 *
 * @param trainRows Rows used to select the threshold.
 * @param evalRows Rows used for the candidate and rule-baseline metrics.
 * @param generatedAt Timestamp stamped on both outputs; defaults to the current time in ISO form.
 * @returns The gate artifact and its evaluation report.
 * @throws Error when either dataset lacks usable labels or contains only one class.
 */
export function trainBinaryGate(trainRows: RouterTrainingRow[], evalRows: RouterTrainingRow[], generatedAt = new Date().toISOString()): { artifact: BinaryGateArtifact; report: BinaryGateEvalReport } {
  const trainLabeled = usableLabeledRows(trainRows, "training");
  const evalLabeled = usableLabeledRows(evalRows, "eval");

  // A row's score does not depend on the threshold, so compute it once per row
  // instead of once per row per swept threshold.
  const scores = new Map<RouterTrainingRow, number>();
  for (const row of [...trainLabeled, ...evalLabeled]) scores.set(row, scoreRow(row));
  const gateAt = (threshold: number) => (row: RouterTrainingRow): BinaryGateLabel =>
    (scores.get(row) ?? scoreRow(row)) >= threshold ? "intervene" : "continue";

  // One sweep yields both the report table and the data for threshold selection.
  const thresholdSweep = thresholdValues().map((threshold) => {
    const predict = gateAt(threshold);
    const train = metrics(confusion(trainLabeled, predict));
    const evaluation = metrics(confusion(evalLabeled, predict));
    return { threshold, trainAccuracy: train.accuracy, trainF1: train.f1, evalAccuracy: evaluation.accuracy, evalF1: evaluation.f1 };
  });

  // Selection uses training metrics only; eval metrics never influence the choice.
  const best = thresholdSweep.reduce((winner, item) =>
    item.trainF1 > winner.trainF1 || (item.trainF1 === winner.trainF1 && item.trainAccuracy > winner.trainAccuracy) ? item : winner);

  const policyVersion = `pi-router.binary-gate.v1.threshold-${best.threshold}`;
  const artifact: BinaryGateArtifact = {
    schema: BINARY_GATE_ARTIFACT_SCHEMA,
    generatedAt,
    policyVersion,
    // Copy the weights so mutating an artifact cannot alter the module-level defaults used for scoring.
    model: { kind: "linear-threshold", threshold: best.threshold, weights: { ...DEFAULT_WEIGHTS } },
    training: { rows: trainRows.length, labeledRows: trainLabeled.length, ...labelCounts(trainLabeled) },
    evaluation: { rows: evalRows.length, labeledRows: evalLabeled.length, ...labelCounts(evalLabeled) },
    manualPromotionRequired: true,
  };
  const report: BinaryGateEvalReport = {
    schema: BINARY_GATE_EVAL_SCHEMA,
    generatedAt,
    policyVersion,
    trainRows: trainRows.length,
    trainLabeledRows: trainLabeled.length,
    evalRows: evalRows.length,
    evalLabeledRows: evalLabeled.length,
    candidate: metrics(confusion(evalLabeled, gateAt(best.threshold))),
    ruleBaseline: metrics(confusion(evalLabeled, ruleGate)),
    thresholdSweep,
    manualPromotionRequired: true,
  };
  return { artifact, report };
}
|
|
212
|
+
|
|
213
|
+
/**
 * Trains the gate from JSONL files and writes the artifact and evaluation
 * report as pretty-printed JSON, creating parent directories as needed.
 *
 * @param options Paths of the training rows, evaluation rows, artifact output and report output.
 * @returns A summary holding the resolved output paths, row counts and the selected threshold.
 * @throws Error when both datasets resolve to the same file, when a dataset
 *   cannot be read, or when training rejects the labels.
 */
export function writeBinaryGateTraining(options: { trainingRowsPath: string; evalRowsPath: string; artifactPath: string; reportPath: string }): GateTrainSummary {
  // Check before any file I/O: evaluating on the training file is refused no
  // matter what the file contains, so there is no point in parsing it first.
  if (resolve(options.trainingRowsPath) === resolve(options.evalRowsPath)) throw new Error("gate training requires a distinct --eval-dataset file for out-of-sample evaluation");

  const artifactPath = resolve(options.artifactPath);
  const reportPath = resolve(options.reportPath);

  // Pass the paths as given so read errors quote what the caller typed.
  const rows = readTrainingRows(options.trainingRowsPath);
  const evalRows = readTrainingRows(options.evalRowsPath);
  const { artifact, report } = trainBinaryGate(rows, evalRows);

  mkdirSync(dirname(artifactPath), { recursive: true });
  mkdirSync(dirname(reportPath), { recursive: true });
  writeFileSync(artifactPath, `${JSON.stringify(artifact, null, 2)}\n`);
  writeFileSync(reportPath, `${JSON.stringify(report, null, 2)}\n`);

  return {
    schema: "pi-router.binary-gate-train-summary.v1",
    artifact: artifactPath,
    report: reportPath,
    trainRows: rows.length,
    trainLabeledRows: artifact.training.labeledRows,
    evalRows: evalRows.length,
    evalLabeledRows: artifact.evaluation.labeledRows,
    threshold: artifact.model.threshold,
  };
}
|
|
@@ -2,11 +2,15 @@
|
|
|
2
2
|
import { existsSync, readdirSync, statSync } from "node:fs";
|
|
3
3
|
import { join, resolve } from "node:path";
|
|
4
4
|
import { decideRoute, readCheckpointJsonl, selectCheckpoint } from "./decision.js";
|
|
5
|
+
import { writeBinaryGateTraining } from "./binary-gate.js";
|
|
5
6
|
import { writeSessionCheckpointsJsonl } from "./checkpoints.js";
|
|
6
7
|
import { appendRouteEvent, buildRouteEvent } from "./ledger.js";
|
|
7
8
|
import { writeCapabilityCards, writeShadowEval, writeTeacherPromptRequests, writeTeacherReflection } from "./learning.js";
|
|
8
9
|
import { writeTrainingRows } from "./dataset.js";
|
|
9
|
-
import { writeInferredOutcomes } from "./outcomes.js";
|
|
10
|
+
import { writeEnrichedOutcomes, writeInferredOutcomes } from "./outcomes.js";
|
|
11
|
+
import { writeRouterReport } from "./reports.js";
|
|
12
|
+
import { writeSharpeningHints } from "./sharpening.js";
|
|
13
|
+
import { runTeacherLabeling } from "./teacher-runner.js";
|
|
10
14
|
|
|
11
15
|
interface Args {
|
|
12
16
|
command?: string;
|
|
@@ -19,12 +23,22 @@ interface Args {
|
|
|
19
23
|
events?: string;
|
|
20
24
|
labels?: string;
|
|
21
25
|
reflection?: string;
|
|
26
|
+
artifact?: string;
|
|
27
|
+
report?: string;
|
|
28
|
+
dataset?: string;
|
|
29
|
+
evalDataset?: string;
|
|
30
|
+
markdown?: string;
|
|
31
|
+
gateReport?: string;
|
|
22
32
|
teacher?: string;
|
|
23
33
|
teacherOutput?: string;
|
|
24
34
|
teacherPrompts?: string;
|
|
35
|
+
requests?: string;
|
|
36
|
+
decisionsOutput?: string;
|
|
25
37
|
outcomes?: string;
|
|
38
|
+
cards?: string;
|
|
26
39
|
includeLocalRuleLabels?: boolean;
|
|
27
40
|
workspaceDiff?: boolean;
|
|
41
|
+
dryRun?: boolean;
|
|
28
42
|
pretty: boolean;
|
|
29
43
|
}
|
|
30
44
|
|
|
@@ -35,9 +49,14 @@ function usage(): never {
|
|
|
35
49
|
npm run router:decide -- --checkpoint-file <checkpoints.jsonl> [--checkpoint-id <id>] [--ledger <events.jsonl>] [--pretty]
|
|
36
50
|
npm run router:cards -- --events <events.jsonl> --output <model-cards.jsonl> [--outcomes <outcomes.jsonl>] [--pretty]
|
|
37
51
|
npm run router:outcomes -- --checkpoint-file <checkpoints.jsonl> --events <events.jsonl> --output <outcomes.jsonl> [--pretty]
|
|
52
|
+
npm run router:outcome-enrich -- --outcomes <outcomes.jsonl> --output <enriched-outcomes.jsonl> [--checkpoint-file <checkpoints.jsonl>] [--events <events.jsonl>] [--pretty]
|
|
38
53
|
npm run router:teacher-requests -- --checkpoint-file <checkpoints.jsonl> --output <requests.jsonl> [--teacher openai-codex/gpt-5.5] [--pretty]
|
|
54
|
+
npm run router:teacher-label -- --requests <requests.jsonl> --teacher-output <decisions.jsonl> --labels <labels.jsonl> [--teacher openai-codex/gpt-5.5] [--dry-run] [--pretty]
|
|
39
55
|
npm run router:reflect -- --checkpoint-file <checkpoints.jsonl> --labels <labels.jsonl> --reflection <reflection.md> [--teacher local-rule] [--teacher-output <decisions.jsonl>] [--teacher-prompts <requests.jsonl>] [--pretty]
|
|
40
56
|
npm run router:dataset -- --checkpoint-file <checkpoints.jsonl> --output <training.jsonl> [--events <events.jsonl>] [--outcomes <outcomes.jsonl>] [--labels <labels.jsonl>] [--include-local-rule-labels] [--pretty]
|
|
57
|
+
npm run router:gate-train -- --dataset <training.jsonl> --eval-dataset <eval.jsonl> --artifact <gate.json> --report <gate-report.json> [--pretty]
|
|
58
|
+
npm run router:report -- --output <report.json> [--markdown <report.md>] [--events <events.jsonl>] [--outcomes <outcomes.jsonl>] [--dataset <training.jsonl>] [--gate-report <gate-report.json>] [--pretty]
|
|
59
|
+
npm run router:sharpen -- --events <events.jsonl> --output <hints.json> [--outcomes <outcomes.jsonl>] [--cards <model-cards.jsonl>] [--pretty]
|
|
41
60
|
npm run router:shadow -- --checkpoint-file <checkpoints.jsonl> --output <report.json> [--ledger <events.jsonl>] [--pretty]
|
|
42
61
|
|
|
43
62
|
Commands:
|
|
@@ -45,9 +64,14 @@ Commands:
|
|
|
45
64
|
decide Emit a strict JSON route decision for a checkpoint and optionally append a route event.
|
|
46
65
|
cards Generate local observed model capability cards from route events and optional outcomes.
|
|
47
66
|
outcomes Infer conservative outcome skeletons that can be manually enriched.
|
|
67
|
+
outcome-enrich Enrich outcome records from checkpoints and route events.
|
|
48
68
|
teacher-requests Generate local JSONL prompt requests for explicit teacher labeling.
|
|
69
|
+
teacher-label Run explicit teacher model labeling over request JSONL.
|
|
49
70
|
reflect Generate command-triggered soft routing labels and a reflection artifact.
|
|
50
71
|
dataset Export trainable rows for a conservative continue-vs-intervene gate.
|
|
72
|
+
gate-train Train/evaluate a local binary continue-vs-intervene gate artifact.
|
|
73
|
+
report Summarize route events, outcomes, labels, and gate eval metrics.
|
|
74
|
+
sharpen Generate local, provenance-backed router sharpening hints without mutating policy.
|
|
51
75
|
shadow Shadow-evaluate the current rule policy over historical checkpoints.
|
|
52
76
|
`);
|
|
53
77
|
process.exit(2);
|
|
@@ -104,6 +128,36 @@ function parseArgs(argv: string[]): Args {
|
|
|
104
128
|
index++;
|
|
105
129
|
continue;
|
|
106
130
|
}
|
|
131
|
+
if (arg === "--artifact" && next) {
|
|
132
|
+
args.artifact = next;
|
|
133
|
+
index++;
|
|
134
|
+
continue;
|
|
135
|
+
}
|
|
136
|
+
if (arg === "--report" && next) {
|
|
137
|
+
args.report = next;
|
|
138
|
+
index++;
|
|
139
|
+
continue;
|
|
140
|
+
}
|
|
141
|
+
if (arg === "--dataset" && next) {
|
|
142
|
+
args.dataset = next;
|
|
143
|
+
index++;
|
|
144
|
+
continue;
|
|
145
|
+
}
|
|
146
|
+
if (arg === "--eval-dataset" && next) {
|
|
147
|
+
args.evalDataset = next;
|
|
148
|
+
index++;
|
|
149
|
+
continue;
|
|
150
|
+
}
|
|
151
|
+
if (arg === "--markdown" && next) {
|
|
152
|
+
args.markdown = next;
|
|
153
|
+
index++;
|
|
154
|
+
continue;
|
|
155
|
+
}
|
|
156
|
+
if (arg === "--gate-report" && next) {
|
|
157
|
+
args.gateReport = next;
|
|
158
|
+
index++;
|
|
159
|
+
continue;
|
|
160
|
+
}
|
|
107
161
|
if (arg === "--teacher" && next) {
|
|
108
162
|
args.teacher = next;
|
|
109
163
|
index++;
|
|
@@ -119,11 +173,26 @@ function parseArgs(argv: string[]): Args {
|
|
|
119
173
|
index++;
|
|
120
174
|
continue;
|
|
121
175
|
}
|
|
176
|
+
if (arg === "--requests" && next) {
|
|
177
|
+
args.requests = next;
|
|
178
|
+
index++;
|
|
179
|
+
continue;
|
|
180
|
+
}
|
|
181
|
+
if (arg === "--decisions-output" && next) {
|
|
182
|
+
args.decisionsOutput = next;
|
|
183
|
+
index++;
|
|
184
|
+
continue;
|
|
185
|
+
}
|
|
122
186
|
if (arg === "--outcomes" && next) {
|
|
123
187
|
args.outcomes = next;
|
|
124
188
|
index++;
|
|
125
189
|
continue;
|
|
126
190
|
}
|
|
191
|
+
if (arg === "--cards" && next) {
|
|
192
|
+
args.cards = next;
|
|
193
|
+
index++;
|
|
194
|
+
continue;
|
|
195
|
+
}
|
|
127
196
|
if (arg === "--include-local-rule-labels") {
|
|
128
197
|
args.includeLocalRuleLabels = true;
|
|
129
198
|
continue;
|
|
@@ -132,6 +201,10 @@ function parseArgs(argv: string[]): Args {
|
|
|
132
201
|
args.workspaceDiff = true;
|
|
133
202
|
continue;
|
|
134
203
|
}
|
|
204
|
+
if (arg === "--dry-run") {
|
|
205
|
+
args.dryRun = true;
|
|
206
|
+
continue;
|
|
207
|
+
}
|
|
135
208
|
if (arg === "--pretty") {
|
|
136
209
|
args.pretty = true;
|
|
137
210
|
continue;
|
|
@@ -207,12 +280,28 @@ function outcomes(args: Args): unknown {
|
|
|
207
280
|
return writeInferredOutcomes({ checkpointPath: args.checkpointFile, eventsPath: args.events, outputPath: args.output });
|
|
208
281
|
}
|
|
209
282
|
|
|
283
|
+
function outcomeEnrich(args: Args): unknown {
|
|
284
|
+
if (!args.outcomes || !args.output) usage();
|
|
285
|
+
return writeEnrichedOutcomes({ outcomesPath: args.outcomes, outputPath: args.output, checkpointPath: args.checkpointFile, eventsPath: args.events });
|
|
286
|
+
}
|
|
287
|
+
|
|
210
288
|
function teacherRequests(args: Args): unknown {
|
|
211
289
|
if (!args.checkpointFile || !args.output) usage();
|
|
212
290
|
const requests = writeTeacherPromptRequests(args.checkpointFile, args.output, args.teacher ?? "openai-codex/gpt-5.5");
|
|
213
291
|
return { schema: "pi-router.teacher-requests-summary.v1", output: resolve(args.output), requests: requests.length, teacher: args.teacher ?? "openai-codex/gpt-5.5" };
|
|
214
292
|
}
|
|
215
293
|
|
|
294
|
+
async function teacherLabel(args: Args): Promise<unknown> {
|
|
295
|
+
if (!args.requests || !args.teacherOutput || !args.labels) usage();
|
|
296
|
+
return runTeacherLabeling({
|
|
297
|
+
requestsPath: args.requests,
|
|
298
|
+
decisionsOutputPath: args.teacherOutput,
|
|
299
|
+
labelsOutputPath: args.labels,
|
|
300
|
+
teacher: args.teacher,
|
|
301
|
+
dryRun: args.dryRun,
|
|
302
|
+
});
|
|
303
|
+
}
|
|
304
|
+
|
|
216
305
|
function reflect(args: Args): unknown {
|
|
217
306
|
if (!args.checkpointFile || !args.labels || !args.reflection) usage();
|
|
218
307
|
const result = writeTeacherReflection({
|
|
@@ -244,6 +333,21 @@ function dataset(args: Args): unknown {
|
|
|
244
333
|
});
|
|
245
334
|
}
|
|
246
335
|
|
|
336
|
+
function gateTrain(args: Args): unknown {
|
|
337
|
+
if (!args.dataset || !args.evalDataset || !args.artifact || !args.report) usage();
|
|
338
|
+
return writeBinaryGateTraining({ trainingRowsPath: args.dataset, evalRowsPath: args.evalDataset, artifactPath: args.artifact, reportPath: args.report });
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
function report(args: Args): unknown {
|
|
342
|
+
if (!args.output) usage();
|
|
343
|
+
return writeRouterReport({ outputPath: args.output, markdownPath: args.markdown, eventsPath: args.events, outcomesPath: args.outcomes, trainingRowsPath: args.dataset, gateReportPath: args.gateReport });
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
function sharpen(args: Args): unknown {
|
|
347
|
+
if (!args.events || !args.output) usage();
|
|
348
|
+
return writeSharpeningHints({ eventsPath: args.events, outputPath: args.output, outcomesPath: args.outcomes, cardsPath: args.cards });
|
|
349
|
+
}
|
|
350
|
+
|
|
247
351
|
function shadow(args: Args): unknown {
|
|
248
352
|
if (!args.checkpointFile || !args.output) usage();
|
|
249
353
|
return writeShadowEval(args.checkpointFile, args.output, args.ledger);
|
|
@@ -259,15 +363,25 @@ async function main(): Promise<void> {
|
|
|
259
363
|
? cards(args)
|
|
260
364
|
: args.command === "outcomes"
|
|
261
365
|
? outcomes(args)
|
|
262
|
-
: args.command === "teacher-requests"
|
|
366
|
+
: args.command === "outcome-enrich"
|
|
367
|
+
? outcomeEnrich(args)
|
|
368
|
+
: args.command === "teacher-requests"
|
|
263
369
|
? teacherRequests(args)
|
|
264
|
-
: args.command === "reflect"
|
|
265
|
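-
? reflect(args)
|
|
266
|
-
: args.command === "dataset"
|
|
267
|
-
? dataset(args)
|
|
268
|
-
: args.command === "shadow"
|
|
269
|
-
? shadow(args)
|
|
270
|
-
: usage();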
|
|
370
|
+
: args.command === "teacher-label"
|
|
371
|
+
? await teacherLabel(args)
|
|
372
|
+
: args.command === "reflect"
|
|
373
|
+
? reflect(args)
|
|
374
|
+
: args.command === "dataset"
|
|
375
|
+
? dataset(args)
|
|
376
|
+
: args.command === "gate-train"
|
|
377
|
+
? gateTrain(args)
|
|
378
|
+
: args.command === "report"
|
|
379
|
+
? report(args)
|
|
380
|
+
: args.command === "sharpen"
|
|
381
|
+
? sharpen(args)
|
|
382
|
+
: args.command === "shadow"
|
|
383
|
+
? shadow(args)
|
|
384
|
+
: usage();
|
|
271
385
|
console.log(args.pretty ? JSON.stringify(result, null, 2) : JSON.stringify(result));
|
|
272
386
|
}
|
|
273
387
|
|