@fiale-plus/pi-rogue 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/README.md +17 -1
  2. package/node_modules/@fiale-plus/pi-rogue-advisor/README.md +1 -0
  3. package/node_modules/@fiale-plus/pi-rogue-advisor/src/binary-gate-features.test.ts +8 -0
  4. package/node_modules/@fiale-plus/pi-rogue-advisor/src/binary-gate-features.ts +7 -0
  5. package/node_modules/@fiale-plus/pi-rogue-advisor/src/router.test.ts +26 -0
  6. package/node_modules/@fiale-plus/pi-rogue-advisor/src/router.ts +10 -1
  7. package/node_modules/@fiale-plus/pi-rogue-orchestration/README.md +3 -3
  8. package/node_modules/@fiale-plus/pi-rogue-orchestration/package.json +3 -0
  9. package/node_modules/@fiale-plus/pi-rogue-orchestration/skills/orchestration/SKILL.md +3 -2
  10. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/goal.test.ts +65 -2
  11. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/goal.ts +84 -4
  12. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/loop.ts +3 -0
  13. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/novelty-guard.test.ts +43 -0
  14. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/novelty-guard.ts +96 -11
  15. package/node_modules/@fiale-plus/pi-rogue-router/README.md +45 -6
  16. package/node_modules/@fiale-plus/pi-rogue-router/src/binary-gate.test.ts +88 -0
  17. package/node_modules/@fiale-plus/pi-rogue-router/src/binary-gate.ts +232 -0
  18. package/node_modules/@fiale-plus/pi-rogue-router/src/cli.ts +123 -9
  19. package/node_modules/@fiale-plus/pi-rogue-router/src/completions.ts +39 -16
  20. package/node_modules/@fiale-plus/pi-rogue-router/src/config-extension.test.ts +111 -4
  21. package/node_modules/@fiale-plus/pi-rogue-router/src/config.ts +17 -2
  22. package/node_modules/@fiale-plus/pi-rogue-router/src/extension.ts +67 -7
  23. package/node_modules/@fiale-plus/pi-rogue-router/src/index.ts +4 -0
  24. package/node_modules/@fiale-plus/pi-rogue-router/src/observe.ts +76 -5
  25. package/node_modules/@fiale-plus/pi-rogue-router/src/outcomes.ts +130 -6
  26. package/node_modules/@fiale-plus/pi-rogue-router/src/reports.test.ts +92 -0
  27. package/node_modules/@fiale-plus/pi-rogue-router/src/reports.ts +116 -0
  28. package/node_modules/@fiale-plus/pi-rogue-router/src/sharpening.test.ts +223 -0
  29. package/node_modules/@fiale-plus/pi-rogue-router/src/sharpening.ts +344 -0
  30. package/node_modules/@fiale-plus/pi-rogue-router/src/teacher-runner.test.ts +126 -0
  31. package/node_modules/@fiale-plus/pi-rogue-router/src/teacher-runner.ts +238 -0
  32. package/node_modules/@fiale-plus/pi-rogue-router/src/v1-telemetry.test.ts +54 -1
  33. package/package.json +1 -1
@@ -6,6 +6,8 @@ const STATE_FILE = "repetition-guard.json";
6
6
  const MAX_ASSISTANT_TURNS = 6;
7
7
  const REPEAT_COUNT = 3;
8
8
  const REPEAT_THRESHOLD = 0.8;
9
+ const NO_PROGRESS_COUNT = 3;
10
+ const BOUNDED_RECOVERY_COUNT = 5;
9
11
 
10
12
  export interface RepetitionGuardTurn {
11
13
  at: string;
@@ -18,9 +20,17 @@ export interface RepetitionGuardRepeat {
18
20
  text: string;
19
21
  }
20
22
 
23
+ export interface NoProgressSignal {
24
+ at: string;
25
+ count: number;
26
+ text: string;
27
+ reason: string;
28
+ }
29
+
21
30
  export interface RepetitionGuardState {
22
31
  recentAssistantTurns: RepetitionGuardTurn[];
23
32
  assistantRepeat?: RepetitionGuardRepeat;
33
+ noProgress?: NoProgressSignal;
24
34
  }
25
35
 
26
36
  export function defaultRepetitionGuardState(): RepetitionGuardState {
@@ -70,7 +80,18 @@ export function detectAssistantRepetition(state: RepetitionGuardState, minCount
70
80
  };
71
81
  }
72
82
 
73
- export function recordAssistantTurn(state: RepetitionGuardState, text: string): RepetitionGuardState {
83
/**
 * Heuristic check for an assistant turn that only plans or talks to itself.
 *
 * @param text Raw assistant turn text.
 * @returns `true` when the normalized turn is at least 24 characters long,
 *   the raw text contains a planning phrase, and the raw text contains none
 *   of the concrete-progress keywords; `false` otherwise.
 */
export function looksLikeNoProgressTurn(text: string): boolean {
  // Very short turns are never flagged.
  if (normalizeTurn(text).length < 24) return false;

  const planningPattern = /\b(i will|i'll|let me|going to|we need to|we should|next i|plan|planning|approach|think through|summarize|restate)\b/i;
  const progressPattern = /\b(changed|edited|created|wrote|updated|implemented|removed|ran|tested|passed|failed|verified|validated|found|inspected|read|opened|committed|pushed|fixed|completed|result|error|diff)\b/i;

  // Both patterns are matched against the raw text, not the normalized form.
  return planningPattern.test(text) && !progressPattern.test(text);
}
93
+
94
+ export function recordAssistantTurn(state: RepetitionGuardState, text: string, options: { activeOrchestration?: boolean } = {}): RepetitionGuardState {
74
95
  const trimmed = String(text ?? "").trim();
75
96
  if (!trimmed) return state;
76
97
  const next: RepetitionGuardState = {
@@ -78,7 +99,22 @@ export function recordAssistantTurn(state: RepetitionGuardState, text: string):
78
99
  recentAssistantTurns: [...state.recentAssistantTurns, { at: new Date().toISOString(), text: truncate(trimmed, 1200) }].slice(-MAX_ASSISTANT_TURNS),
79
100
  };
80
101
  const repeat = detectAssistantRepetition(next);
81
- return repeat ? { ...next, assistantRepeat: repeat } : { ...next, assistantRepeat: undefined };
102
+ const noProgressCount = options.activeOrchestration && looksLikeNoProgressTurn(trimmed)
103
+ ? (state.noProgress?.count ?? 0) + 1
104
+ : 0;
105
+
106
+ return {
107
+ ...next,
108
+ assistantRepeat: repeat ?? undefined,
109
+ noProgress: noProgressCount > 0
110
+ ? {
111
+ at: new Date().toISOString(),
112
+ count: noProgressCount,
113
+ text: truncate(trimmed, 240),
114
+ reason: "repeated planning/self-talk without concrete progress while orchestration is active",
115
+ }
116
+ : undefined,
117
+ };
82
118
  }
83
119
 
84
120
  function parseState(raw: string): RepetitionGuardState {
@@ -96,6 +132,14 @@ function parseState(raw: string): RepetitionGuardState {
96
132
  text: parsed.assistantRepeat.text,
97
133
  }
98
134
  : undefined,
135
+ noProgress: parsed.noProgress && typeof parsed.noProgress.text === "string"
136
+ ? {
137
+ at: String(parsed.noProgress.at ?? new Date().toISOString()),
138
+ count: Number(parsed.noProgress.count) || NO_PROGRESS_COUNT,
139
+ text: parsed.noProgress.text,
140
+ reason: String(parsed.noProgress.reason ?? "no concrete progress detected"),
141
+ }
142
+ : undefined,
99
143
  };
100
144
  } catch {
101
145
  return defaultRepetitionGuardState();
@@ -110,6 +154,44 @@ function writeGuardState(ctx: any, state: RepetitionGuardState): void {
110
154
  writeText(sessionFile(FEATURE, ctx, STATE_FILE), `${JSON.stringify(state, null, 2)}\n`);
111
155
  }
112
156
 
157
/**
 * Drops the stored no-progress signal from the guard state.
 *
 * The state is only written back when a signal is actually present.
 *
 * @param ctx Context object passed through to the guard-state read/write helpers.
 */
export function clearNoProgressRecovery(ctx: any): void {
  const current = readGuardState(ctx);
  if (current.noProgress) {
    writeGuardState(ctx, { ...current, noProgress: undefined });
  }
}
162
+
163
/**
 * Reports whether any orchestration state exists for the current session.
 *
 * @param ctx Context object passed through to `sessionFile`.
 * @returns `true` when `goal.md` has non-whitespace content, or when
 *   `loop.json` / `autoresearch.json` parses to an object with a truthy
 *   `enabled` or `instruction` field.
 */
function hasActiveOrchestration(ctx: any): boolean {
  const goalText = readText(sessionFile(FEATURE, ctx, "goal.md"));
  if (goalText.trim()) return true;

  return ["loop.json", "autoresearch.json"].some((name) => {
    try {
      const data = JSON.parse(readText(sessionFile(FEATURE, ctx, name), "{}"));
      return Boolean(data?.enabled || data?.instruction);
    } catch {
      // ignore malformed state files; they should not trigger recovery
      return false;
    }
  });
}
175
+
176
/**
 * Builds the recovery text appended to the system prompt.
 *
 * @param repeat Detected repeated-output signal, if any.
 * @param noProgress Stored no-progress signal, if any; it is disregarded while
 *   its count is below `NO_PROGRESS_COUNT`.
 * @returns Four newline-joined lines (header, signal, reason, instruction), or
 *   `null` when neither signal applies. The "bounded" wording is used once the
 *   larger of the two counts reaches `BOUNDED_RECOVERY_COUNT`.
 */
function recoveryPrompt(repeat?: RepetitionGuardRepeat, noProgress?: NoProgressSignal): string | null {
  // A streak shorter than the threshold is treated as if it were absent.
  const streak = noProgress && noProgress.count < NO_PROGRESS_COUNT ? undefined : noProgress;
  if (!repeat && !streak) return null;

  const worstCount = Math.max(repeat?.count ?? 0, streak?.count ?? 0);
  const bounded = worstCount >= BOUNDED_RECOVERY_COUNT;

  const lines: string[] = [];
  lines.push(bounded ? "Pi-Rogue bounded no-progress recovery:" : "Pi-Rogue no-progress recovery:");

  // The repeated-output signal takes precedence in the signal line.
  if (repeat) {
    lines.push(`Repeated assistant output (${repeat.count} turns): ${truncate(repeat.text, 180)}`);
  } else {
    lines.push(`No-progress streak (${streak?.count} turns): ${truncate(streak?.text ?? "", 180)}`);
  }

  if (streak?.reason) {
    lines.push(`Reason: ${streak.reason}.`);
  } else {
    lines.push("Reason: repeated output suggests the current approach is stuck.");
  }

  if (bounded) {
    lines.push("Recovery is bounded now: do not stack another retry. If one safe, concrete alternative action is available, take exactly that action; otherwise stop and ask the user for direction with the current blocker.");
  } else {
    lines.push("Summarize the current state in one sentence, choose one concrete alternative action, and take it now. Do not only restate the plan or repeat the same response.");
  }

  return lines.join("\n");
}
194
+
113
195
  export function registerNoveltyGuard(pi: ExtensionAPI): void {
114
196
  const p = pi as any;
115
197
  if (p.__piRogueNoveltyGuardRegistered) return;
@@ -117,16 +199,16 @@ export function registerNoveltyGuard(pi: ExtensionAPI): void {
117
199
 
118
200
  pi.on("before_agent_start", async (event, ctx) => {
119
201
  const state = readGuardState(ctx);
120
- const repeat = detectAssistantRepetition(state) ?? state.assistantRepeat;
121
- if (!repeat) return { systemPrompt: event.systemPrompt };
202
+ const active = hasActiveOrchestration(ctx);
203
+ const noProgress = active ? state.noProgress : undefined;
204
+ if (!active && state.noProgress) {
205
+ clearNoProgressRecovery(ctx);
206
+ }
207
+ const prompt = recoveryPrompt(detectAssistantRepetition(state) ?? state.assistantRepeat, noProgress);
208
+ if (!prompt) return { systemPrompt: event.systemPrompt };
122
209
 
123
210
  return {
124
- systemPrompt: [
125
- event.systemPrompt,
126
- "Pi-Rogue repetition guard:",
127
- `The previous assistant output repeated ${repeat.count} times: ${truncate(repeat.text, 180)}`,
128
- "Inspect current state before continuing, then apply only the smallest missing delta. Do not repeat the same response.",
129
- ].join("\n\n"),
211
+ systemPrompt: [event.systemPrompt, prompt].join("\n\n"),
130
212
  };
131
213
  });
132
214
 
@@ -136,10 +218,13 @@ export function registerNoveltyGuard(pi: ExtensionAPI): void {
136
218
  if (!text) return;
137
219
 
138
220
  const previous = readGuardState(ctx);
139
- const next = recordAssistantTurn(previous, text);
221
+ const next = recordAssistantTurn(previous, text, { activeOrchestration: hasActiveOrchestration(ctx) });
140
222
  writeGuardState(ctx, next);
141
223
  if (next.assistantRepeat && (!previous.assistantRepeat || next.assistantRepeat.count > previous.assistantRepeat.count)) {
142
224
  ctx.ui.notify("Repetition guard detected repeated assistant output; the next turn will inspect current state before retrying.", "warning");
143
225
  }
226
+ if (next.noProgress && next.noProgress.count >= NO_PROGRESS_COUNT && (!previous.noProgress || next.noProgress.count > previous.noProgress.count)) {
227
+ ctx.ui.notify("No-progress recovery detected repeated planning without concrete progress; the next turn will take one alternative action or stop.", "warning");
228
+ }
144
229
  });
145
230
  }
@@ -11,24 +11,63 @@ npm run router:rebuild -- --session ./current-session.jsonl --workspace-diff --o
11
11
  npm run router:decide -- --checkpoint-file .pi/router/checkpoints.jsonl --ledger .pi/router/events.jsonl
12
12
  npm run router:cards -- --events .pi/router/events.jsonl --output .pi/router/model-cards.jsonl
13
13
  npm run router:outcomes -- --checkpoint-file .pi/router/checkpoints.jsonl --events .pi/router/events.jsonl --output .pi/router/outcomes.jsonl
14
+ npm run router:outcome-enrich -- --outcomes .pi/router/outcomes.jsonl --checkpoint-file .pi/router/checkpoints.jsonl --events .pi/router/events.jsonl --output .pi/router/outcomes.enriched.jsonl
14
15
  npm run router:teacher-requests -- --checkpoint-file .pi/router/checkpoints.jsonl --output .pi/router/teacher-requests.jsonl --teacher openai-codex/gpt-5.5
15
- npm run router:reflect -- --checkpoint-file .pi/router/checkpoints.jsonl --labels .pi/router/labels/teacher-labels.jsonl --reflection .pi/router/reflections/session.md --teacher local-rule
16
+ npm run router:teacher-label -- --requests .pi/router/teacher-requests.jsonl --teacher-output .pi/router/teacher-decisions.jsonl --labels .pi/router/labels/teacher-labels.jsonl --teacher openai-codex/gpt-5.5
17
+ npm run router:reflect -- --checkpoint-file .pi/router/checkpoints.jsonl --labels .pi/router/labels/teacher-labels.jsonl --reflection .pi/router/reflections/session.md --teacher openai-codex/gpt-5.5 --teacher-output .pi/router/teacher-decisions.jsonl
16
18
  npm run router:dataset -- --checkpoint-file .pi/router/checkpoints.jsonl --events .pi/router/events.jsonl --outcomes .pi/router/outcomes.jsonl --labels .pi/router/labels/teacher-labels.jsonl --output .pi/router/training.jsonl
19
+ npm run router:gate-train -- --dataset .pi/router/training.train.jsonl --eval-dataset .pi/router/training.eval.jsonl --artifact .pi/router/binary-gate.json --report .pi/router/binary-gate-report.json
20
+ npm run router:report -- --events .pi/router/events.jsonl --outcomes .pi/router/outcomes.jsonl --dataset .pi/router/training.eval.jsonl --gate-report .pi/router/binary-gate-report.json --output .pi/router/report.json --markdown .pi/router/report.md
21
+ npm run router:sharpen -- --events .pi/router/events.jsonl --outcomes .pi/router/outcomes.jsonl --cards .pi/router/model-cards.jsonl --output .pi/router/sharpening-hints.json
17
22
  npm run router:shadow -- --checkpoint-file .pi/router/checkpoints.jsonl --ledger .pi/router/events.jsonl --output .pi/router/shadow-report.json
18
23
 
19
- # Live observe-only extension commands:
20
- # /router on|off|status|profile|profiles|models|configure|cycle
24
+ # Live router extension commands:
25
+ # /router status|help|on|off|mode|profile|print|profiles|models|configure|cycle
26
+ # /router mode observe # default: recommendations only
27
+ # /router mode auto_model # explicit: apply model switches only
28
+ # /router profile spark-smart
29
+ # /router print mismatch_only|all|off
21
30
  # ctrl+alt+p cycles router profiles (Ctrl-P is reserved by Pi model cycling).
22
31
  ```
23
32
 
24
33
  ## V1 telemetry notes
25
34
 
26
- Router v1 is still observe-only. It adds outcome skeletons, stronger diff/error fingerprints, teacher-label request export, binary gate dataset export, and subagent-aware telemetry schemas. It does not switch models, spawn agents, or promote policies automatically.
35
+ Router v1 defaults to observe-only. It adds outcome skeletons, stronger diff/error fingerprints, teacher-label request export, binary gate dataset export, and subagent-aware telemetry schemas. It does not spawn agents/subagents or promote policies automatically. The explicit `auto_model` mode may only switch the active model for future turns.
27
36
 
28
- Live config is repo-global at `.pi/router/config.json`, while mutable live state and route ledgers are isolated per Pi session under `.pi/router/sessions/<session-key>/state.json` and `events.jsonl`.
37
+ Live config is repo-global at `.pi/router/config.json`, while mutable live state and route ledgers are isolated per Pi session under `.pi/router/sessions/<session-key>/state.json` and `events.jsonl`. The default `mode` is `observe`; `auto_model` must be explicitly selected and does not alter agents, subagents, tools, or execution paths.
29
38
 
30
39
  - Diff telemetry stores counts and hashes from `git diff`, not raw patches. Offline rebuilds remain deterministic by default; use `--workspace-diff` only with one current live session/worktree snapshot.
40
+ - `router:outcome-enrich` upgrades conservative outcome skeletons with checkpoint/event-derived verifier, rework, interruption, override, and accepted-diff signals.
31
41
  - Error fingerprints normalize paths, line numbers, timestamps, UUIDs, ports, and object ids before hashing.
32
- - `router:teacher-requests` writes local JSONL requests for an explicit teacher model; imported teacher decisions are still required before labels become training truth.
42
+ - `router:teacher-requests` writes local JSONL requests for an explicit teacher model; `router:teacher-label` calls the explicitly configured teacher and writes decision/label JSONL artifacts.
33
43
  - `router:dataset` excludes `local-rule` labels by default so a future model does not merely imitate the current rules.
44
+ - `router:gate-train` trains a local binary continue-vs-intervene gate and evaluates it on a distinct labeled eval dataset; local-rule labels are rejected as training/eval truth and promotion remains manual/eval-gated.
45
+ - `router:report` writes JSON plus optional Markdown summaries across route ledgers, enriched outcomes, dataset labels, and gate evaluation reports.
46
+ - `router:sharpen` writes local-only `pi-router.sharpening-hints.v1` recommendations from route ledgers, optional outcomes, and optional capability cards. Hints include sample-size/confidence/auto-use guardrails, repo-local learning policy, and provenance, but never mutate config or promote policy automatically.
47
+
48
+ ### Automated, upgrade-safe sharpening persistence
49
+
50
+ Use this one-shot command for cron/background automation:
51
+
52
+ ```bash
53
+ npm run router:sharpen:auto -- --workspace .
54
+ ```
55
+
56
+ By default it stores artifacts at:
57
+
58
+ - Linux/BSD: `<XDG_DATA_HOME || ~/.local/share>/pi-rogue-router/learning/<repo-name>-<hash>/`
59
+ - macOS: `~/Library/Application Support/pi-rogue-router/learning/<repo-name>-<hash>/`
60
+
61
+ The script:
62
+ - writes `latest.json` and `history/*.json` artifacts;
63
+ - writes `manifest.json` with source fingerprints for change detection;
64
+ - skips re-computation if inputs are unchanged (unless `--force`);
65
+ - migrates legacy `.pi/router/sharpening-hints.json` into the stable learning directory when present.
66
+
67
+ Cron example:
68
+
69
+ ```bash
70
+ */30 * * * * cd /path/to/pi-rogue && npm run router:sharpen:auto -- --workspace /path/to/pi-rogue
71
+ ```
72
+
34
73
  - Subagent route/ledger schemas describe parent-child evidence flow, but live autonomous spawning remains out of scope.
@@ -0,0 +1,88 @@
1
+ import { mkdtempSync, readFileSync, writeFileSync } from "node:fs";
2
+ import { tmpdir } from "node:os";
3
+ import { join } from "node:path";
4
+ import { describe, expect, it } from "vitest";
5
+ import { trainBinaryGate, writeBinaryGateTraining } from "./binary-gate.js";
6
+ import type { RouterTrainingRow } from "./dataset.js";
7
+
8
/**
 * Creates a fresh temporary directory (prefix `pi-router-gate-`) and returns
 * the path of `name` inside it. The file itself is not created.
 */
function tempFile(name: string): string {
  const directory = mkdtempSync(join(tmpdir(), "pi-router-gate-"));
  return join(directory, name);
}
11
+
12
/**
 * Test fixture: builds a training row with fixed baseline features.
 *
 * @param id Used as the row's `checkpointId`.
 * @param label Binary gate label; also drives `routeAction`, `confidence`,
 *   and the provenance `localRuleAction`.
 * @param overrides Feature values that replace the baseline ones.
 * @param source Label source; defaults to `"unknown"` for unknown labels and
 *   `"teacher"` otherwise.
 */
function row(id: string, label: "continue" | "intervene" | "unknown", overrides: Partial<RouterTrainingRow["features"]> = {}, source: RouterTrainingRow["labels"]["source"] = label === "unknown" ? "unknown" : "teacher"): RouterTrainingRow {
  const intervening = label === "intervene";

  let routeAction: "run_verifier" | "continue_current" | null = null;
  if (intervening) routeAction = "run_verifier";
  else if (label === "continue") routeAction = "continue_current";

  const features: RouterTrainingRow["features"] = {
    phase: "implementation",
    activeModel: "qwen",
    provider: "local",
    contextTokensApprox: 1000,
    sameCommandRepeatedCount: 1,
    sameErrorRepeatedCount: 0,
    loopScore: 0.1,
    progressScore: 0.9,
    verifierUsed: true,
    noVerifierUsed: false,
    diffLines: 10,
    diffFilesChanged: 1,
    diffChurnScore: 0.01,
    filesTouched: 1,
    // Caller-supplied values win over the baseline above.
    ...overrides,
  };

  return {
    schema: "pi-router.training-row.v1",
    checkpointId: id,
    sessionId: "session-1",
    rawSessionRef: {
      schema: "pi-router.raw-session-ref.v1",
      path: "/tmp/session.jsonl",
      fromEvent: 0,
      toEvent: 1,
      fromByte: 0,
      toByte: 1,
      contentHash: "hash",
    },
    features,
    labels: {
      routeAction,
      binaryGate: label,
      source,
      confidence: label === "unknown" ? null : 0.8,
    },
    outcome: {
      taskStatus: "unknown",
      testsPassedAfter: null,
      acceptedDiff: null,
      userOverrodeDecision: null,
      reworkTurns: null,
    },
    provenance: {
      localRuleAction: intervening ? "run_verifier" : "continue_current",
      excludedLocalRuleAsTruth: false,
    },
  };
}
40
+
41
+ describe("router binary gate training", () => {
42
+ it("trains a threshold artifact and reports candidate vs rule baseline", () => {
43
+ const rows = [
44
+ row("continue-1", "continue"),
45
+ row("intervene-1", "intervene", { phase: "debug", loopScore: 0.8, progressScore: 0.2, sameErrorRepeatedCount: 3, noVerifierUsed: true, verifierUsed: false }),
46
+ row("unknown-1", "unknown"),
47
+ ];
48
+
49
+ const evalRows = [
50
+ row("eval-continue-1", "continue"),
51
+ row("eval-intervene-1", "intervene", { phase: "debug", loopScore: 0.7, progressScore: 0.3, sameErrorRepeatedCount: 2 }),
52
+ ];
53
+
54
+ const { artifact, report } = trainBinaryGate(rows, evalRows, "2026-06-14T00:00:00.000Z");
55
+
56
+ expect(artifact).toMatchObject({ schema: "pi-router.binary-gate-artifact.v1", manualPromotionRequired: true, training: { rows: 3, labeledRows: 2 }, evaluation: { rows: 2, labeledRows: 2 } });
57
+ expect(report).toMatchObject({ schema: "pi-router.binary-gate-eval.v1", trainRows: 3, trainLabeledRows: 2, evalRows: 2, evalLabeledRows: 2, manualPromotionRequired: true });
58
+ expect(report.thresholdSweep.length).toBeGreaterThan(1);
59
+ expect(report.candidate.truePositive + report.candidate.trueNegative + report.candidate.falsePositive + report.candidate.falseNegative).toBe(2);
60
+ });
61
+
62
+ it("writes gate artifact and eval report", () => {
63
+ const input = tempFile("training.jsonl");
64
+ const evalInput = tempFile("eval.jsonl");
65
+ const artifact = tempFile("gate.json");
66
+ const report = tempFile("report.json");
67
+ writeFileSync(input, [
68
+ JSON.stringify(row("continue-1", "continue")),
69
+ JSON.stringify(row("intervene-1", "intervene", { loopScore: 0.9, progressScore: 0.1, noVerifierUsed: true })),
70
+ ].join("\n") + "\n");
71
+ writeFileSync(evalInput, [
72
+ JSON.stringify(row("eval-continue-1", "continue")),
73
+ JSON.stringify(row("eval-intervene-1", "intervene", { loopScore: 0.8, progressScore: 0.2, sameErrorRepeatedCount: 3 })),
74
+ ].join("\n") + "\n");
75
+
76
+ const summary = writeBinaryGateTraining({ trainingRowsPath: input, evalRowsPath: evalInput, artifactPath: artifact, reportPath: report });
77
+
78
+ expect(summary).toMatchObject({ schema: "pi-router.binary-gate-train-summary.v1", trainRows: 2, trainLabeledRows: 2, evalRows: 2, evalLabeledRows: 2 });
79
+ expect(JSON.parse(readFileSync(artifact, "utf8")).schema).toBe("pi-router.binary-gate-artifact.v1");
80
+ expect(JSON.parse(readFileSync(report, "utf8")).schema).toBe("pi-router.binary-gate-eval.v1");
81
+ expect(() => writeBinaryGateTraining({ trainingRowsPath: input, evalRowsPath: input, artifactPath: tempFile("bad-gate.json"), reportPath: tempFile("bad-report.json") })).toThrow(/distinct --eval-dataset/);
82
+ });
83
+
84
+ it("rejects unusable labels", () => {
85
+ expect(() => trainBinaryGate([row("rule-1", "continue", {}, "local-rule"), row("rule-2", "intervene", {}, "local-rule")], [row("eval-1", "continue"), row("eval-2", "intervene")])).toThrow(/no usable/);
86
+ expect(() => trainBinaryGate([row("only-continue", "continue")], [row("eval-1", "continue"), row("eval-2", "intervene")])).toThrow(/both continue and intervene/);
87
+ });
88
+ });
@@ -0,0 +1,232 @@
1
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
2
+ import { dirname, resolve } from "node:path";
3
+ import type { BinaryGateLabel, RouterTrainingRow } from "./dataset.js";
4
+
5
+ export const BINARY_GATE_ARTIFACT_SCHEMA = "pi-router.binary-gate-artifact.v1" as const;
6
+ export const BINARY_GATE_EVAL_SCHEMA = "pi-router.binary-gate-eval.v1" as const;
7
+
8
+ export interface BinaryGateArtifact {
9
+ schema: typeof BINARY_GATE_ARTIFACT_SCHEMA;
10
+ generatedAt: string;
11
+ policyVersion: string;
12
+ model: {
13
+ kind: "linear-threshold";
14
+ threshold: number;
15
+ weights: Record<string, number>;
16
+ };
17
+ training: {
18
+ rows: number;
19
+ labeledRows: number;
20
+ positiveIntervene: number;
21
+ negativeContinue: number;
22
+ };
23
+ evaluation: {
24
+ rows: number;
25
+ labeledRows: number;
26
+ positiveIntervene: number;
27
+ negativeContinue: number;
28
+ };
29
+ manualPromotionRequired: true;
30
+ }
31
+
32
+ export interface ConfusionMatrix {
33
+ truePositive: number;
34
+ trueNegative: number;
35
+ falsePositive: number;
36
+ falseNegative: number;
37
+ }
38
+
39
+ export interface BinaryGateEvalReport {
40
+ schema: typeof BINARY_GATE_EVAL_SCHEMA;
41
+ generatedAt: string;
42
+ policyVersion: string;
43
+ trainRows: number;
44
+ trainLabeledRows: number;
45
+ evalRows: number;
46
+ evalLabeledRows: number;
47
+ candidate: ConfusionMatrix & { accuracy: number; precision: number; recall: number; f1: number };
48
+ ruleBaseline: ConfusionMatrix & { accuracy: number; precision: number; recall: number; f1: number };
49
+ thresholdSweep: Array<{ threshold: number; trainAccuracy: number; trainF1: number; evalAccuracy: number; evalF1: number }>;
50
+ manualPromotionRequired: true;
51
+ }
52
+
53
+ export interface GateTrainSummary {
54
+ schema: "pi-router.binary-gate-train-summary.v1";
55
+ artifact: string;
56
+ report: string;
57
+ trainRows: number;
58
+ trainLabeledRows: number;
59
+ evalRows: number;
60
+ evalLabeledRows: number;
61
+ threshold: number;
62
+ }
63
+
64
/** Rounds `value` to four decimal places via its fixed-point string form. */
function round(value: number): number {
  const fixed = value.toFixed(4);
  return +fixed;
}
67
+
68
/**
 * Reads a JSONL file of router training rows.
 *
 * Blank lines are skipped. Every remaining line must be a JSON object whose
 * `schema` is `"pi-router.training-row.v1"`.
 *
 * @param path File path; resolved against the current working directory.
 * @returns The parsed rows in file order.
 * @throws Error when the file does not exist, a line is not valid JSON, or a
 *   row has the wrong schema. Messages include `path:line`, where `line` is
 *   the 1-based line number in the file (blank lines are counted).
 */
export function readTrainingRows(path: string): RouterTrainingRow[] {
  const resolved = resolve(path);
  if (!existsSync(resolved)) throw new Error(`training rows file not found: ${path}`);

  const rows: RouterTrainingRow[] = [];
  const lines = readFileSync(resolved, "utf8").split("\n");
  lines.forEach((line, index) => {
    if (!line.trim()) return;
    // Index into the unfiltered line list so the reported number matches the file.
    const location = `${path}:${index + 1}`;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      throw new Error(`invalid training row JSON at ${location}: ${detail}`);
    }

    // Guard against `null` and primitives before reading `schema`.
    const schema = typeof parsed === "object" && parsed !== null ? (parsed as { schema?: unknown }).schema : undefined;
    if (schema !== "pi-router.training-row.v1") throw new Error(`invalid training row schema at ${location}`);

    rows.push(parsed as RouterTrainingRow);
  });
  return rows;
}
80
+
81
/**
 * Scores a training row with the fixed `DEFAULT_WEIGHTS` linear model.
 *
 * Count-like features are scaled and capped at 1 before weighting, and rows
 * in the `debug` or `review` phase get a flat 0.15 added.
 *
 * @returns The weighted sum clamped to the range [0, 1].
 */
function scoreRow(row: RouterTrainingRow): number {
  const { features } = row;
  const w = DEFAULT_WEIGHTS;
  const capped = (value: number, scale: number): number => Math.min(value / scale, 1);

  // A missing or zero token count contributes no context pressure.
  const contextPressure = features.contextTokensApprox ? capped(features.contextTokensApprox, 100_000) : 0;
  const riskyPhase = features.phase === "debug" || features.phase === "review";

  const terms = [
    w.loopScore * features.loopScore,
    w.lowProgress * (1 - features.progressScore),
    w.sameErrorRepeatedCount * capped(features.sameErrorRepeatedCount, 4),
    w.sameCommandRepeatedCount * capped(features.sameCommandRepeatedCount, 4),
    w.noVerifierUsed * (features.noVerifierUsed ? 1 : 0),
    w.diffLines * capped(features.diffLines, 600),
    w.contextPressure * contextPressure,
    riskyPhase ? 0.15 : 0,
  ];

  // Accumulate left to right starting from the bias.
  let total = w.bias;
  for (const term of terms) total += term;

  return Math.max(0, Math.min(1, total));
}
102
+
103
+ const DEFAULT_WEIGHTS: Record<string, number> = {
104
+ bias: -0.08,
105
+ loopScore: 0.38,
106
+ lowProgress: 0.22,
107
+ sameErrorRepeatedCount: 0.18,
108
+ sameCommandRepeatedCount: 0.08,
109
+ noVerifierUsed: 0.16,
110
+ diffLines: 0.08,
111
+ contextPressure: 0.12,
112
+ };
113
+
114
/**
 * Maps a binary gate label to a boolean truth value.
 *
 * @returns `true` for `"intervene"`, `false` for `"continue"`, and `null` for
 *   any other label.
 */
function isIntervene(label: BinaryGateLabel): boolean | null {
  switch (label) {
    case "intervene":
      return true;
    case "continue":
      return false;
    default:
      return null;
  }
}
119
+
120
/**
 * Derives a gate label from the row's recorded local rule action:
 * `continue_current` and `continue_local` map to `"continue"`, and every
 * other action maps to `"intervene"`.
 */
function ruleGate(row: RouterTrainingRow): BinaryGateLabel {
  const action = row.provenance.localRuleAction;
  const keepsGoing = action === "continue_current" || action === "continue_local";
  return keepsGoing ? "continue" : "intervene";
}
123
+
124
/**
 * Extends a confusion matrix with accuracy, precision, recall, and F1.
 *
 * Any ratio whose denominator is zero is reported as 0. F1 is computed from
 * the unrounded precision and recall, and each reported metric is then rounded.
 */
function metrics(matrix: ConfusionMatrix): ConfusionMatrix & { accuracy: number; precision: number; recall: number; f1: number } {
  const { truePositive: tp, trueNegative: tn, falsePositive: fp, falseNegative: fn } = matrix;

  const total = tp + tn + fp + fn;
  const predictedPositive = tp + fp;
  const actualPositive = tp + fn;

  const precision = predictedPositive ? tp / predictedPositive : 0;
  const recall = actualPositive ? tp / actualPositive : 0;
  const prSum = precision + recall;
  const f1 = prSum ? (2 * precision * recall) / prSum : 0;

  return {
    ...matrix,
    accuracy: total ? round((tp + tn) / total) : 0,
    precision: round(precision),
    recall: round(recall),
    f1: round(f1),
  };
}
137
+
138
/**
 * Tallies a confusion matrix for `predict` against each row's `binaryGate`
 * label, treating `"intervene"` as the positive class.
 *
 * Rows whose label is neither `"intervene"` nor `"continue"` are skipped, and
 * `predict` is not called for them.
 */
function confusion(rows: RouterTrainingRow[], predict: (row: RouterTrainingRow) => BinaryGateLabel): ConfusionMatrix {
  const matrix: ConfusionMatrix = { truePositive: 0, trueNegative: 0, falsePositive: 0, falseNegative: 0 };
  rows.forEach((row) => {
    const truth = isIntervene(row.labels.binaryGate);
    if (truth === null) return;

    const flagged = predict(row) === "intervene";
    if (truth) {
      if (flagged) matrix.truePositive += 1;
      else matrix.falseNegative += 1;
    } else if (flagged) {
      matrix.falsePositive += 1;
    } else {
      matrix.trueNegative += 1;
    }
  });
  return matrix;
}
151
+
152
/** Returns the 19 candidate thresholds 0.05, 0.10, …, 0.95, each rounded. */
function thresholdValues(): number[] {
  const values: number[] = [];
  for (let step = 0; step < 19; step++) {
    values.push(round(0.05 + step * 0.05));
  }
  return values;
}
155
+
156
/**
 * Selects rows usable as ground truth: the gate label is not `"unknown"` and
 * the label source is not `"local-rule"`.
 *
 * @param rows Candidate rows.
 * @param label Dataset name used as the prefix of error messages.
 * @returns The usable rows in their original order.
 * @throws Error when no row is usable, or when the usable rows do not include
 *   both a `"continue"` and an `"intervene"` label.
 */
function usableLabeledRows(rows: RouterTrainingRow[], label: string): RouterTrainingRow[] {
  const usable: RouterTrainingRow[] = [];
  let interveneCount = 0;
  let continueCount = 0;

  for (const row of rows) {
    const gate = row.labels.binaryGate;
    if (gate === "unknown" || row.labels.source === "local-rule") continue;
    usable.push(row);
    if (gate === "intervene") interveneCount += 1;
    else if (gate === "continue") continueCount += 1;
  }

  if (usable.length === 0) throw new Error(`${label} dataset has no usable teacher/human labeled rows`);
  if (interveneCount === 0 || continueCount === 0) throw new Error(`${label} dataset must contain both continue and intervene labels`);
  return usable;
}
164
+
165
/**
 * Counts rows labeled `"intervene"` and rows labeled `"continue"`; rows with
 * any other gate label are not counted.
 */
function labelCounts(rows: RouterTrainingRow[]): { positiveIntervene: number; negativeContinue: number } {
  let positiveIntervene = 0;
  let negativeContinue = 0;
  for (const row of rows) {
    const gate = row.labels.binaryGate;
    if (gate === "intervene") positiveIntervene += 1;
    else if (gate === "continue") negativeContinue += 1;
  }
  return { positiveIntervene, negativeContinue };
}
171
+
172
/**
 * Picks a decision threshold for the fixed-weight linear gate and evaluates it.
 *
 * The threshold is chosen on the training rows only: highest F1, ties broken
 * by higher accuracy, and the lowest threshold kept on a full tie. The chosen
 * threshold and the rule baseline are then scored on the eval rows.
 *
 * @param trainRows Rows used to select the threshold.
 * @param evalRows Rows used for the reported candidate and baseline metrics.
 * @param generatedAt Timestamp stamped on both outputs; defaults to now.
 * @returns The gate artifact and its evaluation report.
 * @throws Error when either dataset has no usable labeled rows or lacks one
 *   of the two classes.
 */
export function trainBinaryGate(trainRows: RouterTrainingRow[], evalRows: RouterTrainingRow[], generatedAt = new Date().toISOString()): { artifact: BinaryGateArtifact; report: BinaryGateEvalReport } {
  const trainLabeled = usableLabeledRows(trainRows, "training");
  const evalLabeled = usableLabeledRows(evalRows, "eval");

  const evaluateAt = (rows: RouterTrainingRow[], threshold: number) =>
    metrics(confusion(rows, (row) => scoreRow(row) >= threshold ? "intervene" : "continue"));

  // One sweep feeds both the report and the threshold selection.
  const thresholdSweep = thresholdValues().map((threshold) => {
    const train = evaluateAt(trainLabeled, threshold);
    const evaluation = evaluateAt(evalLabeled, threshold);
    return { threshold, trainAccuracy: train.accuracy, trainF1: train.f1, evalAccuracy: evaluation.accuracy, evalF1: evaluation.f1 };
  });

  // Selection looks at training metrics only, so the eval rows stay out-of-sample.
  const best = thresholdSweep.reduce(
    (winner, item) => item.trainF1 > winner.trainF1 || (item.trainF1 === winner.trainF1 && item.trainAccuracy > winner.trainAccuracy) ? item : winner,
    thresholdSweep[0],
  );

  const policyVersion = `pi-router.binary-gate.v1.threshold-${best.threshold}`;
  const trainCounts = labelCounts(trainLabeled);
  const evalCounts = labelCounts(evalLabeled);

  const artifact: BinaryGateArtifact = {
    schema: BINARY_GATE_ARTIFACT_SCHEMA,
    generatedAt,
    policyVersion,
    // Copy the weights so mutating a returned artifact cannot alter the module defaults.
    model: { kind: "linear-threshold", threshold: best.threshold, weights: { ...DEFAULT_WEIGHTS } },
    training: { rows: trainRows.length, labeledRows: trainLabeled.length, ...trainCounts },
    evaluation: { rows: evalRows.length, labeledRows: evalLabeled.length, ...evalCounts },
    manualPromotionRequired: true,
  };
  const report: BinaryGateEvalReport = {
    schema: BINARY_GATE_EVAL_SCHEMA,
    generatedAt,
    policyVersion,
    trainRows: trainRows.length,
    trainLabeledRows: trainLabeled.length,
    evalRows: evalRows.length,
    evalLabeledRows: evalLabeled.length,
    candidate: evaluateAt(evalLabeled, best.threshold),
    ruleBaseline: metrics(confusion(evalLabeled, ruleGate)),
    thresholdSweep,
    manualPromotionRequired: true,
  };
  return { artifact, report };
}
212
+
213
/**
 * Trains the binary gate from two JSONL datasets and writes the artifact and
 * evaluation report as pretty-printed JSON.
 *
 * @param options.trainingRowsPath JSONL file used to select the threshold.
 * @param options.evalRowsPath JSONL file used for evaluation; must resolve to
 *   a different path than the training file.
 * @param options.artifactPath Output path for the gate artifact.
 * @param options.reportPath Output path for the evaluation report.
 * @returns A summary holding the resolved output paths, row counts, and the
 *   chosen threshold.
 * @throws Error when both dataset paths resolve to the same file, when a
 *   dataset cannot be read, or when training rejects the labels.
 */
export function writeBinaryGateTraining(options: { trainingRowsPath: string; evalRowsPath: string; artifactPath: string; reportPath: string }): GateTrainSummary {
  // Validate the arguments before reading or parsing either dataset.
  if (resolve(options.trainingRowsPath) === resolve(options.evalRowsPath)) throw new Error("gate training requires a distinct --eval-dataset file for out-of-sample evaluation");

  const artifactPath = resolve(options.artifactPath);
  const reportPath = resolve(options.reportPath);

  // Pass the caller's paths through so read errors echo what the caller typed.
  const rows = readTrainingRows(options.trainingRowsPath);
  const evalRows = readTrainingRows(options.evalRowsPath);
  const { artifact, report } = trainBinaryGate(rows, evalRows);

  // Output directories are created only after training has succeeded.
  mkdirSync(dirname(artifactPath), { recursive: true });
  mkdirSync(dirname(reportPath), { recursive: true });
  writeFileSync(artifactPath, `${JSON.stringify(artifact, null, 2)}\n`);
  writeFileSync(reportPath, `${JSON.stringify(report, null, 2)}\n`);

  return {
    schema: "pi-router.binary-gate-train-summary.v1",
    artifact: artifactPath,
    report: reportPath,
    trainRows: rows.length,
    trainLabeledRows: artifact.training.labeledRows,
    evalRows: evalRows.length,
    evalLabeledRows: artifact.evaluation.labeledRows,
    threshold: artifact.model.threshold,
  };
}