@fiale-plus/pi-rogue 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/node_modules/@fiale-plus/pi-rogue-advisor/README.md +1 -0
  2. package/node_modules/@fiale-plus/pi-rogue-advisor/src/binary-gate-features.test.ts +8 -0
  3. package/node_modules/@fiale-plus/pi-rogue-advisor/src/binary-gate-features.ts +7 -0
  4. package/node_modules/@fiale-plus/pi-rogue-advisor/src/router.test.ts +26 -0
  5. package/node_modules/@fiale-plus/pi-rogue-advisor/src/router.ts +10 -1
  6. package/node_modules/@fiale-plus/pi-rogue-orchestration/README.md +3 -3
  7. package/node_modules/@fiale-plus/pi-rogue-orchestration/package.json +3 -0
  8. package/node_modules/@fiale-plus/pi-rogue-orchestration/skills/orchestration/SKILL.md +3 -2
  9. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/goal.test.ts +65 -2
  10. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/goal.ts +84 -4
  11. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/loop.ts +3 -0
  12. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/novelty-guard.test.ts +43 -0
  13. package/node_modules/@fiale-plus/pi-rogue-orchestration/src/novelty-guard.ts +96 -11
  14. package/node_modules/@fiale-plus/pi-rogue-router/README.md +45 -6
  15. package/node_modules/@fiale-plus/pi-rogue-router/src/binary-gate.test.ts +88 -0
  16. package/node_modules/@fiale-plus/pi-rogue-router/src/binary-gate.ts +232 -0
  17. package/node_modules/@fiale-plus/pi-rogue-router/src/cli.ts +123 -9
  18. package/node_modules/@fiale-plus/pi-rogue-router/src/completions.ts +39 -16
  19. package/node_modules/@fiale-plus/pi-rogue-router/src/config-extension.test.ts +111 -4
  20. package/node_modules/@fiale-plus/pi-rogue-router/src/config.ts +17 -2
  21. package/node_modules/@fiale-plus/pi-rogue-router/src/extension.ts +67 -7
  22. package/node_modules/@fiale-plus/pi-rogue-router/src/index.ts +4 -0
  23. package/node_modules/@fiale-plus/pi-rogue-router/src/observe.ts +76 -5
  24. package/node_modules/@fiale-plus/pi-rogue-router/src/outcomes.ts +130 -6
  25. package/node_modules/@fiale-plus/pi-rogue-router/src/reports.test.ts +92 -0
  26. package/node_modules/@fiale-plus/pi-rogue-router/src/reports.ts +116 -0
  27. package/node_modules/@fiale-plus/pi-rogue-router/src/sharpening.test.ts +223 -0
  28. package/node_modules/@fiale-plus/pi-rogue-router/src/sharpening.ts +344 -0
  29. package/node_modules/@fiale-plus/pi-rogue-router/src/teacher-runner.test.ts +126 -0
  30. package/node_modules/@fiale-plus/pi-rogue-router/src/teacher-runner.ts +238 -0
  31. package/node_modules/@fiale-plus/pi-rogue-router/src/v1-telemetry.test.ts +54 -1
  32. package/package.json +1 -1
@@ -0,0 +1,344 @@
1
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
2
+ import { dirname, resolve } from "node:path";
3
+ import { hashText } from "./hash.js";
4
+ import { readRouteEvents, type RouteEvent } from "./ledger.js";
5
+ import { readOutcomes, type RouterOutcome } from "./outcomes.js";
6
+ import type { ModelCapabilityCard } from "./learning.js";
7
+ import type { RouteAction, TaskStatus } from "./types.js";
8
+
9
/** Schema identifier written into the `schema` field of a sharpening-hints artifact. */
export const ROUTER_SHARPENING_HINTS_SCHEMA = "pi-router.sharpening-hints.v1" as const;

/** The families of hint that a sharpening artifact can contain. */
export type RouterSharpeningHintKind =
  | "prefer_model_for_action"
  | "savings_candidate"
  | "mismatch_followup";

/** Coarse confidence bucket attached to every hint. */
export type RouterSharpeningConfidence =
  | "low"
  | "medium"
  | "high";
13
+
14
/**
 * A single advisory hint about model routing, together with the guardrails
 * and provenance needed to judge how much weight it deserves.
 */
export interface RouterSharpeningHint {
  /** Identifier for this hint. */
  hintId: string;
  /** Which family of hint this is. */
  kind: RouterSharpeningHintKind;
  /** Route action the hint applies to; optional because some hints are not tied to one action. */
  action?: RouteAction;
  /** Model the hint is about. */
  modelId: string;
  /** Provider of the model, when known. */
  provider?: string;
  /** Coarse confidence bucket for the hint. */
  confidence: RouterSharpeningConfidence;
  /** Numeric score backing the hint. */
  score: number;
  /** Human-readable explanation of why the hint was emitted. */
  rationale: string;
  /** Safety metadata limiting how the hint may be used. */
  guardrails: {
    /** Always `true`: a hint is never promoted without a manual step. */
    manualPromotionOnly: true;
    sampleSizeCapped: boolean;
    /** Whether the evidence behind the hint is considered sparse. */
    sparse: boolean;
    /** Whether the hint could qualify for automatic use, and why or why not. */
    autoUse: {
      eligible: boolean;
      reason: string;
    };
  };
  /** Evidence summary describing where the hint came from. */
  provenance: {
    /** Number of route events behind the hint. */
    events: number;
    /** Number of sessions those events came from. */
    sessions: number;
    /** Number of outcomes linked to those events. */
    linkedOutcomes: number;
    /** Count of linked outcomes per task status. */
    outcomeStatus: Record<TaskStatus, number>;
    /** Route event ids supporting the hint. */
    eventIds: string[];
    /** Checkpoint ids supporting the hint. */
    checkpointIds: string[];
    /** Event count reported by a capability card for the model, when one was supplied. */
    cardEvents?: number;
    /** Other models this one was compared against, when applicable. */
    comparedWith?: Array<{
      modelId: string;
      provider?: string;
      score: number;
      events: number;
    }>;
  };
}
43
+
44
/** The document produced by a sharpening run: inputs, totals, policy constants and the hints themselves. */
export interface RouterSharpeningArtifact {
  /** Always the sharpening-hints schema identifier. */
  schema: typeof ROUTER_SHARPENING_HINTS_SCHEMA;
  /** Timestamp string recording when the artifact was generated. */
  generatedAt: string;
  /** Where the input data came from (events are required; outcomes and cards are optional). */
  inputs: {
    events: string;
    outcomes?: string;
    cards?: string;
  };
  /** Counts describing the size of the analysed inputs. */
  totals: {
    events: number;
    outcomes: number;
    cards: number;
    sessions: number;
    models: number;
  };
  /** Declared policy under which the hints may be consumed. */
  learningPolicy: {
    scope: "repo-local";
    ignoresRawTranscript: true;
    fallback: "baseline-router";
    minSessionsForAutoBias: number;
    minLinkedOutcomesForAutoBias: number;
    staleHintDecayRecommended: true;
  };
  /** The generated hints. */
  hints: RouterSharpeningHint[];
  /** Always `true`: consumers must promote hints manually. */
  manualPromotionRequired: true;
}
60
+
61
/** Aggregated statistics for one group of route events (per model, optionally per action). */
interface GroupStats {
  /** Action shared by the group, when the grouping is action-specific. */
  action?: RouteAction;
  /** Provider shared by the group, when known. */
  provider?: string;
  /** Model shared by the group. */
  modelId: string;
  /** The route events that make up the group. */
  events: RouteEvent[];
  /** Distinct session ids seen across the group's events. */
  sessions: Set<string>;
  /** Count of linked outcomes per task status. */
  outcomeStatus: Record<TaskStatus, number>;
  /** Outcomes that were matched to events in the group. */
  linkedOutcomes: RouterOutcome[];
  /** Mean progress score across the group's events. */
  averageProgressScore: number;
  /** Mean loop score across the group's events. */
  averageLoopScore: number;
  /** Combined score for the group. */
  score: number;
}
73
+
74
/**
 * Numeric weight assigned to each task status when averaging linked outcomes.
 * Ranges from 1 (success) down to 0 (failed).
 */
const OUTCOME_SCORE: Record<TaskStatus, number> = {
  success: 1,
  partial: 0.6,
  unknown: 0.45,
  abandoned: 0.15,
  failed: 0,
};
81
+
82
/**
 * Creates a fresh per-status counter object with every count at zero.
 *
 * @returns A new object on each call, so callers may mutate it freely.
 */
function emptyOutcomeStatus(): Record<TaskStatus, number> {
  const counters: Record<TaskStatus, number> = {
    success: 0,
    partial: 0,
    failed: 0,
    abandoned: 0,
    unknown: 0,
  };
  return counters;
}
85
+
86
/**
 * Rounds a number to at most three decimal places.
 *
 * @param value Number to round.
 * @returns The value formatted with three fixed decimals and converted back to a number.
 */
function round(value: number): number {
  const fixed = value.toFixed(3);
  return +fixed;
}
89
+
90
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values Numbers to average.
 * @returns The mean, or 0 when the list is empty.
 */
function average(values: number[]): number {
  if (values.length === 0) return 0;
  let total = 0;
  for (const value of values) total += value;
  return total / values.length;
}
93
+
94
/**
 * Takes the leading elements of a list as a bounded sample.
 *
 * @param values Source list; it is not modified.
 * @param max Upper bound passed to `slice` as the end index (default 8).
 * @returns A new array holding the first `max` elements in their original order.
 */
function stableSample<T>(values: T[], max = 8): T[] {
  const head = values.slice(0, max);
  return head;
}
97
+
98
/**
 * Builds the grouping key for a provider/model pair.
 *
 * @param provider Provider name; a missing provider is keyed as "unknown".
 * @param modelId Model identifier.
 * @returns The provider and model id joined by a NUL character.
 */
function modelKey(provider: string | undefined, modelId: string): string {
  const providerPart = provider ?? "unknown";
  return [providerPart, modelId].join("\0");
}
101
+
102
/**
 * Human-readable label for a model.
 *
 * @param provider Provider name, if any.
 * @param modelId Model identifier.
 * @returns `provider/modelId`, or just `modelId` when the provider is empty, missing or "unknown".
 */
function modelDisplay(provider: string | undefined, modelId: string): string {
  if (!provider || provider === "unknown") return modelId;
  return `${provider}/${modelId}`;
}
105
+
106
/**
 * Name-based heuristic for whether a model counts as local or cheap.
 *
 * @param modelId Model identifier.
 * @param provider Optional provider name; it is searched together with the model id.
 * @returns `true` when `provider/modelId` contains one of the known markers (case-insensitive).
 */
function isLocalOrCheap(modelId: string, provider?: string): boolean {
  const haystack = `${provider ?? ""}/${modelId}`;
  const cheapPattern = /(local|ollama|mlx|qwen|llama|mistral|phi|codex-spark|spark)/i;
  return cheapPattern.test(haystack);
}
109
+
110
/**
 * Buckets a group's evidence into a confidence level.
 *
 * @param events Number of route events in the group.
 * @param linkedOutcomes Number of outcomes linked to those events.
 * @param score Combined score of the group.
 * @returns "low" with fewer than 5 events, no linked outcomes, or a score under 0.65;
 *          "high" with at least 20 events, 5 linked outcomes and a score of 0.75 or more;
 *          otherwise "medium".
 */
function confidence(events: number, linkedOutcomes: number, score: number): RouterSharpeningConfidence {
  const tooThin = events < 5 || linkedOutcomes === 0 || score < 0.65;
  if (tooThin) return "low";
  const wellSupported = events >= 20 && linkedOutcomes >= 5 && score >= 0.75;
  return wellSupported ? "high" : "medium";
}
115
+
116
/**
 * Indexes outcomes for lookup by route event id or by checkpoint id.
 *
 * An outcome with a `routeEventId` is indexed only by that id. An outcome without one
 * is indexed by its `checkpointId`, if present. Later outcomes replace earlier ones
 * under the same key.
 *
 * @param outcomes Outcomes to index.
 * @returns The two lookup maps.
 */
function outcomeMaps(outcomes: RouterOutcome[]): { byEvent: Map<string, RouterOutcome>; byCheckpoint: Map<string, RouterOutcome> } {
  const byEvent = new Map<string, RouterOutcome>();
  const byCheckpoint = new Map<string, RouterOutcome>();
  for (const outcome of outcomes) {
    if (outcome.routeEventId) {
      byEvent.set(outcome.routeEventId, outcome);
    } else if (outcome.checkpointId) {
      byCheckpoint.set(outcome.checkpointId, outcome);
    }
  }
  return { byEvent, byCheckpoint };
}
122
+
123
/**
 * Aggregates a group of route events, and any outcomes linked to them, into statistics.
 *
 * An event is linked to an outcome by its event id first, then by its checkpoint id.
 * The signal score is the mean of average progress and `1 - average loop`.
 * The outcome score is the mean outcome weight of the linked outcomes, or the signal
 * score when nothing is linked. The final score weights them 0.65 / 0.35.
 *
 * @param action Action shared by the group, or `undefined` for action-agnostic groups.
 * @param provider Provider shared by the group, if known.
 * @param modelId Model shared by the group.
 * @param events Events in the group.
 * @param outcomes All known outcomes; only those matching the group's events are used.
 * @returns The computed statistics, with averages and score rounded to three decimals.
 */
function computeStats(action: RouteAction | undefined, provider: string | undefined, modelId: string, events: RouteEvent[], outcomes: RouterOutcome[]): GroupStats {
  const { byEvent, byCheckpoint } = outcomeMaps(outcomes);
  const linkedOutcomes: RouterOutcome[] = [];
  const outcomeStatus = emptyOutcomeStatus();
  for (const event of events) {
    const match = byEvent.get(event.eventId) ?? byCheckpoint.get(event.checkpointId);
    if (!match) continue;
    linkedOutcomes.push(match);
    outcomeStatus[match.taskStatus]++;
  }

  const progress = average(events.map((event) => event.metrics.progressScore));
  const loop = average(events.map((event) => event.metrics.loopScore));
  const signalScore = (progress + (1 - loop)) / 2;
  // Without linked outcomes the observed signal stands in for the outcome score.
  const outcomeScore = linkedOutcomes.length
    ? average(linkedOutcomes.map((outcome) => OUTCOME_SCORE[outcome.taskStatus]))
    : signalScore;
  const sessions = new Set(events.map((event) => event.sessionId));

  return {
    action,
    provider,
    modelId,
    events,
    sessions,
    outcomeStatus,
    linkedOutcomes,
    averageProgressScore: round(progress),
    averageLoopScore: round(loop),
    score: round((outcomeScore * 0.65) + (signalScore * 0.35)),
  };
}
148
+
149
/**
 * Groups events by (action, provider, model) and computes statistics for each group.
 *
 * Events without an active model are grouped under the model id "unknown". A provider
 * of "unknown" (including a missing one) is reported as `undefined` in the result.
 * Groups come back in order of first appearance in `events`.
 *
 * @param events Route events to group.
 * @param outcomes Outcomes used to link results to each group's events.
 * @returns One statistics record per distinct action/provider/model combination.
 */
function groupedByActionModel(events: RouteEvent[], outcomes: RouterOutcome[]): GroupStats[] {
  const groups = new Map<string, RouteEvent[]>();
  for (const event of events) {
    const modelId = event.runtime.activeModel ?? "unknown";
    const key = `${event.decision.action}\0${modelKey(event.runtime.provider, modelId)}`;
    // Append in place; re-copying the bucket for every event would be quadratic.
    const bucket = groups.get(key);
    if (bucket) bucket.push(event);
    else groups.set(key, [event]);
  }
  return [...groups.entries()].map(([key, group]) => {
    const [action, provider, modelId] = key.split("\0") as [RouteAction, string, string];
    return computeStats(action, provider === "unknown" ? undefined : provider, modelId, group, outcomes);
  });
}
161
+
162
/**
 * Groups events by (provider, model) across all actions and computes statistics for each group.
 *
 * Events without an active model are grouped under the model id "unknown". A provider
 * of "unknown" (including a missing one) is reported as `undefined` in the result.
 * Groups come back in order of first appearance in `events`, with `action` unset.
 *
 * @param events Route events to group.
 * @param outcomes Outcomes used to link results to each group's events.
 * @returns One statistics record per distinct provider/model combination.
 */
function groupedByModel(events: RouteEvent[], outcomes: RouterOutcome[]): GroupStats[] {
  const groups = new Map<string, RouteEvent[]>();
  for (const event of events) {
    const modelId = event.runtime.activeModel ?? "unknown";
    const key = modelKey(event.runtime.provider, modelId);
    // Append in place; re-copying the bucket for every event would be quadratic.
    const bucket = groups.get(key);
    if (bucket) bucket.push(event);
    else groups.set(key, [event]);
  }
  return [...groups.entries()].map(([key, group]) => {
    const [provider, modelId] = key.split("\0") as [string, string];
    return computeStats(undefined, provider === "unknown" ? undefined : provider, modelId, group, outcomes);
  });
}
174
+
175
/**
 * Reports whether a group's linked outcomes are exclusively negative.
 *
 * @param stats Group statistics to inspect.
 * @returns `true` when there is at least one linked outcome, at least one failed or
 *          abandoned outcome, and no successful or partial outcome.
 */
function hasPoorLinkedOutcomes(stats: GroupStats): boolean {
  if (stats.linkedOutcomes.length === 0) return false;
  const { failed, abandoned, success, partial } = stats.outcomeStatus;
  const negative = failed + abandoned;
  const positive = success + partial;
  return negative > 0 && positive === 0;
}
180
+
181
/**
 * Decides whether a hint may ever be considered for automatic use, and records why.
 *
 * The checks run in order and the first failing one supplies the reason:
 * diagnostic-only kind, confidence below "high", fewer than two sessions,
 * fewer than five linked outcomes, then exclusively negative linked outcomes.
 *
 * @param kind Hint kind being produced.
 * @param stats Statistics of the group behind the hint.
 * @param hintConfidence Confidence bucket already computed for the hint.
 * @returns The eligibility flag together with its explanation.
 */
function autoUse(kind: RouterSharpeningHintKind, stats: GroupStats, hintConfidence: RouterSharpeningConfidence): RouterSharpeningHint["guardrails"]["autoUse"] {
  let blocker: string | undefined;
  if (kind === "mismatch_followup") {
    blocker = "mismatch follow-up hints are diagnostic only";
  } else if (hintConfidence !== "high") {
    blocker = "requires high confidence before any future automatic bias";
  } else if (stats.sessions.size < 2) {
    blocker = "requires evidence from at least two sessions";
  } else if (stats.linkedOutcomes.length < 5) {
    blocker = "requires at least five linked outcomes";
  } else if (hasPoorLinkedOutcomes(stats)) {
    blocker = "poor linked outcomes suppress automatic bias";
  }

  if (blocker !== undefined) return { eligible: false, reason: blocker };
  return { eligible: true, reason: "eligible only for future bounded bias; baseline router remains fallback" };
}
189
+
190
/**
 * Assembles a hint record from group statistics.
 *
 * The hint id is a hash of the kind, action, provider, model, event count and score.
 * Event and checkpoint ids in the provenance are de-duplicated, sorted and truncated to
 * a bounded sample. A group counts as sparse when it has fewer than five events, no
 * linked outcomes, or fewer than two sessions.
 *
 * @param kind Hint kind.
 * @param stats Statistics of the group the hint describes.
 * @param rationale Human-readable explanation stored on the hint.
 * @param comparedWith Optional comparison entries; omitted from the provenance when empty.
 * @param cardEvents Optional capability-card event count; omitted when `undefined`.
 * @returns The fully populated hint.
 */
function baseHint(kind: RouterSharpeningHintKind, stats: GroupStats, rationale: string, comparedWith?: RouterSharpeningHint["provenance"]["comparedWith"], cardEvents?: number): RouterSharpeningHint {
  const uniqueSorted = (ids: string[]): string[] => [...new Set(ids)].sort();

  const eventCount = stats.events.length;
  const linkedCount = stats.linkedOutcomes.length;
  const sessionCount = stats.sessions.size;

  const eventIds = stableSample(uniqueSorted(stats.events.map((event) => event.eventId)));
  const checkpointIds = stableSample(uniqueSorted(stats.events.map((event) => event.checkpointId)));
  const sparse = eventCount < 5 || linkedCount === 0 || sessionCount < 2;
  const hintConfidence = confidence(eventCount, linkedCount, stats.score);

  const hintId = hashText("sharpen", kind, stats.action ?? "any", stats.provider ?? "unknown", stats.modelId, String(eventCount), String(stats.score));

  const provenance: RouterSharpeningHint["provenance"] = {
    events: eventCount,
    sessions: sessionCount,
    linkedOutcomes: linkedCount,
    outcomeStatus: stats.outcomeStatus,
    eventIds,
    checkpointIds,
  };
  // Optional fields are only attached when they carry information.
  if (cardEvents !== undefined) provenance.cardEvents = cardEvents;
  if (comparedWith?.length) provenance.comparedWith = comparedWith;

  return {
    hintId,
    kind,
    action: stats.action,
    modelId: stats.modelId,
    provider: stats.provider,
    confidence: hintConfidence,
    score: stats.score,
    rationale,
    guardrails: {
      manualPromotionOnly: true,
      sampleSizeCapped: sparse,
      sparse,
      autoUse: autoUse(kind, stats, hintConfidence),
    },
    provenance,
  };
}
217
+
218
/**
 * Loads model capability cards from a JSON-lines file.
 *
 * Blank lines are skipped. Every other line must parse as JSON and carry the
 * capability-card schema identifier.
 *
 * @param path File to read; when omitted or empty, no cards are loaded.
 * @returns The parsed cards in file order.
 * @throws Error when the file does not exist, or when a line is not valid JSON or has the
 *         wrong schema. The reported line number is the 1-based line in the file, counting
 *         blank lines.
 */
function readCapabilityCards(path?: string): ModelCapabilityCard[] {
  if (!path) return [];
  const resolved = resolve(path);
  if (!existsSync(resolved)) throw new Error(`capability cards file not found: ${path}`);

  const cards: ModelCapabilityCard[] = [];
  const lines = readFileSync(resolved, "utf8").split("\n");
  // Iterate over the unfiltered lines so the index matches the real file line.
  for (const [index, line] of lines.entries()) {
    if (!line.trim()) continue;
    try {
      // `null` is valid JSON, so guard the property access.
      const card = JSON.parse(line) as ModelCapabilityCard | null;
      if (card?.schema !== "pi-router.model-capability-card.v1") throw new Error("invalid schema");
      cards.push(card);
    } catch (error) {
      throw new Error(`invalid capability card at line ${index + 1}: ${error instanceof Error ? error.message : String(error)}`);
    }
  }
  return cards;
}
235
+
236
/**
 * Indexes capability cards by their provider/model key.
 *
 * @param cards Cards to index; a later card replaces an earlier one with the same key.
 * @returns Map from provider/model key to card.
 */
function cardMap(cards: ModelCapabilityCard[]): Map<string, ModelCapabilityCard> {
  const index = new Map<string, ModelCapabilityCard>();
  for (const card of cards) {
    index.set(modelKey(card.provider, card.modelId), card);
  }
  return index;
}
239
+
240
/**
 * Derives advisory sharpening hints from route events, optional outcomes and optional
 * capability cards.
 *
 * Three hint families are produced:
 * - `prefer_model_for_action`: for each action observed on at least two models, the
 *   top-scoring model, provided it leads the runner-up by 0.05 or more.
 * - `savings_candidate`: for each model that looks local or cheap, has at least three
 *   events, progress of 0.65 or more, loop of 0.35 or less, a score of 0.65 or more, no
 *   exclusively negative linked outcomes, and (when a card exists) card averages within
 *   the same progress and loop bounds.
 * - `mismatch_followup`: for each action/model combination with events that were
 *   explicitly not followed or were overridden.
 *
 * Hints are returned sorted by kind, then descending score, then action, then model label.
 *
 * @param options.events Route events to analyse; the array is copied, not mutated.
 * @param options.outcomes Outcomes to link to events (default: none).
 * @param options.cards Capability cards to consult (default: none).
 * @param options.generatedAt Timestamp to record; defaults to the current time as an ISO string.
 * @param options.inputs Input descriptors to record; defaults to `{ events: "<memory>" }`.
 * @returns The sharpening artifact.
 */
export function generateSharpeningHints(options: { events: RouteEvent[]; outcomes?: RouterOutcome[]; cards?: ModelCapabilityCard[]; generatedAt?: string; inputs?: RouterSharpeningArtifact["inputs"] }): RouterSharpeningArtifact {
  const events = [...options.events].sort((a, b) => a.eventId.localeCompare(b.eventId));
  const outcomes = options.outcomes ?? [];
  const cards = options.cards ?? [];
  const byCard = cardMap(cards);
  const hints: RouterSharpeningHint[] = [];

  // --- prefer_model_for_action: compare models within each action -------------------
  const byAction = new Map<RouteAction, GroupStats[]>();
  for (const stats of groupedByActionModel(events, outcomes)) {
    const action = stats.action;
    if (action === undefined) continue; // action-model groups always carry an action
    const bucket = byAction.get(action);
    if (bucket) bucket.push(stats);
    else byAction.set(action, [stats]);
  }

  for (const [action, groups] of [...byAction.entries()].sort(([a], [b]) => a.localeCompare(b))) {
    const sorted = groups.sort((a, b) => b.score - a.score || b.events.length - a.events.length || modelDisplay(a.provider, a.modelId).localeCompare(modelDisplay(b.provider, b.modelId)));
    if (sorted.length < 2) continue;
    const [best, runnerUp] = sorted;
    // Round the lead before comparing: raw float subtraction makes 0.7 - 0.65 fall just
    // below 0.05 while 0.75 - 0.70 lands just above it.
    if (!best || !runnerUp || round(best.score - runnerUp.score) < 0.05) continue;
    const comparedWith = sorted.slice(1, 4).map((stats) => ({ modelId: stats.modelId, provider: stats.provider, score: stats.score, events: stats.events.length }));
    hints.push(baseHint(
      "prefer_model_for_action",
      best,
      `${modelDisplay(best.provider, best.modelId)} leads historical ${action} samples (score ${best.score}, progress ${best.averageProgressScore}, loop ${best.averageLoopScore}) over ${modelDisplay(runnerUp.provider, runnerUp.modelId)} (score ${runnerUp.score}).`,
      comparedWith,
      byCard.get(modelKey(best.provider, best.modelId))?.observed.events,
    ));
  }

  // --- savings_candidate: local/cheap models with healthy signals -------------------
  for (const stats of groupedByModel(events, outcomes).sort((a, b) => b.score - a.score || modelDisplay(a.provider, a.modelId).localeCompare(modelDisplay(b.provider, b.modelId)))) {
    const card = byCard.get(modelKey(stats.provider, stats.modelId));
    const cardProgressOk = card ? card.observed.averageProgressScore >= 0.65 && card.observed.averageLoopScore <= 0.35 : true;
    if (!isLocalOrCheap(stats.modelId, stats.provider) || stats.events.length < 3 || stats.averageProgressScore < 0.65 || stats.averageLoopScore > 0.35 || stats.score < 0.65 || hasPoorLinkedOutcomes(stats) || !cardProgressOk) continue;
    hints.push(baseHint(
      "savings_candidate",
      stats,
      `${modelDisplay(stats.provider, stats.modelId)} looks safe to keep exploring for routine/worker traffic: ${stats.events.length} events, progress ${stats.averageProgressScore}, loop ${stats.averageLoopScore}. This is a manual hint, not an automatic promotion.`,
      undefined,
      card?.observed.events,
    ));
  }

  // --- mismatch_followup: recommendations that were ignored or overridden -----------
  const overridden = events.filter((event) => event.observed.followed === false || event.observed.overriddenBy);
  if (overridden.length > 0) {
    const groups = new Map<string, RouteEvent[]>();
    for (const event of overridden) {
      const modelId = event.runtime.activeModel ?? "unknown";
      const key = `${event.decision.action}\0${modelKey(event.runtime.provider, modelId)}`;
      const bucket = groups.get(key);
      if (bucket) bucket.push(event);
      else groups.set(key, [event]);
    }
    // Order by the group key explicitly; a bare sort() would stringify each whole
    // [key, events] tuple on every comparison.
    const orderedGroups = [...groups.entries()].sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
    for (const [key, group] of orderedGroups) {
      const [action, provider, modelId] = key.split("\0") as [RouteAction, string, string];
      const stats = computeStats(action, provider === "unknown" ? undefined : provider, modelId, group, outcomes);
      hints.push(baseHint(
        "mismatch_followup",
        stats,
        `${group.length} ${action} observations were explicitly not followed or overridden on ${modelDisplay(stats.provider, stats.modelId)}; inspect before trusting future auto-routing for this slice.`,
      ));
    }
  }

  const sortedHints = hints.sort((a, b) => {
    const kind = a.kind.localeCompare(b.kind);
    if (kind) return kind;
    return b.score - a.score || (a.action ?? "").localeCompare(b.action ?? "") || modelDisplay(a.provider, a.modelId).localeCompare(modelDisplay(b.provider, b.modelId));
  });

  return {
    schema: ROUTER_SHARPENING_HINTS_SCHEMA,
    generatedAt: options.generatedAt ?? new Date().toISOString(),
    inputs: options.inputs ?? { events: "<memory>" },
    totals: {
      events: events.length,
      outcomes: outcomes.length,
      cards: cards.length,
      sessions: new Set(events.map((event) => event.sessionId)).size,
      models: new Set(events.map((event) => modelKey(event.runtime.provider, event.runtime.activeModel ?? "unknown"))).size,
    },
    learningPolicy: {
      scope: "repo-local",
      ignoresRawTranscript: true,
      fallback: "baseline-router",
      minSessionsForAutoBias: 2,
      minLinkedOutcomesForAutoBias: 5,
      staleHintDecayRecommended: true,
    },
    hints: sortedHints,
    manualPromotionRequired: true,
  };
}
327
+
328
/**
 * Reads route events (plus optional outcomes and capability cards) from disk, generates
 * sharpening hints, and writes the artifact as pretty-printed JSON.
 *
 * @param options.eventsPath Route events file; it must exist.
 * @param options.outputPath Destination file; missing parent directories are created.
 * @param options.outcomesPath Optional outcomes file.
 * @param options.cardsPath Optional capability cards file.
 * @param options.generatedAt Optional timestamp to record in the artifact.
 * @returns The artifact that was written.
 * @throws Error when the route events file does not exist.
 */
export function writeSharpeningHints(options: { eventsPath: string; outputPath: string; outcomesPath?: string; cardsPath?: string; generatedAt?: string }): RouterSharpeningArtifact {
  const { eventsPath, outputPath, outcomesPath, cardsPath, generatedAt } = options;
  if (!existsSync(resolve(eventsPath))) {
    throw new Error(`required route events file not found: ${eventsPath}`);
  }

  const artifact = generateSharpeningHints({
    events: readRouteEvents(eventsPath),
    outcomes: readOutcomes(outcomesPath),
    cards: readCapabilityCards(cardsPath),
    generatedAt,
    inputs: { events: eventsPath, outcomes: outcomesPath, cards: cardsPath },
  });

  const destination = resolve(outputPath);
  mkdirSync(dirname(destination), { recursive: true });
  const serialized = `${JSON.stringify(artifact, null, 2)}\n`;
  writeFileSync(destination, serialized);
  return artifact;
}
@@ -0,0 +1,126 @@
1
+ import { mkdtempSync, readFileSync, writeFileSync } from "node:fs";
2
+ import { tmpdir } from "node:os";
3
+ import { join } from "node:path";
4
+ import { describe, expect, it } from "vitest";
5
+ import { parseTeacherDecision, runTeacherLabeling, teacherPromptText, type TeacherModelExecutor } from "./teacher-runner.js";
6
+ import type { TeacherPromptRequest } from "./learning.js";
7
+
8
/**
 * Returns a path for `name` inside a freshly created temporary directory.
 * The file itself is not created.
 */
function tempFile(name: string): string {
  const scratchDir = mkdtempSync(join(tmpdir(), "pi-router-teacher-"));
  return join(scratchDir, name);
}
11
+
12
/**
 * Builds a teacher prompt request fixture.
 *
 * @param overrides Fields that replace the fixture defaults (shallow merge).
 */
function request(overrides: Partial<TeacherPromptRequest> = {}): TeacherPromptRequest {
  const defaults: TeacherPromptRequest = {
    schema: "pi-router.teacher-prompt.v1",
    requestId: "request-1",
    teacher: "openai-codex/gpt-5.5",
    checkpointId: "session-1:event-1",
    sessionId: "session-1",
    rawSessionRef: {
      schema: "pi-router.raw-session-ref.v1",
      path: "/tmp/session.jsonl",
      fromEvent: 1,
      toEvent: 2,
      fromByte: 10,
      toByte: 20,
      contentHash: "hash-only",
    },
    allowedActions: ["continue_current", "run_verifier", "escalate_debug_diagnosis"],
    instruction: "Return one decision.",
    features: {
      phase: "debug",
      activeModel: "qwen3.6-35b-a3b-128k",
      provider: "local",
      loopScore: 0.7,
      progressScore: 0.3,
      sameCommandRepeatedCount: 1,
      sameErrorRepeatedCount: 2,
      verifierUsed: false,
      noVerifierUsed: true,
      diffLines: 12,
      diffFilesChanged: 2,
    },
  };
  return { ...defaults, ...overrides };
}
38
+
39
/**
 * Serialises a teacher decision fixture to JSON.
 *
 * @param action Action to place in the decision (default "run_verifier").
 */
function decisionJson(action = "run_verifier") {
  const decision = {
    schema: "pi-router.decision.v1",
    checkpointId: "session-1:event-1",
    action,
    adviceShape: "none",
    contextPolicy: "minimal",
    confidence: 0.82,
    reason: "teacher says verifier should run before more edits",
    policyVersion: "teacher/openai-codex/gpt-5.5",
  };
  return JSON.stringify(decision);
}
51
+
52
// Test suite for the teacher label runner: prompt construction, decision parsing and
// validation, and the end-to-end labeling run (with an injected executor and in dry-run mode).
describe("router teacher label runner", () => {
  it("builds explicit teacher prompts with only the bounded raw session span", () => {
    const transcriptPath = tempFile("session.jsonl");
    writeFileSync(transcriptPath, "0123456789bounded-span-secret-tail");
    // Bytes 10..22 cover "bounded-span" only; the tail must not appear in the prompt.
    const boundedRef = {
      schema: "pi-router.raw-session-ref.v1",
      path: transcriptPath,
      fromEvent: 1,
      toEvent: 2,
      fromByte: 10,
      toByte: 22,
      contentHash: "hash-only",
    };
    const promptText = teacherPromptText(request({ rawSessionRef: boundedRef }));

    expect(promptText).toContain("Return exactly one JSON object");
    expect(promptText).toContain("run_verifier");
    expect(promptText).toContain("rawSessionRef");
    expect(promptText).toContain("bounded-span");
    expect(promptText).not.toContain("secret-tail");
  });

  it("parses and validates teacher decisions", () => {
    // A reply wrapped in a fenced json block with leading blank lines.
    const fencedReply = "\n\n```json\n" + decisionJson() + "\n```";
    const decision = parseTeacherDecision(request(), fencedReply);

    expect(decision).toMatchObject({
      schema: "pi-router.decision.v1",
      checkpointId: "session-1:event-1",
      action: "run_verifier",
      policyVersion: "teacher/openai-codex/gpt-5.5/request/request-1",
    });

    const parseDisallowedAction = () => parseTeacherDecision(request(), decisionJson("stop_and_ask_user"));
    expect(parseDisallowedAction).toThrow(/not allowed/);

    const parseIncompleteDecision = () => parseTeacherDecision(request(), JSON.stringify({
      schema: "pi-router.decision.v1",
      checkpointId: "session-1:event-1",
      action: "run_verifier",
      confidence: 0.8,
      reason: "missing fields",
    }));
    expect(parseIncompleteDecision).toThrow(/adviceShape invalid/);

    const noisyReply = JSON.stringify({
      ...JSON.parse(decisionJson()),
      reason: "The transcript says \"this is a very long raw transcript quote that should not be stored in labels\" and token=secret",
      transcriptExcerpt: "do not persist me",
      policyVersion: "model-supplied",
    });
    const sanitized = parseTeacherDecision(request(), noisyReply);
    expect(sanitized.policyVersion).toBe("teacher/openai-codex/gpt-5.5/request/request-1");
    expect(JSON.stringify(sanitized)).not.toContain("transcriptExcerpt");
    expect(sanitized.reason).not.toContain("very long raw transcript quote");
    expect(sanitized.reason).not.toContain("token=secret");
  });

  it("runs an injected teacher executor and writes decisions plus labels", async () => {
    const requestsPath = tempFile("requests.jsonl");
    const decisionsOutputPath = tempFile("teacher-decisions.jsonl");
    const labelsOutputPath = tempFile("teacher-labels.jsonl");
    writeFileSync(requestsPath, `${JSON.stringify(request())}\n`);

    const executor: TeacherModelExecutor = ({ prompt }) => {
      expect(prompt).toContain("session-1:event-1");
      return decisionJson();
    };

    const summary = await runTeacherLabeling({
      requestsPath,
      decisionsOutputPath,
      labelsOutputPath,
      executor,
      generatedAt: "2026-06-14T00:00:00.000Z",
    });

    expect(summary).toMatchObject({
      schema: "pi-router.teacher-run-summary.v1",
      teacher: "openai-codex/gpt-5.5",
      teachers: ["openai-codex/gpt-5.5"],
      requests: 1,
      decisions: 1,
      labels: 1,
      dryRun: false,
    });
    expect(readFileSync(decisionsOutputPath, "utf8")).toContain("pi-router.decision.v1");
    const writtenLabel = JSON.parse(readFileSync(labelsOutputPath, "utf8").trim());
    expect(writtenLabel).toMatchObject({
      schema: "pi-router.teacher-label.v1",
      source: "teacher-output",
      suggestedAction: "run_verifier",
    });
  });

  it("supports dry-run without model calls", async () => {
    const requestsPath = tempFile("requests.jsonl");
    const decisionsOutputPath = tempFile("teacher-decisions.jsonl");
    const labelsOutputPath = tempFile("teacher-labels.jsonl");
    writeFileSync(requestsPath, `${JSON.stringify(request())}\n`);

    const summary = await runTeacherLabeling({
      requestsPath,
      decisionsOutputPath,
      labelsOutputPath,
      dryRun: true,
    });

    expect(summary).toMatchObject({ requests: 1, decisions: 0, labels: 0, dryRun: true });
    expect(readFileSync(decisionsOutputPath, "utf8")).toBe("");
    expect(readFileSync(labelsOutputPath, "utf8")).toBe("");
  });
});