@booplex/bpx-consult 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/LICENSE +21 -0
- package/README.md +175 -0
- package/index.ts +112 -0
- package/package.json +54 -0
- package/prompts/advisor-system.txt +28 -0
- package/src/advisor.ts +137 -0
- package/src/cli-backend.ts +256 -0
- package/src/config.ts +422 -0
- package/src/consensus.ts +173 -0
- package/src/context-engine.ts +395 -0
- package/src/council.ts +429 -0
- package/src/debate.ts +292 -0
- package/src/messages.ts +49 -0
- package/src/personas.ts +163 -0
- package/src/solo.ts +205 -0
- package/src/timeout.ts +87 -0
- package/src/triggers.ts +190 -0
package/src/consensus.ts
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* consensus — stance validation + confidence scoring.
|
|
3
|
+
*
|
|
4
|
+
* The two mechanics that give "no fake consensus" actual teeth, both lifted
|
|
5
|
+
* from my-zen's tools/consensus.py:
|
|
6
|
+
*
|
|
7
|
+
* - stance validation: did a member actually hold its assigned stance, or did
|
|
8
|
+
* it return mush? A `critic` that agrees with everything is theater.
|
|
9
|
+
* - confidence score: 0.4·success + 0.35·agreement + 0.25·avg_alignment,
|
|
10
|
+
* surfaced with the synthesis so you can see how solid the read is.
|
|
11
|
+
*
|
|
12
|
+
* Cheap and heuristic by design — these are signals, not verdicts. The
|
|
13
|
+
* synthesizer model still does the real merging; we just give it (and you)
|
|
14
|
+
* honest metadata about what the members actually did.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import type { Stance } from "./personas.js";
|
|
18
|
+
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Member result shape
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
/** Outcome of one council member's turn, as consumed by the scoring helpers below. */
export interface MemberResult {
  /** Persona label; used to name the member in the disagreement note. */
  persona: string;
  /** Stance recorded for this member. */
  stance: Stance;
  /** Identifier of the model that produced the reply. */
  model: string;
  /** "ok" when the member replied with usable text, "error" otherwise. */
  status: "ok" | "error";
  /** The reply text. */
  text: string;
  /** Optional failure description. */
  errorMessage?: string;
  /** Stance-alignment score in 0..1: did the reply reflect the assigned stance? */
  alignment: number;
  /** Optional token accounting for the call. */
  usage?: { input: number; output: number; total: number };
}
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Stance validation
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
/**
 * Heuristic stance-alignment check. Returns a 0..1 score indicating how well
 * the reply reflects the assigned stance.
 *
 * Keyword based (no extra model call). Signals are matched case-insensitively
 * at the START of a word, so inflections still count ("fails", "risky",
 * "flawed") but negated or unrelated words that merely contain a signal do
 * not: "disagree", "incorrect", "unsound", "unreasonable", "frameworks" and
 * "benefits" are no longer read as advocacy.
 *
 * Scores:
 *   - empty / whitespace-only text: 0
 *   - `against`: 1 with any critique signal; 0.3 with advocacy only; else 0.6
 *   - `for`:     1 with any advocacy signal; 0.5 with critique only; else 0.6
 *   - neutral:   1 with both signal kinds; 0.7 with one kind; else 0.6
 *
 * @param text   The member's reply.
 * @param stance The stance the member was assigned.
 * @returns Alignment score in 0..1.
 */
export function validateStance(text: string, stance: Stance): number {
  const t = text.toLowerCase();
  if (!t.trim()) return 0;

  const critiqueSignals = [
    "flaw", "risk", "won't", "won’t", "breaks", "broken", "wrong", "assumption",
    "failure", "fail", "missing", "edge case", "fragile", "beware", "problem",
    "issue", "concern", "gap", "unclear", "danger", "unhandled",
  ];
  const advocacySignals = [
    "sound", "works", "solid", "agree", "good", "correct", "appropriate",
    "reasonable", "holds up", "make sense", "fits", "right call",
  ];

  // Build one alternation per signal set, anchored on a leading word boundary.
  // Signals are escaped so a future entry with regex metacharacters stays literal.
  const escapeForRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const mentionsAny = (signals: readonly string[]): boolean =>
    new RegExp("\\b(?:" + signals.map(escapeForRegExp).join("|") + ")").test(t);

  const hasCritique = mentionsAny(critiqueSignals);
  const hasAdvocacy = mentionsAny(advocacySignals);

  if (stance === "against") {
    if (hasCritique) return 1;
    if (hasAdvocacy) return 0.3; // critic that only praised — suspect
    return 0.6;
  }
  if (stance === "for") {
    if (hasAdvocacy) return 1;
    if (hasCritique) return 0.5; // advocate that only attacked — forgot its job
    return 0.6;
  }
  // neutral: weighing both sides is the win
  if (hasCritique && hasAdvocacy) return 1;
  if (hasCritique || hasAdvocacy) return 0.7;
  return 0.6;
}
|
|
89
|
+
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
// Confidence score (my-zen formula)
|
|
92
|
+
// ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
/** Components of the consensus confidence score, each rounded to two decimals. */
export interface ConfidenceBreakdown {
  /** Share of members whose status is "ok". */
  successRatio: number;
  /** Size of the largest stance group divided by the number of successful members. */
  agreementRatio: number;
  /** Mean alignment score across successful members. */
  avgAlignment: number;
  /** Weighted total in 0..1: 0.4·success + 0.35·agreement + 0.25·avgAlignment. */
  confidence: number;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Score how solid a council read is.
 *
 * Three ratios are combined as 0.4·success + 0.35·agreement + 0.25·alignment:
 *   - successRatio: successful ("ok") members / all members.
 *   - agreementRatio: largest group of successful members sharing the same
 *     `stance` value / successful members.
 *   - avgAlignment: mean `alignment` of the successful members.
 *
 * Returns all zeros when there are no members or none succeeded. Every field
 * of the result is rounded to two decimals; `confidence` is computed from the
 * unrounded ratios.
 *
 * NOTE(review): agreement is tallied from the `stance` field on each result —
 * confirm with the caller whether that is the assigned stance or the stance
 * the member ended up taking.
 */
export function computeConfidence(results: MemberResult[]): ConfidenceBreakdown {
  const okMembers = results.filter((member) => member.status === "ok");
  if (results.length === 0 || okMembers.length === 0) {
    return { successRatio: 0, agreementRatio: 0, avgAlignment: 0, confidence: 0 };
  }

  const successRatio = okMembers.length / results.length;

  // Tally successful members per stance; the biggest bucket drives agreement.
  const tally: Record<Stance, number> = { for: 0, against: 0, neutral: 0 };
  okMembers.forEach((member) => {
    tally[member.stance]++;
  });
  const agreementRatio = Math.max(...Object.values(tally)) / okMembers.length;

  let alignmentSum = 0;
  for (const member of okMembers) alignmentSum += member.alignment;
  const avgAlignment = alignmentSum / okMembers.length;

  return {
    successRatio: round2(successRatio),
    agreementRatio: round2(agreementRatio),
    avgAlignment: round2(avgAlignment),
    confidence: round2(successRatio * 0.4 + agreementRatio * 0.35 + avgAlignment * 0.25),
  };
}
|
|
146
|
+
|
|
147
|
+
/** Round to two decimal places using Math.round on the value scaled by 100. */
function round2(n: number): number {
  const hundredths = Math.round(n * 100);
  return hundredths / 100;
}
|
|
150
|
+
|
|
151
|
+
// ---------------------------------------------------------------------------
|
|
152
|
+
// Disagreement detection — surface it, don't paper over it
|
|
153
|
+
// ---------------------------------------------------------------------------
|
|
154
|
+
|
|
155
|
+
/**
 * Report a material split in the council, if there is one.
 *
 * A member "held" its stance when its status is "ok" and its alignment is at
 * least 0.7. If at least one `for` member and at least one `against` member
 * both held, a note naming them is returned for the synthesizer prompt.
 * Neutral members are not considered.
 *
 * @returns The note, or undefined when there is no such split.
 */
export function detectDisagreement(results: MemberResult[]): string | undefined {
  // Personas of successful members with the given stance and alignment >= 0.7.
  const personasHolding = (stance: MemberResult["stance"]): string[] =>
    results
      .filter((member) => member.status === "ok" && member.stance === stance && member.alignment >= 0.7)
      .map((member) => member.persona);

  const advocates = personasHolding("for");
  const critics = personasHolding("against");
  if (advocates.length === 0 || critics.length === 0) return undefined;

  const forNames = advocates.join(", ");
  const againstNames = critics.join(", ");
  return `The council split: ${forNames} advocated FOR; ${againstNames} pushed BACK. Surface this disagreement honestly in your synthesis — do not manufacture a false consensus.`;
}
|
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* context-engine — the §P fix.
|
|
3
|
+
*
|
|
4
|
+
* rpiv-advisor forwards Pi's already-compacted session context to the advisor
|
|
5
|
+
* model without re-fitting it to the advisor's *own* window. The executor's
|
|
6
|
+
* compacted context can be larger than a small-window advisor (flash-tier 32k,
|
|
7
|
+
* or a CLI at 32–64k), so the advisor call overflows *its* window and dies —
|
|
8
|
+
* exactly when the session is long enough to need it.
|
|
9
|
+
*
|
|
10
|
+
* This module re-fits Pi's compacted context to whatever window *this* advisor
|
|
11
|
+
* has. The pipeline (SPEC §C):
|
|
12
|
+
* 1. strip in-flight consult() call (lifted from rpiv-advisor/context.ts)
|
|
13
|
+
* 2. extract user/assistant/tool text (lifted from pi-advisor/advisor-messages.ts)
|
|
14
|
+
* 3. [fast-follow] stage + signal detection — not here yet, doesn't affect fit
|
|
15
|
+
* 4. [fast-follow] signal block
|
|
16
|
+
* 5. per-message char caps with [omitted] markers (pi-advisor clampText)
|
|
17
|
+
* 6. sliding window: keep first N + last M, drop oldest-first when still over
|
|
18
|
+
* 7. reserve response tokens (load-bearing — see invariant below)
|
|
19
|
+
* 8. assemble final Message[] + a closing context message
|
|
20
|
+
*
|
|
21
|
+
* Window-fit does NOT depend on stage/signal detection. Those improve the
|
|
22
|
+
* directive, not the fit. Ship the guaranteed-fit core first.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import type { Message, UserMessage, AssistantMessage, ToolResultMessage, TextContent } from "@earendil-works/pi-ai";
|
|
26
|
+
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// Public types
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
/** Limits applied when re-fitting the session transcript for an advisor call. */
export interface ContextBudget {
  /** Character cap for user text (a string body, or each text block). */
  userChars: number;
  /** Character cap for each assistant text block. */
  assistantChars: number;
  /** Character cap for a tool call's JSON-serialized arguments. */
  toolArgChars: number;
  /** Character cap for each text block of a tool result. */
  toolResultChars: number;
  /** Number of leading messages the sliding window tries to keep. */
  keepFirst: number;
  /** Number of trailing messages the sliding window starts from before shrinking. */
  keepLast: number;
  /** Tokens reserved for the advisor's reply. The input budget is window minus this. */
  responseReserveTokens: number;
}
|
|
41
|
+
|
|
42
|
+
/** Output of the window-fitting pass. */
export interface FitResult {
  /** The re-fitted messages, intended to fit within `maxInputTokens`. */
  messages: Message[];
  /** Number of messages dropped by the sliding window (0 when nothing was dropped). */
  omittedCount: number;
  /** Estimated token count of the final payload, for diagnostics. */
  estimatedTokens: number;
  /** The token budget the fit was computed against. */
  maxInputTokens: number;
}
|
|
52
|
+
|
|
53
|
+
// ---------------------------------------------------------------------------
|
|
54
|
+
// Token estimation
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
|
|
57
|
+
/**
 * Conservative token estimate from character count.
 *
 * English prose averages roughly 4 chars/token, but most of what gets
 * forwarded (tool arguments, tool results, code) tokenizes more densely.
 * Assuming 4 would undercount tokens on code-heavy sessions and risk
 * overflowing the advisor window, so the estimate uses 3 chars/token and then
 * multiplies by a 1.15 safety factor for tokenizer variance between
 * providers. Overestimating only costs some unused room.
 *
 * A real tokenizer is deliberately not used: it would be a heavy dependency,
 * providers tokenize differently anyway, and the estimate only has to be
 * conservative enough for the cap + window pass to land under budget.
 */
const CHARS_PER_TOKEN = 3;
const SAFETY_FACTOR = 1.15;

/**
 * Estimate the token count of `text`.
 *
 * @returns 0 for an empty string, otherwise
 *          ceil((length / CHARS_PER_TOKEN) * SAFETY_FACTOR).
 */
export function estimateTokens(text: string): number {
  if (text) {
    const rawTokens = text.length / CHARS_PER_TOKEN;
    return Math.ceil(rawTokens * SAFETY_FACTOR);
  }
  return 0;
}
|
|
82
|
+
|
|
83
|
+
/** Estimate a message's tokens from its flattened text (see stringifyMessageForEstimate). */
export function estimateMessageTokens(msg: Message): number {
  const flattened = stringifyMessageForEstimate(msg);
  return estimateTokens(flattened);
}
|
|
87
|
+
|
|
88
|
+
/**
 * Flatten a message into one newline-joined string for token estimation.
 *
 *   - user: the string body, or the text of its text blocks.
 *   - assistant: text blocks as-is, tool calls as their JSON-serialized
 *     arguments, thinking blocks as their thinking text; any other block
 *     contributes an empty string.
 *   - anything else (tool results): the text of its text blocks.
 *
 * Non-text blocks such as images are not counted.
 */
function stringifyMessageForEstimate(msg: Message): string {
  switch (msg.role) {
    case "user": {
      if (typeof msg.content === "string") return msg.content;
      return textBlocks(msg.content)
        .map((block) => block.text)
        .join("\n");
    }
    case "assistant": {
      const pieces: string[] = [];
      for (const block of msg.content) {
        if (block.type === "text") pieces.push(block.text);
        else if (block.type === "toolCall") pieces.push(JSON.stringify(block.arguments ?? {}));
        else if (block.type === "thinking") pieces.push(block.thinking ?? "");
        else pieces.push("");
      }
      return pieces.join("\n");
    }
    default:
      // toolResult
      return textBlocks(msg.content)
        .map((block) => block.text)
        .join("\n");
  }
}
|
|
110
|
+
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
// Step 1 — strip in-flight consult() call (faithful fork of rpiv-advisor)
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
export const CONSULT_TOOL_NAME = "consult";

/**
 * Drop the in-flight consult() tool call from the last assistant message.
 *
 * That call is the one that invoked the advisor, so it has no tool result yet
 * and would be an orphan tool call in the forwarded payload. Only calls named
 * CONSULT_TOOL_NAME are removed; other trailing tool calls are left in place.
 *
 * Non-mutating. The input array itself is returned when there is nothing to
 * strip (empty list, last message not from the assistant, or no consult call
 * in it). If removing the call empties the last message, that message is
 * dropped entirely.
 */
export function stripInflightConsultCall(messages: Message[]): Message[] {
  if (messages.length === 0) return messages;

  const tail = messages[messages.length - 1];
  if (tail.role !== "assistant") return messages;

  const kept = tail.content.filter((block) => block.type !== "toolCall" || block.name !== CONSULT_TOOL_NAME);
  const removedSomething = kept.length !== tail.content.length;
  if (!removedSomething) return messages;

  const earlier = messages.slice(0, -1);
  return kept.length === 0 ? earlier : [...earlier, { ...tail, content: kept }];
}
|
|
135
|
+
|
|
136
|
+
// ---------------------------------------------------------------------------
|
|
137
|
+
// Step 5 — per-message char caps (fork of pi-advisor clampText)
|
|
138
|
+
// ---------------------------------------------------------------------------
|
|
139
|
+
|
|
140
|
+
/**
 * Clamp text to a character budget, marking truncation explicitly.
 *
 * The text is trimmed first. If it fits within `maxChars` it is returned
 * unchanged; otherwise it is cut to at most `maxChars` characters, trailing
 * whitespace at the cut is removed, and "…\n[truncated for advisor context]"
 * is appended so the advisor can see content was removed. The marker is added
 * on top of the budget, so a truncated result is longer than `maxChars`.
 *
 * @param text     Text to clamp.
 * @param maxChars Maximum number of characters (UTF-16 code units) to keep.
 *                 Negative or non-finite values are treated as 0.
 * @returns The trimmed text, or the truncated text plus the marker.
 */
export function clampText(text: string, maxChars: number): string {
  const normalized = text.trim();
  if (normalized.length <= maxChars) return normalized;

  // slice(0, n) with a negative n counts from the END of the string, which
  // would keep almost everything; clamp the cut position to a sane index.
  let cut = Number.isFinite(maxChars) ? Math.max(0, Math.floor(maxChars)) : 0;

  // Don't cut through a surrogate pair: a lone high surrogate is not valid
  // text and can break serialization downstream.
  if (cut > 0) {
    const lastUnit = normalized.charCodeAt(cut - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
  }

  return `${normalized.slice(0, cut).trimEnd()}…\n[truncated for advisor context]`;
}
|
|
151
|
+
|
|
152
|
+
/**
 * Cap a user message's text at `budget.userChars`. A string body is clamped
 * directly; a block body is reduced to its text blocks, each clamped
 * separately (non-text blocks are not carried over).
 */
function clampUserMessage(msg: UserMessage, budget: ContextBudget): UserMessage {
  if (typeof msg.content !== "string") {
    const blocks: TextContent[] = [];
    for (const block of textBlocks(msg.content)) {
      blocks.push({ type: "text", text: clampText(block.text, budget.userChars) });
    }
    return { ...msg, content: blocks };
  }
  return { ...msg, content: clampText(msg.content, budget.userChars) };
}
|
|
159
|
+
|
|
160
|
+
/**
 * Cap an assistant message block by block.
 *
 *   - text blocks are clamped to `budget.assistantChars`;
 *   - tool calls are kept (the advisor needs to see what was attempted), but
 *     when their JSON-serialized arguments exceed `budget.toolArgChars` the
 *     arguments are replaced with `{ _truncated: <clamped JSON> }`;
 *   - every other block (e.g. thinking) passes through unchanged.
 */
function clampAssistantMessage(msg: AssistantMessage, budget: ContextBudget): AssistantMessage {
  const content = msg.content.map((block) => {
    switch (block.type) {
      case "text":
        return { ...block, text: clampText(block.text, budget.assistantChars) };
      case "toolCall": {
        const serialized = JSON.stringify(block.arguments ?? {});
        const fits = serialized.length <= budget.toolArgChars;
        if (fits) return block;
        return { ...block, arguments: { _truncated: clampText(serialized, budget.toolArgChars) } };
      }
      default:
        return block; // thinking blocks passed through
    }
  });
  return { ...msg, content };
}
|
|
176
|
+
|
|
177
|
+
/** Keep only a tool result's text blocks, each clamped to `budget.toolResultChars`. */
function clampToolResultMessage(msg: ToolResultMessage, budget: ContextBudget): ToolResultMessage {
  const content: TextContent[] = [];
  for (const block of textBlocks(msg.content)) {
    content.push({ type: "text", text: clampText(block.text, budget.toolResultChars) });
  }
  return { ...msg, content };
}
|
|
181
|
+
|
|
182
|
+
/** Return a new list with every message capped according to its role. Non-mutating. */
export function applyCharCaps(messages: Message[], budget: ContextBudget): Message[] {
  return messages.map((msg) => {
    switch (msg.role) {
      case "user":
        return clampUserMessage(msg, budget);
      case "assistant":
        return clampAssistantMessage(msg, budget);
      default:
        return clampToolResultMessage(msg, budget);
    }
  });
}
|
|
190
|
+
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
// Step 6 — sliding window (first-N + last-M, oldest-first drop)
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
/**
 * Trim the transcript until its estimated tokens fit `maxInputTokens`.
 *
 * Strategy:
 *   1. If the whole transcript already fits, return it untouched.
 *   2. Otherwise keep the first `budget.keepFirst` messages (task framing) and
 *      the last `budget.keepLast` messages (freshest evidence), with an
 *      "[N … omitted]" marker between them. While that is still too large,
 *      shrink the tail one message at a time, down to a single tail message.
 *   3. Last resort: keep only the marker plus the final message, re-clamped
 *      to whatever budget is left after the marker.
 *
 * The stopping condition is the token budget, not a fixed message count.
 */
export function fitToWindow(messages: Message[], budget: ContextBudget, maxInputTokens: number): FitResult {
  const count = messages.length;
  if (count === 0) {
    return { messages: [], omittedCount: 0, estimatedTokens: 0, maxInputTokens };
  }

  // Quick path: already fits.
  const fullCost = sumTokens(messages);
  if (fullCost <= maxInputTokens) {
    return { messages, omittedCount: 0, estimatedTokens: fullCost, maxInputTokens };
  }

  const headSize = Math.min(budget.keepFirst, count);
  const head = messages.slice(0, headSize);
  const startingTail = Math.min(budget.keepLast, count - headSize);

  // Try progressively shorter tails; a one-message tail is still tried before
  // giving up on the head.
  for (let tailSize = startingTail; tailSize >= 1; tailSize--) {
    const dropped = count - headSize - tailSize;
    const attempt = [...head, omittedMarker(dropped), ...messages.slice(-tailSize)];
    const cost = sumTokens(attempt);
    if (cost <= maxInputTokens) {
      return { messages: attempt, omittedCount: dropped, estimatedTokens: cost, maxInputTokens };
    }
  }

  // Last resort: marker + final message, with the final message clamped to
  // the budget that remains once the marker is paid for.
  const dropped = count - 1;
  const notice = omittedMarker(dropped);
  const lastMessage = clampSurvivor(messages[count - 1], maxInputTokens - sumTokens([notice]));
  const fallback = [notice, lastMessage];
  return { messages: fallback, omittedCount: dropped, estimatedTokens: sumTokens(fallback), maxInputTokens };
}
|
|
246
|
+
|
|
247
|
+
/** Build the user message that tells the advisor how many messages were dropped. */
function omittedMarker(omittedCount: number): UserMessage {
  const content = `[${omittedCount} earlier transcript messages omitted to fit the advisor context window. Focus on the retained task framing and the most recent evidence.]`;
  return { role: "user", content, timestamp: Date.now() };
}
|
|
254
|
+
|
|
255
|
+
/**
 * Last-resort clamp for a single message that does not fit on its own.
 *
 * The message is flattened to text and clamped to a character limit derived
 * from the remaining token budget (never below 64 characters). Because the
 * clamp appends a truncation marker and the estimate applies a safety factor,
 * the first attempt can still be over budget, so the limit is cut to 70% and
 * the text re-clamped until the estimate fits, the limit reaches 32
 * characters or fewer, or 20 attempts have been made.
 *
 * The result is always a plain-text user message: the original structure is
 * already lost at this point. The original timestamp is reused if present.
 */
function clampSurvivor(msg: Message, remainingTokenBudget: number): Message {
  const flattened = stringifyMessageForEstimate(msg);
  let charLimit = Math.max(64, Math.floor((remainingTokenBudget * CHARS_PER_TOKEN) / SAFETY_FACTOR));
  let body = clampText(flattened, charLimit);

  for (let attempt = 0; attempt < 20; attempt++) {
    const overBudget = estimateTokens(body) > remainingTokenBudget;
    const canShrink = charLimit > 32;
    if (!overBudget || !canShrink) break;
    charLimit = Math.floor(charLimit * 0.7);
    body = clampText(flattened, charLimit);
  }

  const timestamp = "timestamp" in msg ? msg.timestamp : Date.now();
  return { role: "user", content: body, timestamp };
}
|
|
278
|
+
|
|
279
|
+
// ---------------------------------------------------------------------------
|
|
280
|
+
// Step 7 — reserve + derive the input budget (load-bearing)
|
|
281
|
+
// ---------------------------------------------------------------------------
|
|
282
|
+
|
|
283
|
+
/**
 * Derive the input token budget for this advisor call.
 *
 *   maxInputTokens = advisorContextWindow - responseReserveTokens
 *
 * The budget is relative to this advisor's own window. When the window is
 * unknown — undefined, or not a positive finite number (0, negative, NaN) —
 * a conservative 32k window is assumed so the transcript is still re-fitted.
 *
 * The reply reserve is capped at half the window so it can never consume the
 * whole window. A negative reserve is treated as 0 (it must not enlarge the
 * budget past the window); a non-finite reserve falls back to the cap.
 *
 * @param advisorContextWindow The advisor model's context window, if known.
 * @param budget               Supplies `responseReserveTokens`.
 * @returns The input budget in tokens, never below 1024.
 */
export function deriveInputBudget(advisorContextWindow: number | undefined, budget: ContextBudget): number {
  let window = 32_000;
  if (
    typeof advisorContextWindow === "number" &&
    Number.isFinite(advisorContextWindow) &&
    advisorContextWindow > 0
  ) {
    window = advisorContextWindow;
  }

  // Cap (not floor) the reserve: at most half of the window goes to the reply.
  const reserveCap = Math.floor(window * 0.5);
  const requested = budget.responseReserveTokens;
  const reserve = Number.isFinite(requested) ? Math.min(Math.max(0, requested), reserveCap) : reserveCap;

  return Math.max(1024, window - reserve);
}
|
|
299
|
+
|
|
300
|
+
// ---------------------------------------------------------------------------
|
|
301
|
+
// Step 8 — assemble: the full pipeline
|
|
302
|
+
// ---------------------------------------------------------------------------
|
|
303
|
+
|
|
304
|
+
/** Arguments for buildConsultContext. */
export interface BuildContextInput {
  /** Pi's resolved (already-compacted) session messages for the active branch. */
  sessionMessages: Message[];
  /** Context window of the advisor model; leave undefined when unknown. */
  advisorContextWindow?: number;
  /** Per-message caps, window sizes and reply reserve to apply. */
  budget: ContextBudget;
  /** Optional closing directive, appended as the final user message. */
  directive?: string;
}
|
|
313
|
+
|
|
314
|
+
/**
 * Run the full re-fit pipeline: strip → cap → window → append directive.
 *
 *   - Stripping first keeps the in-flight consult() call out of the budget.
 *   - Capping before windowing means messages are already small when tokens
 *     are counted for the drop decision.
 *   - Room for the directive (its estimate plus 8 tokens) is subtracted from
 *     the window budget before fitting; the directive is then appended as the
 *     last user message.
 *
 * The returned `maxInputTokens` is the budget before the directive
 * adjustment, and `estimatedTokens` covers the final list including the
 * directive.
 *
 * NOTE(review): the window budget is floored at 1024 tokens, so with a very
 * large directive (or a tiny window) the final payload could exceed
 * `maxInputTokens` — confirm this floor is intended.
 */
export function buildConsultContext(input: BuildContextInput): FitResult {
  const { sessionMessages, advisorContextWindow, budget, directive } = input;

  const withoutInflightCall = stripInflightConsultCall(sessionMessages);
  const capped = applyCharCaps(withoutInflightCall, budget);
  const maxInputTokens = deriveInputBudget(advisorContextWindow, budget);

  // Set aside room for the closing directive before windowing.
  const directiveCost = directive ? estimateTokens(directive) + 8 : 0;
  const windowBudget = Math.max(1024, maxInputTokens - directiveCost);

  const fit = fitToWindow(capped, budget, windowBudget);

  // The directive goes last, after the freshest evidence.
  const messages: Message[] = [...fit.messages];
  if (directive) {
    const closing: UserMessage = { role: "user", content: directive, timestamp: Date.now() };
    messages.push(closing);
  }

  return {
    messages: directive ? messages : fit.messages,
    omittedCount: fit.omittedCount,
    estimatedTokens: sumTokens(messages),
    maxInputTokens,
  };
}
|
|
350
|
+
|
|
351
|
+
// ---------------------------------------------------------------------------
|
|
352
|
+
// Helpers
|
|
353
|
+
// ---------------------------------------------------------------------------
|
|
354
|
+
|
|
355
|
+
/**
 * Pick the text blocks out of a message body. Returns an empty list when the
 * body is not an array; otherwise the entries whose `type` is "text" and whose
 * `text` is a string, in order.
 */
function textBlocks(content: TextContent[] | unknown): TextContent[] {
  const found: TextContent[] = [];
  if (Array.isArray(content)) {
    for (const candidate of content) {
      if (candidate?.type === "text" && typeof candidate.text === "string") found.push(candidate);
    }
  }
  return found;
}
|
|
359
|
+
|
|
360
|
+
/** Total estimated tokens across a list of messages. */
function sumTokens(messages: Message[]): number {
  return messages.reduce((running, message) => running + estimateMessageTokens(message), 0);
}
|
|
365
|
+
|
|
366
|
+
/** Build a plain-text user message stamped with the current time (for tests and callers). */
export function userText(text: string): UserMessage {
  const message: UserMessage = { role: "user", content: text, timestamp: Date.now() };
  return message;
}
|
|
370
|
+
|
|
371
|
+
/**
 * Build an assistant message holding a single text block, for tests. Provider
 * metadata is filled with fixed placeholder values and usage/cost are zero.
 */
export function assistantText(text: string): AssistantMessage {
  const zeroCost = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
  const zeroUsage = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, totalTokens: 0, cost: zeroCost };
  return {
    role: "assistant",
    content: [{ type: "text", text }],
    api: "anthropic-messages" as never,
    provider: "anthropic" as never,
    model: "test-model",
    usage: zeroUsage,
    stopReason: "stop",
    timestamp: Date.now(),
  };
}
|
|
384
|
+
|
|
385
|
+
/**
 * Build a successful tool-result message holding a single text block, for
 * tests. The call id ("test-call") and tool name ("bash") are fixed.
 */
export function toolResultText(text: string): ToolResultMessage {
  const body: TextContent[] = [{ type: "text", text }];
  return {
    role: "toolResult",
    toolCallId: "test-call",
    toolName: "bash",
    content: body,
    isError: false,
    timestamp: Date.now(),
  };
}
|