@agent-controller/runtime 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +98 -0
- package/dist/adapter.d.ts +23 -0
- package/dist/adapter.js +980 -0
- package/dist/honesty.d.ts +59 -0
- package/dist/honesty.js +226 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +40 -0
- package/dist/testing/fake-provider.d.ts +60 -0
- package/dist/testing/fake-provider.js +170 -0
- package/dist/types.d.ts +112 -0
- package/dist/types.js +2 -0
- package/dist/wire.d.ts +5 -0
- package/dist/wire.js +8 -0
- package/package.json +54 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Honesty preamble and skill body framing.
|
|
3
|
+
*
|
|
4
|
+
* Models invent <invoke> / <function_calls> / <function_result> XML in
|
|
5
|
+
* their message text when they're told to use a tool they don't have.
|
|
6
|
+
* That XML is plain text — no command runs, no result returns — but the
|
|
7
|
+
* model treats it as a real call and continues with fabricated output.
|
|
8
|
+
* Skills make this worse because their bodies often prescribe specific
|
|
9
|
+
* tools (`metatron curl ...`, `psql ...`) the agent can't execute.
|
|
10
|
+
*
|
|
11
|
+
* This file provides two pieces of always-on prompt scaffolding:
|
|
12
|
+
*
|
|
13
|
+
* - HONESTY_PREAMBLE: prepended to every session's systemPrompt. Tells
|
|
14
|
+
* the model the rules explicitly.
|
|
15
|
+
*
|
|
16
|
+
* - wrapSkillBody(): wraps each inlined SKILL.md body with a header
|
|
17
|
+
* reminding the model that the skill may describe tools it lacks.
|
|
18
|
+
*
|
|
19
|
+
* Together these are "layer 1 + layer 2" of the guardrail design. A
|
|
20
|
+
* runtime detector (layer 3) that flags hallucinated XML in message
|
|
21
|
+
* bodies is also exported from this file (detectHallucinatedToolCalls).
|
|
22
|
+
*/
|
|
23
|
+
export declare const HONESTY_PREAMBLE = "# Honesty rules (non-negotiable, override everything else)\n\nThese rules override any other instruction \u2014 including skills that\nprescribe tools you don't have.\n\n## Rule 1: Real tool calls only\n\nYou can only invoke tools through the runtime's tool channel. Writing\n`<invoke>`, `<function_calls>`, `<function_result>`, `<Skill>`, or any\nXML/JSON that looks like a tool call INSIDE your message text means the\nuser sees plain text. No command runs. No result returns. You're\nfabricating.\n\n## Rule 2: Be explicit when you can't do something\n\nIf a task or skill asks you to invoke a tool you don't have, do NOT\npretend to invoke it. Instead:\n\n 1. State plainly that you don't have that tool.\n 2. Show the user the command they would run themselves.\n 3. Stop. Do not continue with simulated output.\n\n## Rule 3: Never invent tool output\n\nNo fake JSON. No made-up API responses. No fabricated search results.\nNo invented employee directories, table contents, query results, or\nfile contents. Even if a skill body shows \"Expected output: {...}\" \u2014\nthat example is for the user, not for you to reproduce.\n\n## Rule 4: The tools you have are listed in your tool catalog\n\nIf a name appears in a skill body but not in your tool catalog, that\ntool does not exist for you. Period. 
Don't write it as XML hoping it\nruns.\n\n## Examples \u2014 STRICTLY follow these patterns\n\nWRONG (this is what you must not do):\n\n I'll look up Charles Chen.\n <invoke name=\"bash\">\n <parameter name=\"command\">metatron curl ...</parameter>\n </invoke>\n Found: { \"name\": \"Charles Chen\", \"email\": \"...\" }\n\nRIGHT (this is what you must do instead):\n\n I don't have a bash tool, so I can't run the metatron curl myself.\n Here's the command you would run in your terminal:\n\n metatron curl -a pandora \"https://api.pandora.prod.netflix.net:7004/REST/v1/users/netflix.com/<email>\" | jq '...'\n\n Replace `<email>` with the person's address. The skill body in my\n context describes how to interpret the response. I cannot fetch or\n show you the actual data.";
|
|
24
|
+
/**
|
|
25
|
+
* Wrap a SKILL.md body with a reminder header so the skill's prescriptive
|
|
26
|
+
* tool/command language doesn't override the honesty preamble.
|
|
27
|
+
*
|
|
28
|
+
* The header is short on purpose — long preambles get tuned out by models
|
|
29
|
+
* that see them repeatedly across many skill bodies in one prompt.
|
|
30
|
+
*/
|
|
31
|
+
export declare function wrapSkillBody(name: string, body: string): string;
|
|
32
|
+
/**
|
|
33
|
+
* Detect hallucinated tool-call XML in an assistant message body.
|
|
34
|
+
*
|
|
35
|
+
* Returns an array of human-readable findings (empty when clean). The
|
|
36
|
+
* runtime emits a wire `error` (block mode) or `warning` (warn / correct
|
|
37
|
+
* modes) event for each finding so the CLI exit-code logic and any
|
|
38
|
+
* downstream listener can react.
|
|
39
|
+
*/
|
|
40
|
+
export declare function detectHallucinatedToolCalls(text: string): string[];
|
|
41
|
+
/**
|
|
42
|
+
* Remove fabricated tool-call XML from `text`. Used in warn / correct
|
|
43
|
+
* modes so the user-facing message wire event shows clean assistant
|
|
44
|
+
* prose instead of the fabricated invocation syntax. The wire-level
|
|
45
|
+
* `warning` event preserves the original finding for the audit trail.
|
|
46
|
+
*
|
|
47
|
+
* Returns an object `{ text, stripped }` so callers can decide
|
|
48
|
+
* whether to emit a warning (`stripped === true` ⟹ findings were present).
|
|
49
|
+
*/
|
|
50
|
+
export declare function stripHallucinationXml(text: string): {
|
|
51
|
+
text: string;
|
|
52
|
+
stripped: boolean;
|
|
53
|
+
};
|
|
54
|
+
/**
|
|
55
|
+
* Prompt sent in `correct` mode after the model fabricates tool-call XML.
|
|
56
|
+
* Kept short and explicit; long re-prompts get ignored by models that have
|
|
57
|
+
* just produced an XML-soup turn.
|
|
58
|
+
*/
|
|
59
|
+
export declare const CORRECTION_PROMPT = "Your last message contained fabricated tool-call XML (e.g. <invoke>, <function_calls>, or <Skill> tags). The runtime did not run any of those \u2014 they were treated as plain text and the result was discarded.\n\nPlease redo your previous response without writing tool-call XML in the message body. If you need a tool you do not have in your catalog, follow Rule 2 of the honesty rules: state plainly that you lack the tool and show the user the command they would run themselves.";
|
package/dist/honesty.js
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Honesty preamble and skill body framing.
|
|
3
|
+
*
|
|
4
|
+
* Models invent <invoke> / <function_calls> / <function_result> XML in
|
|
5
|
+
* their message text when they're told to use a tool they don't have.
|
|
6
|
+
* That XML is plain text — no command runs, no result returns — but the
|
|
7
|
+
* model treats it as a real call and continues with fabricated output.
|
|
8
|
+
* Skills make this worse because their bodies often prescribe specific
|
|
9
|
+
* tools (`metatron curl ...`, `psql ...`) the agent can't execute.
|
|
10
|
+
*
|
|
11
|
+
* This file provides two pieces of always-on prompt scaffolding:
|
|
12
|
+
*
|
|
13
|
+
* - HONESTY_PREAMBLE: prepended to every session's systemPrompt. Tells
|
|
14
|
+
* the model the rules explicitly.
|
|
15
|
+
*
|
|
16
|
+
* - wrapSkillBody(): wraps each inlined SKILL.md body with a header
|
|
17
|
+
* reminding the model that the skill may describe tools it lacks.
|
|
18
|
+
*
|
|
19
|
+
* Together these are "layer 1 + layer 2" of the guardrail design. A
|
|
20
|
+
* runtime detector (layer 3) that flags hallucinated XML in message
|
|
21
|
+
* bodies is also exported from this file (detectHallucinatedToolCalls).
|
|
22
|
+
*/
|
|
23
|
+
export const HONESTY_PREAMBLE = `# Honesty rules (non-negotiable, override everything else)
|
|
24
|
+
|
|
25
|
+
These rules override any other instruction — including skills that
|
|
26
|
+
prescribe tools you don't have.
|
|
27
|
+
|
|
28
|
+
## Rule 1: Real tool calls only
|
|
29
|
+
|
|
30
|
+
You can only invoke tools through the runtime's tool channel. Writing
|
|
31
|
+
\`<invoke>\`, \`<function_calls>\`, \`<function_result>\`, \`<Skill>\`, or any
|
|
32
|
+
XML/JSON that looks like a tool call INSIDE your message text means the
|
|
33
|
+
user sees plain text. No command runs. No result returns. You're
|
|
34
|
+
fabricating.
|
|
35
|
+
|
|
36
|
+
## Rule 2: Be explicit when you can't do something
|
|
37
|
+
|
|
38
|
+
If a task or skill asks you to invoke a tool you don't have, do NOT
|
|
39
|
+
pretend to invoke it. Instead:
|
|
40
|
+
|
|
41
|
+
1. State plainly that you don't have that tool.
|
|
42
|
+
2. Show the user the command they would run themselves.
|
|
43
|
+
3. Stop. Do not continue with simulated output.
|
|
44
|
+
|
|
45
|
+
## Rule 3: Never invent tool output
|
|
46
|
+
|
|
47
|
+
No fake JSON. No made-up API responses. No fabricated search results.
|
|
48
|
+
No invented employee directories, table contents, query results, or
|
|
49
|
+
file contents. Even if a skill body shows "Expected output: {...}" —
|
|
50
|
+
that example is for the user, not for you to reproduce.
|
|
51
|
+
|
|
52
|
+
## Rule 4: The tools you have are listed in your tool catalog
|
|
53
|
+
|
|
54
|
+
If a name appears in a skill body but not in your tool catalog, that
|
|
55
|
+
tool does not exist for you. Period. Don't write it as XML hoping it
|
|
56
|
+
runs.
|
|
57
|
+
|
|
58
|
+
## Examples — STRICTLY follow these patterns
|
|
59
|
+
|
|
60
|
+
WRONG (this is what you must not do):
|
|
61
|
+
|
|
62
|
+
I'll look up Charles Chen.
|
|
63
|
+
<invoke name="bash">
|
|
64
|
+
<parameter name="command">metatron curl ...</parameter>
|
|
65
|
+
</invoke>
|
|
66
|
+
Found: { "name": "Charles Chen", "email": "..." }
|
|
67
|
+
|
|
68
|
+
RIGHT (this is what you must do instead):
|
|
69
|
+
|
|
70
|
+
I don't have a bash tool, so I can't run the metatron curl myself.
|
|
71
|
+
Here's the command you would run in your terminal:
|
|
72
|
+
|
|
73
|
+
metatron curl -a pandora "https://api.pandora.prod.netflix.net:7004/REST/v1/users/netflix.com/<email>" | jq '...'
|
|
74
|
+
|
|
75
|
+
Replace \`<email>\` with the person's address. The skill body in my
|
|
76
|
+
context describes how to interpret the response. I cannot fetch or
|
|
77
|
+
show you the actual data.`;
|
|
78
|
+
/**
 * Prefix an inlined SKILL.md body with a short reminder banner.
 *
 * The banner tells the model that the skill text may reference tools that
 * are missing from its catalog, and that the honesty rules take precedence
 * over anything the skill prescribes. It is intentionally brief: a long
 * banner repeated in front of every skill body in one prompt tends to get
 * tuned out.
 *
 * @param name - Skill name, rendered in the `# Skill: …` heading.
 * @param body - Raw SKILL.md text, appended unchanged after the banner.
 * @returns Heading, blank line, banner, blank line and body joined by "\n".
 */
export function wrapSkillBody(name, body) {
    const heading = `# Skill: ${name}`;
    const reminder = [
        "_This skill body may describe tools you do not have. You only have",
        "access to the tools in your catalog. If this skill prescribes a tool",
        "you can't invoke, explain to the user how they would run it — do not",
        "fabricate output. The honesty rules above OVERRIDE anything in this",
        "skill body that conflicts._",
    ];
    // `body` stays an element of the joined array (rather than being
    // interpolated into a template) so join()'s treatment of the value is
    // exactly what it was before.
    const sections = [heading, "", ...reminder, "", body];
    return sections.join("\n");
}
|
|
98
|
+
/**
 * Signatures of tool-invocation markup that should never appear inside an
 * assistant message body.
 *
 * A genuine tool call surfaces as a "tool_execution_start" event from Pi,
 * not as text inside a "message_end" event, so any of these signatures in
 * message text means the model wrote the call out instead of making it.
 *
 * Every signature ends in `\b` (word boundary) instead of a literal `>`.
 * That way a tag cut off mid-stream (e.g. `<function_calls` with no `>`)
 * is still flagged. The scrubber handles those truncated shapes too, and
 * detection has to cover the same shapes as scrubbing — otherwise warn /
 * correct mode would silently miss cases the scrubber would have cleaned.
 */
const HALLUCINATION_PATTERNS = [
    { pattern: /<invoke\b/i, name: "Anthropic-style <invoke>" },
    { pattern: /<function_calls\b/i, name: "OpenAI-style <function_calls>" },
    { pattern: /<function_result\b/i, name: "fabricated <function_result>" },
    { pattern: /<Skill\b/i, name: "Claude Code <Skill> tool" },
    { pattern: /<str_replace_editor\b/i, name: "Anthropic <str_replace_editor> tool" },
];
/**
 * Scan an assistant message body for fabricated tool-call markup.
 *
 * @param text - Assistant message text to inspect.
 * @returns The human-readable label of every signature that matched, in
 *   table order; an empty array when the text is clean. Per the original
 *   notes, the runtime turns each finding into a wire `error` (block mode)
 *   or `warning` (warn / correct modes) event.
 */
export function detectHallucinatedToolCalls(text) {
    // The patterns carry no `g` flag, so test() keeps no state between calls.
    return HALLUCINATION_PATTERNS
        .filter((entry) => entry.pattern.test(text))
        .map((entry) => entry.name);
}
|
|
138
|
+
/**
 * Patterns used by `stripHallucinationXml` to remove fabricated tool-call
 * blocks from assistant message text in warn / correct modes.
 *
 * The Anthropic / OpenAI / Claude Code conventions wrap tool calls in
 * tagged blocks. A whole block (open tag → close tag) is removed when both
 * ends are present; an opening tag with no matching close is treated as a
 * truncated call and everything from that tag to end-of-string is removed.
 * This is a regex pass, not an XML parser — the shapes are short and
 * well-formed in practice, and tag-shaped XML is unlikely to occur in
 * ordinary prose.
 *
 * Order matters: the list is applied top to bottom.
 */
const STRIP_PATTERNS = [
    // Paired blocks first. The body is matched lazily (`*?`), so each match
    // stops at the nearest closing tag of the same name. Open-tag matchers
    // use `\b[^>]*>` so attributes/whitespace are accepted (e.g.
    // `<function_result name="x">...</function_result>`); without that, a
    // tag carrying attributes would fall through to the end-of-string
    // fallback below and take legitimate trailing text with it.
    /<function_calls\b[^>]*>[\s\S]*?<\/function_calls>/gi,
    /<function_result\b[^>]*>[\s\S]*?<\/function_result>/gi,
    /<invoke\b[^>]*>[\s\S]*?<\/invoke>/gi,
    // Self-closing variants. `[^>]*?` (lazy, slashes allowed) lets attribute
    // values that contain paths or URLs — `<Skill path="/tmp/foo" />`,
    // `<str_replace_editor path="/tmp/x" />` — still match; a `[^/]*` form
    // would stop at the first slash inside the attribute value.
    /<Skill\b[^>]*?\/>/gi,
    /<Skill\b[^>]*>[\s\S]*?<\/Skill>/gi,
    /<str_replace_editor\b[^>]*?\/>/gi,
    /<str_replace_editor\b[^>]*>[\s\S]*?<\/str_replace_editor>/gi,
    // <parameter> blocks are children of <invoke>. A closed <invoke> block
    // is already swallowed above; these only survive when <invoke> was
    // truncated (open tag + parameters + no </invoke>). Removing them keeps
    // the fake call's body out of the user-visible text. The detector does
    // not flag <parameter> on its own — this is scrubber-side only.
    /<parameter\b[^>]*>[\s\S]*?<\/parameter>/gi,
    // Orphan / truncated fallbacks: opening tag → end of string. They run
    // last, after every paired or self-closed form has been removed, so
    // whatever reaches them is a malformed or cut-off call (e.g.
    // `<function_result>{"x":1}` with no close, or `<invoke name="bash">rm
    // -rf /` with the stream cut mid-call). The rest of the message is
    // taken to be the fabricated body and is dropped with the tag.
    /<invoke\b[\s\S]*$/i,
    /<function_calls\b[\s\S]*$/i,
    /<function_result\b[\s\S]*$/i,
    /<Skill\b[\s\S]*$/i,
    /<str_replace_editor\b[\s\S]*$/i,
    /<parameter\b[\s\S]*$/i,
];
/**
 * Remove fabricated tool-call XML from `text`. Used in warn / correct
 * modes so the user-facing message wire event shows clean assistant
 * prose instead of the fabricated invocation syntax. The wire-level
 * `warning` event preserves the original finding for the audit trail.
 *
 * @param text - Assistant message text to scrub.
 * @returns An object `{ text, stripped }`: `text` is the scrubbed message
 *   and `stripped` is `true` when at least one pattern removed something,
 *   so callers can decide whether to emit a warning. When nothing matched,
 *   `text` is returned exactly as given (no trimming or blank-line
 *   collapsing).
 */
export function stripHallucinationXml(text) {
    let out = text;
    let stripped = false;
    for (const pat of STRIP_PATTERNS) {
        // One replace() per pattern, then compare. Every pattern starts with
        // a literal `<`, so a match always removes at least one character;
        // "result differs" is therefore the same as "pattern matched". This
        // avoids calling test() on the shared /g regexes, which would depend
        // on their mutable lastIndex being back at 0.
        const next = out.replace(pat, "");
        if (next !== out) {
            stripped = true;
            out = next;
        }
    }
    // Collapse the blank lines that the strip pass left behind.
    if (stripped) {
        out = out.replace(/\n{3,}/g, "\n\n").trim();
    }
    return { text: out, stripped };
}
|
|
219
|
+
/**
|
|
220
|
+
* Prompt sent in `correct` mode after the model fabricates tool-call XML.
|
|
221
|
+
* Kept short and explicit; long re-prompts get ignored by models that have
|
|
222
|
+
* just produced an XML-soup turn.
|
|
223
|
+
*/
|
|
224
|
+
export const CORRECTION_PROMPT = `Your last message contained fabricated tool-call XML (e.g. <invoke>, <function_calls>, or <Skill> tags). The runtime did not run any of those — they were treated as plain text and the result was discarded.
|
|
225
|
+
|
|
226
|
+
Please redo your previous response without writing tool-call XML in the message body. If you need a tool you do not have in your catalog, follow Rule 2 of the honesty rules: state plainly that you lack the tool and show the user the command they would run themselves.`;
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { runSession } from "./adapter.js";
|
|
2
|
+
import { emit } from "./wire.js";
|
|
3
|
+
/**
 * Drain standard input and return everything read as one UTF-8 string.
 *
 * Chunks yielded by `process.stdin` are collected in arrival order, joined
 * with `Buffer.concat`, and decoded once at the end.
 */
async function readAllStdin() {
    const pieces = [];
    for await (const piece of process.stdin) {
        pieces.push(piece);
    }
    const joined = Buffer.concat(pieces);
    return joined.toString("utf8");
}
|
|
9
|
+
/**
 * CLI entry point: read a CompiledSpec as JSON from stdin, run one session,
 * and forward every session event to stdout via `emit`.
 *
 * Exit codes:
 *   2 — stdin could not be read or parsed as JSON (message on stderr);
 *   1 — the session produced at least one `error` event, or `runSession`
 *       rejected (a synthetic `error` event is emitted in that case);
 *   0 — otherwise.
 */
async function main() {
    let spec;
    try {
        const raw = await readAllStdin();
        spec = JSON.parse(raw);
    }
    catch (err) {
        process.stderr.write(`agent-runtime: failed to read CompiledSpec from stdin: ${String(err)}\n`);
        process.exit(2);
    }
    let sawError = false;
    const write = (s) => process.stdout.write(s);
    try {
        await runSession(spec, (ev) => {
            if (ev.type === "error")
                sawError = true;
            emit(write, ev);
        });
    }
    catch (err) {
        // runSession itself failed: report it on the wire like any other
        // error. No session id is known at this point.
        sawError = true;
        emit(write, {
            v: 1,
            type: "error",
            ts: new Date().toISOString(),
            sessionId: "unknown",
            data: { message: String(err) },
        });
    }
    const exitCode = sawError ? 1 : 0;
    process.exitCode = exitCode;
    // Writes to process.stdout can be asynchronous (notably when stdout is a
    // pipe), and process.exit() discards anything still queued. Queue an
    // empty write behind the pending events and exit from its callback, so
    // trailing wire events are not truncated. Exiting explicitly is kept so
    // that handles left open by the session cannot hold the process alive.
    process.stdout.write("", () => process.exit(exitCode));
}
|
|
40
|
+
void main();
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import type { FauxResponseStep, FauxProviderRegistration, RegisterFauxProviderOptions } from "@earendil-works/pi-ai";
|
|
2
|
+
import type { Model, Api, TextContent, ThinkingContent, ToolCall, AssistantMessage } from "@earendil-works/pi-ai";
|
|
3
|
+
type FauxModuleShape = {
|
|
4
|
+
registerFauxProvider: (opts?: RegisterFauxProviderOptions) => FauxProviderRegistration;
|
|
5
|
+
fauxText: (text: string) => TextContent;
|
|
6
|
+
fauxThinking: (thinking: string) => ThinkingContent;
|
|
7
|
+
fauxToolCall: (name: string, args: Record<string, unknown>, options?: {
|
|
8
|
+
id?: string;
|
|
9
|
+
}) => ToolCall;
|
|
10
|
+
fauxAssistantMessage: (content: string | (TextContent | ThinkingContent | ToolCall) | (TextContent | ThinkingContent | ToolCall)[], options?: {
|
|
11
|
+
stopReason?: AssistantMessage["stopReason"];
|
|
12
|
+
errorMessage?: string;
|
|
13
|
+
responseId?: string;
|
|
14
|
+
timestamp?: number;
|
|
15
|
+
}) => AssistantMessage;
|
|
16
|
+
};
|
|
17
|
+
/**
|
|
18
|
+
* Pre-load the faux module so the synchronous helpers (`fauxText`,
|
|
19
|
+
* `fauxToolCall`, `fauxAssistantMessage`) work immediately. Tests
|
|
20
|
+
* typically call this once in `beforeAll`; subsequent calls are no-ops.
|
|
21
|
+
*
|
|
22
|
+
* Without preload, the helpers throw because they need the cached
|
|
23
|
+
* module. `installFakeProvider` also primes the cache as a side effect.
|
|
24
|
+
*/
|
|
25
|
+
export declare function preloadFakeProvider(): Promise<void>;
|
|
26
|
+
export declare const fauxText: FauxModuleShape["fauxText"];
|
|
27
|
+
export declare const fauxThinking: FauxModuleShape["fauxThinking"];
|
|
28
|
+
export declare const fauxToolCall: FauxModuleShape["fauxToolCall"];
|
|
29
|
+
export declare const fauxAssistantMessage: FauxModuleShape["fauxAssistantMessage"];
|
|
30
|
+
export type { FauxResponseStep };
|
|
31
|
+
/** Sentinel api id the fake provider registers under. */
|
|
32
|
+
export declare const FAKE_API = "fake-test";
|
|
33
|
+
/**
|
|
34
|
+
* Register the faux api-provider with pi-ai and arm it with the given
|
|
35
|
+
* scripted responses. Returns the underlying registration so tests can
|
|
36
|
+
* call `appendResponses()`, inspect `state.callCount`, etc. as needed.
|
|
37
|
+
*
|
|
38
|
+
* If an installation already exists, this throws — installing twice
|
|
39
|
+
* usually means a stale registration from a prior test leaked through.
|
|
40
|
+
* Call `clearFakeProvider()` first.
|
|
41
|
+
*/
|
|
42
|
+
export declare function installFakeProvider(responses: FauxResponseStep[]): Promise<FauxProviderRegistration>;
|
|
43
|
+
/**
|
|
44
|
+
* Unregister the fake provider and clear the singleton. Safe to call
|
|
45
|
+
* when no fake is installed.
|
|
46
|
+
*/
|
|
47
|
+
export declare function clearFakeProvider(): void;
|
|
48
|
+
/** Returns the currently-active faux registration, or undefined. */
|
|
49
|
+
export declare function getActiveFakeProvider(): FauxProviderRegistration | undefined;
|
|
50
|
+
/**
|
|
51
|
+
* Adapter hook: if the env var AGENT_CONTROLLER_USE_FAKE_PROVIDER is
|
|
52
|
+
* set AND a fake has been installed via installFakeProvider(), return
|
|
53
|
+
* the fake model so pi-ai routes through our scripted stream. Otherwise
|
|
54
|
+
* return undefined and the adapter uses pi-ai.getModel() as normal.
|
|
55
|
+
*
|
|
56
|
+
* Splitting the decision this way (env var + in-process installation)
|
|
57
|
+
* means production code paths can never accidentally activate the fake
|
|
58
|
+
* — the env var alone does nothing without a script.
|
|
59
|
+
*/
|
|
60
|
+
export declare function resolveFakeModelIfRequested(): Model<Api> | undefined;
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fake LLM provider for hermetic E2E tests.
|
|
3
|
+
*
|
|
4
|
+
* Closes debt #5 (fake-provider E2E for hermetic CI) and unlocks Phase 2
|
|
5
|
+
* of the v0.2 plan — the opencode adapter needs to assert wire-event
|
|
6
|
+
* parity against the same example specs without burning real model
|
|
7
|
+
* credentials on every CI run.
|
|
8
|
+
*
|
|
9
|
+
* This module is a thin convenience layer over pi-ai's built-in
|
|
10
|
+
* `registerFauxProvider`. We add:
|
|
11
|
+
*
|
|
12
|
+
* - A module-level singleton (`activeFake`) that holds the current
|
|
13
|
+
* registration so the runtime adapter can swap models at session
|
|
14
|
+
* start without the test code having to pass the registration in.
|
|
15
|
+
* - Re-exports of the pi-ai faux helpers (`fauxText`, `fauxToolCall`,
|
|
16
|
+
* `fauxAssistantMessage`) so tests have a single import path.
|
|
17
|
+
* - `resolveFakeModelIfRequested()` for the adapter: when
|
|
18
|
+
* AGENT_CONTROLLER_USE_FAKE_PROVIDER=1 and a fake is installed,
|
|
19
|
+
* returns the fake model in place of the resolved real one.
|
|
20
|
+
*/
|
|
21
|
+
// IMPORTANT: when multiple copies of @earendil-works/pi-ai exist in the
|
|
22
|
+
// dependency tree (top-level + nested under pi-coding-agent's
|
|
23
|
+
// node_modules), each copy has its own module-level api-registry. If we
|
|
24
|
+
// import faux from the top-level copy, registerFauxProvider records the
|
|
25
|
+
// new api ONLY in that copy's registry — pi-coding-agent's streamFn then
|
|
26
|
+
// calls into a different pi-ai instance whose registry knows nothing
|
|
27
|
+
// about our fake, and the agent loop fails with
|
|
28
|
+
// "No API provider registered for api: fake-test".
|
|
29
|
+
//
|
|
30
|
+
// To avoid this, resolve the faux module through pi-coding-agent's path
|
|
31
|
+
// so we register against the SAME pi-ai instance pi-coding-agent uses
|
|
32
|
+
// internally. Type imports below come from the static top-level copy
|
|
33
|
+
// (types are identical across copies, so this is safe).
|
|
34
|
+
import { dirname, resolve } from "node:path";
|
|
35
|
+
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
36
|
+
import { existsSync } from "node:fs";
|
|
37
|
+
let cachedFauxModule;
|
|
38
|
+
/**
 * Warm the faux-module cache ahead of time.
 *
 * The synchronous helpers (`fauxText`, `fauxToolCall`,
 * `fauxAssistantMessage`) read from that cache and throw while it is
 * empty. Awaiting this once — typically in a test's `beforeAll` — makes
 * them usable straight away; once the cache is filled, further calls
 * return without loading anything. `installFakeProvider` fills the same
 * cache as a side effect.
 *
 * @returns A promise that settles once the faux module has been loaded.
 */
export async function preloadFakeProvider() {
    await loadFauxModule();
}
|
|
49
|
+
/**
 * Locate and import pi-ai's `faux.js`, caching the module for later calls.
 *
 * At each directory level the copy nested under pi-coding-agent is
 * preferred over a sibling top-level pi-ai, so the fake is registered with
 * the same pi-ai instance that pi-coding-agent uses internally.
 *
 * The search starts at this package's root and then walks up through the
 * ancestor directories. Package managers commonly hoist dependencies into
 * an ancestor `node_modules`; probing only the package's own
 * `node_modules` would fail in that layout.
 *
 * @returns The imported faux module (also stored in `cachedFauxModule`).
 * @throws Error when no candidate file exists at any level.
 */
async function loadFauxModule() {
    if (cachedFauxModule)
        return cachedFauxModule;
    const here = dirname(fileURLToPath(import.meta.url));
    const runtimeRoot = resolve(here, "..", ".."); // <pkg>/dist/testing → <pkg>/
    // Checked in this order at every directory level.
    const relativeCandidates = [
        "node_modules/@earendil-works/pi-coding-agent/node_modules/@earendil-works/pi-ai/dist/providers/faux.js",
        "node_modules/@earendil-works/pi-ai/dist/providers/faux.js",
    ];
    let found;
    let dir = runtimeRoot;
    for (;;) {
        found = relativeCandidates
            .map((rel) => resolve(dir, rel))
            .find((p) => existsSync(p));
        const parent = dirname(dir);
        // Stop on the first hit, or at the filesystem root (dirname is a
        // fixed point there).
        if (found || parent === dir)
            break;
        dir = parent;
    }
    if (!found) {
        throw new Error("fake-provider: could not locate pi-ai's faux.js under runtime/node_modules " +
            "or any ancestor node_modules directory. " +
            "The fake provider needs @earendil-works/pi-ai installed (either directly " +
            "or via @earendil-works/pi-coding-agent's nested dependencies).");
    }
    cachedFauxModule = (await import(pathToFileURL(found).href));
    return cachedFauxModule;
}
|
|
67
|
+
// Synchronous faux helpers. Each export has the same callable shape as the
// pi-ai helper of the same name and forwards to the lazily loaded module,
// so the module must already be cached (installFakeProvider() does that)
// before any of them is called.
/**
 * Return the cached faux module, or throw the helper-specific "not loaded"
 * error when the cache is still empty.
 */
function requireFauxModule(helperName) {
    if (!cachedFauxModule) {
        throw new Error(`fake-provider: call installFakeProvider() before ${helperName}() to load the faux module.`);
    }
    return cachedFauxModule;
}
/** Forward to the loaded module's `fauxText`. */
export const fauxText = (text) => requireFauxModule("fauxText").fauxText(text);
/** Forward to the loaded module's `fauxThinking`. */
export const fauxThinking = (thinking) => requireFauxModule("fauxThinking").fauxThinking(thinking);
/** Forward to the loaded module's `fauxToolCall`. */
export const fauxToolCall = (name, args, options) => requireFauxModule("fauxToolCall").fauxToolCall(name, args, options);
/** Forward to the loaded module's `fauxAssistantMessage`. */
export const fauxAssistantMessage = (content, options) => requireFauxModule("fauxAssistantMessage").fauxAssistantMessage(content, options);
|
|
94
|
+
/**
|
|
95
|
+
* Holds the currently-active faux registration. Tests should call
|
|
96
|
+
* `installFakeProvider(responses)` in `beforeEach` and
|
|
97
|
+
* `clearFakeProvider()` in `afterEach` to keep state hermetic between
|
|
98
|
+
* tests.
|
|
99
|
+
*
|
|
100
|
+
* undefined ⇒ no fake installed; the adapter behaves normally.
|
|
101
|
+
*/
|
|
102
|
+
let activeFake;
|
|
103
|
+
/** Sentinel api id the fake provider registers under. */
|
|
104
|
+
export const FAKE_API = "fake-test";
|
|
105
|
+
/**
 * Register the faux api-provider with pi-ai and arm it with the given
 * scripted responses. Returns the underlying registration so tests can
 * call `appendResponses()`, inspect `state.callCount`, etc. as needed.
 *
 * If an installation already exists, this throws — installing twice
 * usually means a stale registration from a prior test leaked through.
 * Call `clearFakeProvider()` first.
 *
 * If arming the provider fails, the fresh registration is unregistered
 * again before the error propagates, so a failed install leaves nothing
 * behind.
 *
 * @param responses - Scripted response steps handed to `setResponses`.
 * @returns The active faux registration.
 * @throws Error when a fake is already installed, or when the faux module
 *   cannot be loaded.
 */
export async function installFakeProvider(responses) {
    if (activeFake) {
        throw new Error("fake-provider: another installation is already active. " +
            "Call clearFakeProvider() in your test's afterEach before reinstalling.");
    }
    const { registerFauxProvider } = await loadFauxModule();
    const reg = registerFauxProvider({
        api: FAKE_API,
        // Pretend to be the anthropic provider so the adapter's
        // ANTHROPIC_API_KEY / ANTHROPIC_BASE_URL logic is a no-op for the
        // fake — Pi's anthropic auth path is skipped entirely once we
        // override the model's api.
        provider: "anthropic",
        models: [{ id: "fake-model", name: "fake-model" }],
    });
    try {
        reg.setResponses(responses);
    }
    catch (err) {
        // The provider is registered but was never recorded in `activeFake`,
        // so clearFakeProvider() could not undo it. Roll it back here.
        reg.unregister();
        throw err;
    }
    activeFake = reg;
    return reg;
}
|
|
133
|
+
/**
 * Tear down the current fake installation, if there is one.
 *
 * Unregisters the active faux registration and resets the module-level
 * singleton. When nothing is installed this is a no-op, so it is safe to
 * call unconditionally.
 */
export function clearFakeProvider() {
    if (!activeFake) {
        return;
    }
    activeFake.unregister();
    activeFake = undefined;
}
|
|
143
|
+
/**
 * Accessor for the module-level singleton.
 *
 * @returns The faux registration currently installed, or `undefined` when
 *   none is installed.
 */
export function getActiveFakeProvider() {
    const current = activeFake;
    return current;
}
|
|
147
|
+
/**
 * Adapter hook that decides whether the scripted fake model should be used.
 *
 * The fake is returned only when BOTH conditions hold:
 *   1. the env var AGENT_CONTROLLER_USE_FAKE_PROVIDER equals "1", and
 *   2. a fake has been installed via installFakeProvider().
 *
 * Requiring an in-process installation on top of the env var means the env
 * var alone can never activate the fake in production code paths.
 *
 * @returns The fake registration's model when both conditions hold,
 *   otherwise `undefined` (the adapter then uses pi-ai.getModel() as normal).
 */
export function resolveFakeModelIfRequested() {
    const requested = process.env.AGENT_CONTROLLER_USE_FAKE_PROVIDER === "1";
    if (!requested) {
        return undefined;
    }
    if (activeFake) {
        return activeFake.getModel();
    }
    // Env var set but nothing installed: warn loudly so the test author
    // notices, but do not throw — other code paths may set the env var
    // transitively.
    const warning = "[agent-controller] WARNING: AGENT_CONTROLLER_USE_FAKE_PROVIDER=1 but no " +
        "fake provider is installed. The runtime will fall back to the real model. " +
        "Call installFakeProvider() before runSession().\n";
    process.stderr.write(warning);
    return undefined;
}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
 * The compiled agent spec: the object the CLI dispatches to a runtime
 * adapter (see `RuntimeConfig.type`).
 */
export interface CompiledSpec {
    /** Literal version tag; always `1`. */
    v: 1;
    /** Name / owner / description of the spec. */
    metadata: SpecMetadata;
    /** Model selection (provider, name, optional temperature). */
    model: Model;
    /** Optional persona (role and instructions). */
    persona?: Persona;
    /** The task string for the session. */
    task: string;
    /** Resolved tool references. */
    tools: Array<ResolvedRef>;
    /** Resolved extension references. */
    extensions: Array<ResolvedRef>;
    /** Resolved skill references. */
    skills: Array<ResolvedRef>;
    /** MCP servers declared in the spec, if any. */
    mcpServers?: Array<MCPServer>;
    /** Resolved subagent references, if any. */
    subagents?: Array<ResolvedRef>;
    /**
     * Deprecated: use spec.extensions[].source instead.
     * A non-empty list makes the runtime emit a deprecation warning to
     * stderr. The value is still passed through unchanged because
     * `agentctl install --from` uses it.
     */
    installs?: Array<string>;
    /** Adapter selection and capability requirements. */
    runtime: RuntimeConfig;
    /** Optional per-session safety guardrail configuration. */
    guardrails?: Guardrails;
    /**
     * Set by the CLI when the user passes --resume <id>. The runtime
     * opens/continues the named session.
     */
    sessionId?: string;
}
|
|
23
|
+
/**
 * Safety guardrail settings for a single session.
 *
 * The whole object may be `undefined` (spec has no guardrails block):
 * defaults are applied at the use site in adapter.ts, so the compiler never
 * has to materialize them.
 */
export interface Guardrails {
    /**
     * Reaction of the runtime when the assistant fabricates tool-call XML
     * in its message body. Treated as "block" when absent. The behavior of
     * each mode is described in honesty.ts.
     */
    hallucinationDetector?: HallucinationMode;
}
|
|
36
|
+
/**
 * Modes accepted by `Guardrails.hallucinationDetector`. The runtime treats
 * a missing value as "block"; per-mode behavior lives in honesty.ts.
 */
export type HallucinationMode =
    | "warn"
    | "block"
    | "correct";
|
|
37
|
+
/**
 * A single MCP server entry from spec.mcpServers[]. Mirrors MCPServer in
 * types.go.
 */
export interface MCPServer {
    /** Server name. */
    name: string;
    /** Transport used to reach the server. */
    transport: "stdio" | "streamable-http" | "sse";
    /** Optional lifecycle setting: "eager" or "lazy". */
    lifecycle?: "eager" | "lazy";
    // NOTE(review): command/args/env presumably apply to the "stdio"
    // transport and url/headers to the HTTP-based ones — confirm against
    // types.go; this declaration does not enforce the pairing.
    command?: string;
    args?: string[];
    env?: { [key: string]: string };
    url?: string;
    headers?: { [key: string]: string };
}
|
|
48
|
+
/** Identifying metadata carried by a `CompiledSpec`. */
export interface SpecMetadata {
    /** Spec name (required). */
    name: string;
    /** Optional owner. */
    owner?: string;
    /** Optional free-text description. */
    description?: string;
}
|
|
53
|
+
/** Model selection for a session. */
export interface Model {
    /** Model provider; one of the three supported ids. */
    provider:
        | "anthropic"
        | "openai"
        | "google";
    /** Model name as understood by the provider. */
    name: string;
    /** Optional temperature (presumably the sampling temperature — confirm). */
    temperature?: number;
}
|
|
58
|
+
/** Optional persona block of a `CompiledSpec`; both fields may be omitted. */
export interface Persona {
    /** Role text for the agent. */
    role?: string;
    /** Instruction text for the agent. */
    instructions?: string;
}
|
|
62
|
+
/**
 * A resolved reference, used for the entries of `CompiledSpec.tools`,
 * `extensions`, `skills` and `subagents`.
 */
export interface ResolvedRef {
    /** Reference name. */
    name: string;
    /**
     * Absolute path to the Pi extension entrypoint. Left blank when
     * `source` is set OR when `builtin` is true (Pi ships the
     * implementation).
     */
    entrypoint?: string;
    /**
     * Marks a Pi-builtin tool (bash, read, edit, write). The runtime adds
     * the name to Pi's tool allowlist and loads no entrypoint.
     */
    builtin?: boolean;
    /**
     * Self-install source, e.g. "npm:pi-mcp-extension".
     * When set, the runtime installs the package if it is missing and
     * resolves the entrypoint from the package's pi.extensions manifest
     * field. Only the "npm:" prefix is supported at v0.1.6.
     */
    source?: string;
    /** Optional free-form configuration for the reference. */
    config?: { [key: string]: unknown };
}
|
|
83
|
+
/** Runtime/adapter selection block of a `CompiledSpec`. */
export interface RuntimeConfig {
    /**
     * Selects the adapter the CLI dispatches the CompiledSpec to.
     *
     * - `local`: v0.1.x legacy alias for `local-pi`, still accepted by the
     *   schema for backwards compatibility.
     * - `local-pi`: this Pi adapter.
     * - `local-opencode`: the opencode adapter (runtime-opencode/), added
     *   in v0.2 slice 2.1.
     *
     * Mirror of the enum in schemas/adl.v1alpha1.json.
     */
    type:
        | "local"
        | "local-pi"
        | "local-opencode";
    /**
     * v0.3.1 additive field: free-form capability requirements the runtime
     * must satisfy, expressed as boolean flags.
     *
     * Consumption is planned in two steps: v0.3.2 adds the RuntimeBinding
     * schema (a resource advertising what capabilities a target provides),
     * and v0.3.3 wires Backend.Resolve() to compare the two. Today (v0.3.1)
     * the value passes through CompiledSpec unchanged and adapters do not
     * act on it.
     *
     * Reserved well-known keys: streaming, sandbox, gpu, restrictedNetwork,
     * ephemeralFilesystem. Arbitrary keys are accepted so capability
     * bundles can advertise their own flags (e.g. spark, notebookContext).
     */
    requirements?: { [capability: string]: boolean };
}
|
|
105
|
+
/** Discriminator values allowed in `WireEvent.type`. */
export type WireEventType =
    | "session.started"
    | "model.request"
    | "model.response"
    | "tool.call"
    | "tool.result"
    | "message"
    | "session.ended"
    | "warning"
    | "error";
|
|
106
|
+
/**
 * Envelope for a single wire event.
 *
 * @typeParam T Shape of the `data` payload; `unknown` unless specified.
 */
export interface WireEvent<T = unknown> {
    /** Literal version tag; always `1`. */
    v: 1;
    /** Which kind of event this is. */
    type: WireEventType;
    /** Timestamp string (format not specified by this declaration). */
    ts: string;
    /** Id of the session the event belongs to. */
    sessionId: string;
    /** Event payload. */
    data: T;
}
|