@postqode/evals 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # @postqode/evals
2
+
3
+ Real-LLM evaluation harness for `@postqode/coding-agent`. Designed for CI: skips cleanly when no API key is present, enforces a dollar-cost cap (default $10), surfaces per-rule pass/fail.
4
+
5
+ ## Run the baseline
6
+
7
+ ```bash
8
+ # Anthropic direct (default)
9
+ ANTHROPIC_API_KEY=sk-ant-… npm run eval
10
+
11
+ # Via OpenRouter
12
+ EVAL_PROVIDER=openrouter OPENROUTER_API_KEY=sk-or-… \
13
+ EVAL_MODEL="anthropic/claude-sonnet-4.5" npm run eval
14
+
15
+ # Tighter budget
16
+ ANTHROPIC_API_KEY=sk-ant-… EVAL_BUDGET_USD=2 npm run eval
17
+ ```
18
+
19
+ The 5 baseline fixtures exercise distinct capabilities: read-before-summarize, grep-and-report, drop-tool respect, trivial-answer turn efficiency, custom-tool discovery.
20
+
21
+ ## Environment variables
22
+
23
+ | Var | Default | Purpose |
24
+ |---|---|---|
25
+ | `EVAL_PROVIDER` | `anthropic` | Any registered provider. |
26
+ | `EVAL_MODEL` | provider-specific | Model id. |
27
+ | `EVAL_BUDGET_USD` | `10` | Spend cap in USD, checked before each eval; a run can overshoot it by at most one eval's cost. |
28
+ | `ANTHROPIC_API_KEY` | — | Read when `EVAL_PROVIDER=anthropic`. |
29
+ | `OPENROUTER_API_KEY` | — | Read when `EVAL_PROVIDER=openrouter`. |
30
+ | `OPENAI_API_KEY` | — | Read when `EVAL_PROVIDER=openai-native`. |
31
+ | `GEMINI_API_KEY` | — | Read when `EVAL_PROVIDER=gemini`. |
32
+ | `EVAL_API_KEY` | — | Fallback for other providers. |
33
+
34
+ ## Author your own evals
35
+
36
+ ```ts
37
+ import { runEvals, toolUsed, contentIncludes, stopReasonIs } from "@postqode/evals"
38
+
39
+ const summary = await runEvals([
40
+ {
41
+ name: "my_eval",
42
+ description: "Agent reads the file before describing it.",
43
+ input: "Summarize src/foo.ts",
44
+ maxTurns: 6,
45
+ rules: [
46
+ toolUsed("read_file"),
47
+ contentIncludes("foo"),
48
+ stopReasonIs("completed"),
49
+ ],
50
+ },
51
+ ], {
52
+ provider: "anthropic",
53
+ budgetUsd: 2,
54
+ onEvalComplete: (r) => console.log(r.passed ? "✓" : "✗", r.eval.name),
55
+ })
56
+ ```
57
+
58
+ ## Rule builders (`@postqode/evals/rules`)
59
+
60
+ | Rule | Passes when |
61
+ |---|---|
62
+ | `toolUsed(name)` | Agent called `name` at least once. |
63
+ | `toolCalledAtMost(name, n)` | `name` was called ≤ n times. |
64
+ | `contentIncludes(substring)` | Final assistant message contains the substring (case-insensitive). |
65
+ | `stopReasonIs(reason)` | Run ended with the given stop reason. |
66
+ | `turnsAtMost(n)` | Used ≤ n turns. |
67
+ | `hadSuccessfulToolResult()` | At least one tool_result event had no error. |
68
+
69
+ Custom rules implement the `EvalRule` interface — any function that inspects `{ input, events, result }` and returns `{ passed, reason? }`.
70
+
71
+ ## Budget semantics
72
+
73
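+ The cap is checked **before** each eval. If the accumulated cost (summed from the provider's usage chunks) has already reached or exceeded the cap, the remaining evals are skipped and `budgetExceeded: true` is returned. Cost is only added after an eval settles, so the worst-case overspend is one eval's cost. Usage for which the provider reports no cost counts as $0 toward the cap.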
74
+
75
+ ## Skip semantics
76
+
77
+ Missing API key → the whole run returns `{ skipped: true }`, zero evals execute, `onSkip` fires once with a message naming the env var. CLI exits 0 so CI without secrets stays green.
78
+
79
+ ## Sibling packages
80
+
81
+ - [`@postqode/ai`](../postqode-ai) — provider registry consulted by the runner.
82
+ - [`@postqode/agent`](../postqode-agent) — the `AgentResult` / `AgentEvent` types the rules inspect.
83
+ - [`@postqode/coding-agent`](../postqode-coding-agent) — the agent under test.
@@ -0,0 +1,27 @@
1
+ import type { UsageTotals } from "@postqode/agent";
2
+ /**
3
+ * Accumulator for total token usage and dollar cost, checked against a
4
+ * spending cap. Runner calls {@link wouldExceed} before starting each eval and
5
+ * {@link add} after each eval settles, so a run can overshoot the cap by at
6
+ * most one eval's worth of spend.
7
+ */
8
+ export declare class BudgetTracker {
9
+ /** Spending cap in USD. */ readonly capUsd: number;
10
+ private usage;
11
+ /** @throws Error unless `capUsd > 0`. */ constructor(capUsd: number);
12
+ /** Accumulated cost in USD; 0 while no cost has been reported. */ get totalUsd(): number;
13
+ /** Shallow copy of the accumulated usage totals. */ get totals(): UsageTotals;
14
+ /**
15
+ * Merge an eval's usage into the running total. Accepts the shape the
16
+ * Agent returns from `run()`; the optional cache and cost fields may be
17
+ * absent depending on the provider, and an absent field is added as zero.
18
+ */
19
+ add(next: UsageTotals): void;
20
+ /**
21
+ * Returns true once the accumulated cost has reached or exceeded the cap
22
+ * (`totalUsd >= capUsd`), i.e. no further eval should be started. Usage with
23
+ * no reported cost counts as zero, so it never moves the total toward the cap.
24
+ */
25
+ wouldExceed(): boolean;
26
+ }
27
+ //# sourceMappingURL=budget.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"budget.d.ts","sourceRoot":"","sources":["../src/budget.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAElD;;;;;GAKG;AACH,qBAAa,aAAa;aAGG,MAAM,EAAE,MAAM;IAF1C,OAAO,CAAC,KAAK,CAAmD;gBAEpC,MAAM,EAAE,MAAM;IAM1C,IAAI,QAAQ,IAAI,MAAM,CAErB;IAED,IAAI,MAAM,IAAI,WAAW,CAExB;IAED;;;;OAIG;IACH,GAAG,CAAC,IAAI,EAAE,WAAW,GAAG,IAAI;IAU5B;;;;OAIG;IACH,WAAW,IAAI,OAAO;CAGtB"}
package/dist/budget.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.BudgetTracker=void 0;class BudgetTracker{capUsd;usage={inputTokens:0,outputTokens:0};constructor(e){if(this.capUsd=e,!(e>0))throw new Error(`BudgetTracker: capUsd must be > 0, got ${e}`)}get totalUsd(){return this.usage.costUsd??0}get totals(){return{...this.usage}}add(e){this.usage={inputTokens:this.usage.inputTokens+e.inputTokens,outputTokens:this.usage.outputTokens+e.outputTokens,cacheReadTokens:(this.usage.cacheReadTokens??0)+(e.cacheReadTokens??0)||void 0,cacheWriteTokens:(this.usage.cacheWriteTokens??0)+(e.cacheWriteTokens??0)||void 0,costUsd:(this.usage.costUsd??0)+(e.costUsd??0)||void 0}}wouldExceed(){return this.totalUsd>=this.capUsd}}exports.BudgetTracker=BudgetTracker;
2
+ //# sourceMappingURL=budget.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../src/budget.ts"],
4
+ "sourcesContent": ["import type { UsageTotals } from \"@postqode/agent\"\n\n/**\n * Accumulator that tracks total token usage and the dollar cost approaching a\n * hard cap. Runner calls {@link wouldExceed} before starting each eval and\n * {@link add} after each eval settles, so even a single runaway turn can't\n * overshoot the cap by more than one eval's worth of spend.\n */\nexport class BudgetTracker {\n\tprivate usage: UsageTotals = { inputTokens: 0, outputTokens: 0 }\n\n\tconstructor(public readonly capUsd: number) {\n\t\tif (!(capUsd > 0)) {\n\t\t\tthrow new Error(`BudgetTracker: capUsd must be > 0, got ${capUsd}`)\n\t\t}\n\t}\n\n\tget totalUsd(): number {\n\t\treturn this.usage.costUsd ?? 0\n\t}\n\n\tget totals(): UsageTotals {\n\t\treturn { ...this.usage }\n\t}\n\n\t/**\n\t * Merge an eval's usage into the running total. Accepts the shape the\n\t * Agent returns from `run()` \u2014 including the optional cache and cost\n\t * fields which may or may not be populated depending on the provider.\n\t */\n\tadd(next: UsageTotals): void {\n\t\tthis.usage = {\n\t\t\tinputTokens: this.usage.inputTokens + next.inputTokens,\n\t\t\toutputTokens: this.usage.outputTokens + next.outputTokens,\n\t\t\tcacheReadTokens: ((this.usage.cacheReadTokens ?? 0) + (next.cacheReadTokens ?? 0)) || undefined,\n\t\t\tcacheWriteTokens: ((this.usage.cacheWriteTokens ?? 0) + (next.cacheWriteTokens ?? 0)) || undefined,\n\t\t\tcostUsd: ((this.usage.costUsd ?? 0) + (next.costUsd ?? 0)) || undefined,\n\t\t}\n\t}\n\n\t/**\n\t * Returns true if starting another eval would push the total over the\n\t * cap. Conservative \u2014 treats \"unknown cost\" as zero because most\n\t * providers report `totalCost` per stream chunk.\n\t */\n\twouldExceed(): boolean {\n\t\treturn this.totalUsd >= this.capUsd\n\t}\n}\n"],
5
+ "mappings": "iGAQA,MAAa,aAAa,CAGG,OAFpB,MAAqB,CAAE,YAAa,EAAG,aAAc,CAAC,EAE9D,YAA4BA,EAAc,CACzC,GAD2B,KAAA,OAAAA,EACvB,EAAEA,EAAS,GACd,MAAM,IAAI,MAAM,0CAA0CA,CAAM,EAAE,CAEpE,CAEA,IAAI,UAAQ,CACX,OAAO,KAAK,MAAM,SAAW,CAC9B,CAEA,IAAI,QAAM,CACT,MAAO,CAAE,GAAG,KAAK,KAAK,CACvB,CAOA,IAAIC,EAAiB,CACpB,KAAK,MAAQ,CACZ,YAAa,KAAK,MAAM,YAAcA,EAAK,YAC3C,aAAc,KAAK,MAAM,aAAeA,EAAK,aAC7C,iBAAmB,KAAK,MAAM,iBAAmB,IAAMA,EAAK,iBAAmB,IAAO,OACtF,kBAAoB,KAAK,MAAM,kBAAoB,IAAMA,EAAK,kBAAoB,IAAO,OACzF,SAAW,KAAK,MAAM,SAAW,IAAMA,EAAK,SAAW,IAAO,OAEhE,CAOA,aAAW,CACV,OAAO,KAAK,UAAY,KAAK,MAC9B,EAvCD,QAAA,cAAA",
6
+ "names": ["capUsd", "next"]
7
+ }
@@ -0,0 +1,9 @@
1
+ import type { Eval } from "../types";
2
+ /** Expects the agent to call `read_file` when asked to summarize a file, rather than guessing at its contents. */ export declare const readsBeforeSummarizing: Eval;
3
+ /** Expects the agent to use `grep` for a string search and report the matching file. */ export declare const findsStringWithGrep: Eval;
4
+ /** Expects the agent to avoid `read_file` when it has been dropped and to use `bash` instead. */ export declare const respectsDropTool: Eval;
5
+ /** Expects a trivial question to be answered within two turns. */ export declare const trivialAnswersInFewTurns: Eval;
6
+ /** Expects a custom tool supplied via `extraTools` to be called when the prompt asks for it. */ export declare const callsCustomTool: Eval;
7
+ /** All five baseline fixture evals, in a stable order. */
8
+ export declare const baselineFixtures: Eval[];
9
+ //# sourceMappingURL=basic.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"basic.d.ts","sourceRoot":"","sources":["../../src/fixtures/basic.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,UAAU,CAAA;AAuCpC,eAAO,MAAM,sBAAsB,EAAE,IAapC,CAAA;AAED,eAAO,MAAM,mBAAmB,EAAE,IAajC,CAAA;AAED,eAAO,MAAM,gBAAgB,EAAE,IA4B9B,CAAA;AAED,eAAO,MAAM,wBAAwB,EAAE,IAWtC,CAAA;AAED,eAAO,MAAM,eAAe,EAAE,IAa7B,CAAA;AAED,qDAAqD;AACrD,eAAO,MAAM,gBAAgB,EAAE,IAAI,EAMlC,CAAA"}
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.baselineFixtures=exports.callsCustomTool=exports.trivialAnswersInFewTurns=exports.respectsDropTool=exports.findsStringWithGrep=exports.readsBeforeSummarizing=void 0;const rules_1=require("../rules"),COUNT_WORDS_TOOL={name:"count_words",description:"Returns the number of whitespace-separated words in the given text. Use when the user explicitly asks to count words.",inputSchema:{type:"object",properties:{text:{type:"string"}},required:["text"]},async execute({text:t}){const e=t.trim().length===0?0:t.trim().split(/\s+/).length;return{content:String(e),details:{count:e}}}};exports.readsBeforeSummarizing={name:"reads_before_summarizing",description:"Agent calls read_file before summarizing a file rather than guessing at its contents.",input:"Summarize what the file packages/postqode-ai/package.json describes. Be brief.",maxTurns:6,rules:[(0,rules_1.toolUsed)("read_file"),(0,rules_1.hadSuccessfulToolResult)(),(0,rules_1.contentIncludes)("postqode"),(0,rules_1.stopReasonIs)("completed")]},exports.findsStringWithGrep={name:"finds_string_with_grep",description:"Given a repo-relative search request, the agent uses grep and reports the match.",input:'Search the packages/postqode-agent/src directory for the string "AbortController" and tell me which file contains it.',maxTurns:6,rules:[(0,rules_1.toolUsed)("grep"),(0,rules_1.hadSuccessfulToolResult)(),(0,rules_1.contentIncludes)("agent.ts"),(0,rules_1.stopReasonIs)("completed")]},exports.respectsDropTool={name:"respects_drop_tool",description:"When read_file is explicitly dropped, the agent does NOT call it and falls back to another tool (bash cat).",input:"Read the first few lines of packages/postqode-ai/package.json and tell me the package name.",maxTurns:6,dropTools:["read_file"],systemPromptExtras:"The read_file tool is unavailable in this session. 
Use bash instead to inspect files.",rules:[{name:"did_not_call_read_file",check:({events:t})=>{const e=t.some(s=>s.type==="tool_use"&&s.name==="read_file");return{passed:!e,reason:e?"Agent called read_file despite it being dropped from the tool pack.":void 0}}},(0,rules_1.toolUsed)("bash"),(0,rules_1.contentIncludes)("@postqode/ai"),(0,rules_1.stopReasonIs)("completed")]},exports.trivialAnswersInFewTurns={name:"trivial_answers_in_few_turns",description:"Agent doesn't burn turns on a question it can answer from pretraining.",input:"What is 2 + 2? Reply with just the number.",maxTurns:3,rules:[(0,rules_1.turnsAtMost)(2),(0,rules_1.contentIncludes)("4"),(0,rules_1.stopReasonIs)("completed")]},exports.callsCustomTool={name:"calls_custom_tool",description:"Custom tools added via extraTools are discoverable and used when the prompt matches.",input:'Use the count_words tool to count words in "hello there how are you". Tell me the count.',maxTurns:4,extraTools:[COUNT_WORDS_TOOL],rules:[(0,rules_1.toolUsed)("count_words"),(0,rules_1.hadSuccessfulToolResult)(),(0,rules_1.contentIncludes)("5"),(0,rules_1.stopReasonIs)("completed")]},exports.baselineFixtures=[exports.readsBeforeSummarizing,exports.findsStringWithGrep,exports.respectsDropTool,exports.trivialAnswersInFewTurns,exports.callsCustomTool];
2
+ //# sourceMappingURL=basic.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../../src/fixtures/basic.ts"],
4
+ "sourcesContent": ["import type { Tool } from \"@postqode/agent\"\n\nimport type { Eval } from \"../types\"\n// Re-imports to keep the fixtures file free of deep paths.\nimport {\n\tcontentIncludes,\n\thadSuccessfulToolResult,\n\tstopReasonIs,\n\ttoolUsed,\n\tturnsAtMost,\n} from \"../rules\"\n\n/**\n * Baseline coding-agent fixture evals. Designed for low cost and\n * low flakiness: each targets \u226410 turns and exercises a distinct\n * capability.\n *\n * Scope:\n * - No filesystem mutations \u2014 keeps the suite re-runnable without\n * cleanup. A follow-up can add write-oriented evals once the runner\n * gains setup/teardown hooks.\n * - Prompts phrased so any competent model should produce a tool call;\n * rules check FOR the call, not for a particular response wording.\n */\n\nconst COUNT_WORDS_TOOL: Tool<{ text: string }> = {\n\tname: \"count_words\",\n\tdescription:\n\t\t\"Returns the number of whitespace-separated words in the given text. \" +\n\t\t\"Use when the user explicitly asks to count words.\",\n\tinputSchema: {\n\t\ttype: \"object\",\n\t\tproperties: { text: { type: \"string\" } },\n\t\trequired: [\"text\"],\n\t},\n\tasync execute({ text }: { text: string }) {\n\t\tconst count = text.trim().length === 0 ? 0 : text.trim().split(/\\s+/).length\n\t\treturn { content: String(count), details: { count } }\n\t},\n}\n\nexport const readsBeforeSummarizing: Eval = {\n\tname: \"reads_before_summarizing\",\n\tdescription:\n\t\t\"Agent calls read_file before summarizing a file rather than guessing at its contents.\",\n\tinput:\n\t\t\"Summarize what the file packages/postqode-ai/package.json describes. 
Be brief.\",\n\tmaxTurns: 6,\n\trules: [\n\t\ttoolUsed(\"read_file\"),\n\t\thadSuccessfulToolResult(),\n\t\tcontentIncludes(\"postqode\"),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\nexport const findsStringWithGrep: Eval = {\n\tname: \"finds_string_with_grep\",\n\tdescription:\n\t\t\"Given a repo-relative search request, the agent uses grep and reports the match.\",\n\tinput:\n\t\t'Search the packages/postqode-agent/src directory for the string \"AbortController\" and tell me which file contains it.',\n\tmaxTurns: 6,\n\trules: [\n\t\ttoolUsed(\"grep\"),\n\t\thadSuccessfulToolResult(),\n\t\tcontentIncludes(\"agent.ts\"),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\nexport const respectsDropTool: Eval = {\n\tname: \"respects_drop_tool\",\n\tdescription:\n\t\t\"When read_file is explicitly dropped, the agent does NOT call it and falls back to another tool (bash cat).\",\n\tinput:\n\t\t\"Read the first few lines of packages/postqode-ai/package.json and tell me the package name.\",\n\tmaxTurns: 6,\n\tdropTools: [\"read_file\"],\n\tsystemPromptExtras:\n\t\t\"The read_file tool is unavailable in this session. Use bash instead to inspect files.\",\n\trules: [\n\t\t// Must NOT have called read_file.\n\t\t{\n\t\t\tname: \"did_not_call_read_file\",\n\t\t\tcheck: ({ events }) => {\n\t\t\t\tconst called = events.some((e) => e.type === \"tool_use\" && e.name === \"read_file\")\n\t\t\t\treturn {\n\t\t\t\t\tpassed: !called,\n\t\t\t\t\treason: called\n\t\t\t\t\t\t? \"Agent called read_file despite it being dropped from the tool pack.\"\n\t\t\t\t\t\t: undefined,\n\t\t\t\t}\n\t\t\t},\n\t\t},\n\t\ttoolUsed(\"bash\"),\n\t\tcontentIncludes(\"@postqode/ai\"),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\nexport const trivialAnswersInFewTurns: Eval = {\n\tname: \"trivial_answers_in_few_turns\",\n\tdescription:\n\t\t\"Agent doesn't burn turns on a question it can answer from pretraining.\",\n\tinput: \"What is 2 + 2? 
Reply with just the number.\",\n\tmaxTurns: 3,\n\trules: [\n\t\tturnsAtMost(2),\n\t\tcontentIncludes(\"4\"),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\nexport const callsCustomTool: Eval = {\n\tname: \"calls_custom_tool\",\n\tdescription:\n\t\t\"Custom tools added via extraTools are discoverable and used when the prompt matches.\",\n\tinput: 'Use the count_words tool to count words in \"hello there how are you\". Tell me the count.',\n\tmaxTurns: 4,\n\textraTools: [COUNT_WORDS_TOOL],\n\trules: [\n\t\ttoolUsed(\"count_words\"),\n\t\thadSuccessfulToolResult(),\n\t\tcontentIncludes(\"5\"),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\n/** All baseline fixture evals, in a stable order. */\nexport const baselineFixtures: Eval[] = [\n\treadsBeforeSummarizing,\n\tfindsStringWithGrep,\n\trespectsDropTool,\n\ttrivialAnswersInFewTurns,\n\tcallsCustomTool,\n]\n"],
5
+ "mappings": "iPAIA,MAAA,QAAA,QAAA,UAAA,EAqBM,iBAA2C,CAChD,KAAM,cACN,YACC,wHAED,YAAa,CACZ,KAAM,SACN,WAAY,CAAE,KAAM,CAAE,KAAM,QAAQ,CAAE,EACtC,SAAU,CAAC,MAAM,GAElB,MAAM,QAAQ,CAAE,KAAAA,CAAI,EAAoB,CACvC,MAAMC,EAAQD,EAAK,KAAI,EAAG,SAAW,EAAI,EAAIA,EAAK,KAAI,EAAG,MAAM,KAAK,EAAE,OACtE,MAAO,CAAE,QAAS,OAAOC,CAAK,EAAG,QAAS,CAAE,MAAAA,CAAK,CAAE,CACpD,GAGY,QAAA,uBAA+B,CAC3C,KAAM,2BACN,YACC,wFACD,MACC,iFACD,SAAU,EACV,MAAO,IACN,QAAA,UAAS,WAAW,KACpB,QAAA,yBAAuB,KACvB,QAAA,iBAAgB,UAAU,KAC1B,QAAA,cAAa,WAAW,IAIb,QAAA,oBAA4B,CACxC,KAAM,yBACN,YACC,mFACD,MACC,wHACD,SAAU,EACV,MAAO,IACN,QAAA,UAAS,MAAM,KACf,QAAA,yBAAuB,KACvB,QAAA,iBAAgB,UAAU,KAC1B,QAAA,cAAa,WAAW,IAIb,QAAA,iBAAyB,CACrC,KAAM,qBACN,YACC,8GACD,MACC,8FACD,SAAU,EACV,UAAW,CAAC,WAAW,EACvB,mBACC,wFACD,MAAO,CAEN,CACC,KAAM,yBACN,MAAO,CAAC,CAAE,OAAAC,CAAM,IAAM,CACrB,MAAMC,EAASD,EAAO,KAAME,GAAMA,EAAE,OAAS,YAAcA,EAAE,OAAS,WAAW,EACjF,MAAO,CACN,OAAQ,CAACD,EACT,OAAQA,EACL,sEACA,OAEL,MAED,QAAA,UAAS,MAAM,KACf,QAAA,iBAAgB,cAAc,KAC9B,QAAA,cAAa,WAAW,IAIb,QAAA,yBAAiC,CAC7C,KAAM,+BACN,YACC,yEACD,MAAO,6CACP,SAAU,EACV,MAAO,IACN,QAAA,aAAY,CAAC,KACb,QAAA,iBAAgB,GAAG,KACnB,QAAA,cAAa,WAAW,IAIb,QAAA,gBAAwB,CACpC,KAAM,oBACN,YACC,uFACD,MAAO,2FACP,SAAU,EACV,WAAY,CAAC,gBAAgB,EAC7B,MAAO,IACN,QAAA,UAAS,aAAa,KACtB,QAAA,yBAAuB,KACvB,QAAA,iBAAgB,GAAG,KACnB,QAAA,cAAa,WAAW,IAKb,QAAA,iBAA2B,CACvC,QAAA,uBACA,QAAA,oBACA,QAAA,iBACA,QAAA,yBACA,QAAA",
6
+ "names": ["text", "count", "events", "called", "e"]
7
+ }
@@ -0,0 +1,17 @@
1
+ import type { Eval, EvalRunConfig, EvalRunSummary } from "../types";
2
+ /** Expects the agent to read code and return a plan without calling `write_file` or `edit_file`. */ export declare const planModeProducesPlan: Eval;
3
+ /** Expects the agent to describe a requested fix without calling `write_file` or `edit_file`. */ export declare const planModeRefusesWritesOnFixRequest: Eval;
4
+ /** Plan-mode fixtures. Run them via `runPlanFixtures`, which supplies `createPlanAgent` as the agent factory. */
5
+ export declare const planFixtures: Eval[];
6
+ /**
7
+ * Run the plan-mode fixtures through the standard runner, swapping in
8
+ * `createPlanAgent` as the agent factory so write-side tools are
9
+ * dropped and the plan-mode rubric is in effect. An `agentFactory` passed in `config` is overridden.
10
+ *
11
+ * ```ts
12
+ * import { runPlanFixtures } from "@postqode/evals"
13
+ * const summary = await runPlanFixtures({ budgetUsd: 2 })
14
+ * ```
15
+ */
16
+ export declare function runPlanFixtures(config?: EvalRunConfig): Promise<EvalRunSummary>;
17
+ //# sourceMappingURL=plan.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"plan.d.ts","sourceRoot":"","sources":["../../src/fixtures/plan.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EAAE,IAAI,EAAY,aAAa,EAAE,cAAc,EAAE,MAAM,UAAU,CAAA;AAmC7E,eAAO,MAAM,oBAAoB,EAAE,IAclC,CAAA;AAED,eAAO,MAAM,iCAAiC,EAAE,IAQ/C,CAAA;AAED,sEAAsE;AACtE,eAAO,MAAM,YAAY,EAAE,IAAI,EAG9B,CAAA;AAED;;;;;;;;;GASG;AACH,wBAAgB,eAAe,CAAC,MAAM,GAAE,aAAkB,GAAG,OAAO,CAAC,cAAc,CAAC,CAKnF"}
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.planFixtures=exports.planModeRefusesWritesOnFixRequest=exports.planModeProducesPlan=void 0,exports.runPlanFixtures=runPlanFixtures;const coding_agent_1=require("@postqode/coding-agent"),runner_1=require("../runner"),rules_1=require("../rules"),neverCalledWriteTools={name:"never_called_write_tools",check:({events:t})=>{const n=t.filter(e=>e.type==="tool_use"&&(e.name==="write_file"||e.name==="edit_file"));return{passed:n.length===0,reason:n.length===0?void 0:`Plan mode must not call write tools; saw ${n.map(e=>e.name).join(", ")}.`}}};exports.planModeProducesPlan={name:"plan_mode_produces_plan",description:"In plan mode, the agent reads relevant code and returns a plan without writing anything.",input:"Plan how to add a new tool to @postqode/coding-agent's default pack. Outline the files I'd touch and the order.",maxTurns:8,rules:[(0,rules_1.toolUsed)("read_file"),(0,rules_1.hadSuccessfulToolResult)(),neverCalledWriteTools,(0,rules_1.contentIncludes)("tool"),(0,rules_1.stopReasonIs)("completed")]},exports.planModeRefusesWritesOnFixRequest={name:"plan_mode_refuses_writes_on_fix_request",description:"Even when the user says 'fix', plan mode describes the fix instead of editing anything.",input:"Fix the placeholder comment in packages/postqode-ai/src/index.ts by filling in what @postqode/ai exports.",maxTurns:6,rules:[(0,rules_1.toolUsed)("read_file"),neverCalledWriteTools,(0,rules_1.stopReasonIs)("completed")]},exports.planFixtures=[exports.planModeProducesPlan,exports.planModeRefusesWritesOnFixRequest];function runPlanFixtures(t={}){return(0,runner_1.runEvals)(exports.planFixtures,{...t,agentFactory:coding_agent_1.createPlanAgent})}
2
+ //# sourceMappingURL=plan.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../../src/fixtures/plan.ts"],
4
+ "sourcesContent": ["import { createPlanAgent } from \"@postqode/coding-agent\"\n\nimport { runEvals } from \"../runner\"\nimport {\n\tcontentIncludes,\n\thadSuccessfulToolResult,\n\tstopReasonIs,\n\ttoolUsed,\n} from \"../rules\"\nimport type { Eval, EvalRule, EvalRunConfig, EvalRunSummary } from \"../types\"\n\n/**\n * Plan-mode fixture evals.\n *\n * Plan mode is the PostQode extension's read-only analysis mode. The\n * SDK equivalent is `createPlanAgent` from @postqode/coding-agent:\n * write-side tools (write_file, edit_file) are dropped, and the\n * system prompt is augmented with a plan-mode rubric.\n *\n * Two non-negotiables these fixtures assert:\n * 1. The agent still actively inspects the codebase (reads/greps/lists).\n * 2. The agent NEVER calls a write-side tool.\n */\n\nconst neverCalledWriteTools: EvalRule = {\n\tname: \"never_called_write_tools\",\n\tcheck: ({ events }) => {\n\t\tconst writes = events.filter(\n\t\t\t(e) =>\n\t\t\t\te.type === \"tool_use\" &&\n\t\t\t\t(e.name === \"write_file\" || e.name === \"edit_file\"),\n\t\t)\n\t\treturn {\n\t\t\tpassed: writes.length === 0,\n\t\t\treason:\n\t\t\t\twrites.length === 0\n\t\t\t\t\t? undefined\n\t\t\t\t\t: `Plan mode must not call write tools; saw ${writes\n\t\t\t\t\t\t\t.map((w) => (w as { name: string }).name)\n\t\t\t\t\t\t\t.join(\", \")}.`,\n\t\t}\n\t},\n}\n\nexport const planModeProducesPlan: Eval = {\n\tname: \"plan_mode_produces_plan\",\n\tdescription:\n\t\t\"In plan mode, the agent reads relevant code and returns a plan without writing anything.\",\n\tinput:\n\t\t\"Plan how to add a new tool to @postqode/coding-agent's default pack. 
Outline the files I'd touch and the order.\",\n\tmaxTurns: 8,\n\trules: [\n\t\ttoolUsed(\"read_file\"),\n\t\thadSuccessfulToolResult(),\n\t\tneverCalledWriteTools,\n\t\tcontentIncludes(\"tool\"),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\nexport const planModeRefusesWritesOnFixRequest: Eval = {\n\tname: \"plan_mode_refuses_writes_on_fix_request\",\n\tdescription:\n\t\t\"Even when the user says 'fix', plan mode describes the fix instead of editing anything.\",\n\tinput:\n\t\t\"Fix the placeholder comment in packages/postqode-ai/src/index.ts by filling in what @postqode/ai exports.\",\n\tmaxTurns: 6,\n\trules: [toolUsed(\"read_file\"), neverCalledWriteTools, stopReasonIs(\"completed\")],\n}\n\n/** Plan-mode fixtures. Runner builds these with `createPlanAgent`. */\nexport const planFixtures: Eval[] = [\n\tplanModeProducesPlan,\n\tplanModeRefusesWritesOnFixRequest,\n]\n\n/**\n * Run the plan-mode fixtures through the standard runner, swapping in\n * `createPlanAgent` as the agent factory so write-side tools are\n * dropped and the plan-mode rubric is in effect.\n *\n * ```ts\n * import { runPlanFixtures } from \"@postqode/evals\"\n * const summary = await runPlanFixtures({ budgetUsd: 2 })\n * ```\n */\nexport function runPlanFixtures(config: EvalRunConfig = {}): Promise<EvalRunSummary> {\n\treturn runEvals(planFixtures, {\n\t\t...config,\n\t\tagentFactory: createPlanAgent,\n\t})\n}\n"],
5
+ "mappings": "uKAsFA,QAAA,gBAAA,gBAtFA,MAAA,eAAA,QAAA,wBAAA,EAEA,SAAA,QAAA,WAAA,EACA,QAAA,QAAA,UAAA,EAqBM,sBAAkC,CACvC,KAAM,2BACN,MAAO,CAAC,CAAE,OAAAA,CAAM,IAAM,CACrB,MAAMC,EAASD,EAAO,OACpB,GACA,EAAE,OAAS,aACV,EAAE,OAAS,cAAgB,EAAE,OAAS,YAAY,EAErD,MAAO,CACN,OAAQC,EAAO,SAAW,EAC1B,OACCA,EAAO,SAAW,EACf,OACA,4CAA4CA,EAC3C,IAAKC,GAAOA,EAAuB,IAAI,EACvC,KAAK,IAAI,CAAC,IAEjB,GAGY,QAAA,qBAA6B,CACzC,KAAM,0BACN,YACC,2FACD,MACC,kHACD,SAAU,EACV,MAAO,IACN,QAAA,UAAS,WAAW,KACpB,QAAA,yBAAuB,EACvB,yBACA,QAAA,iBAAgB,MAAM,KACtB,QAAA,cAAa,WAAW,IAIb,QAAA,kCAA0C,CACtD,KAAM,0CACN,YACC,0FACD,MACC,4GACD,SAAU,EACV,MAAO,IAAC,QAAA,UAAS,WAAW,EAAG,yBAAuB,QAAA,cAAa,WAAW,CAAC,GAInE,QAAA,aAAuB,CACnC,QAAA,qBACA,QAAA,mCAaD,SAAgB,gBAAgBC,EAAwB,CAAA,EAAE,CACzD,SAAO,SAAA,UAAS,QAAA,aAAc,CAC7B,GAAGA,EACH,aAAc,eAAA,gBACd,CACF",
6
+ "names": ["events", "writes", "w", "config"]
7
+ }
@@ -0,0 +1,31 @@
1
+ import type { Eval } from "../types";
2
+ /**
3
+ * Prompt-behavior eval fixtures.
4
+ *
5
+ * Targets specific behaviors that prompt-component trims can break:
6
+ * - Tool selection (rules.ts / capabilities.ts cuts → wrong tool picked)
7
+ * - Format compliance (editing_files.ts cuts → malformed SEARCH/REPLACE)
8
+ * - Plan-vs-Agent mode awareness (act_vs_plan_mode.ts cuts)
9
+ * - Iteration discipline (objective.ts cuts → one-shot or back-and-forth)
10
+ * - Subagent restraint (cli_subagents.ts cuts → over- or under-delegation)
11
+ *
12
+ * Each eval is small (≤8 turns), self-contained, and budget-friendly.
13
+ * They run via the same runner as fixtures/basic.ts:
14
+ *
15
+ * ANTHROPIC_API_KEY=… npm run eval -- --suite prompt-behavior
16
+ *
17
+ * Or pass directly to runEvals() in a custom script.
18
+ */
19
+ /** Expects `grep` for a repo-wide string search, with at most one `read_file` call. */ export declare const picksGrepForRegexSearch: Eval;
20
+ /** Expects `read_file` (and no `grep`) when the user names a file directly. */ export declare const picksReadFileForKnownPath: Eval;
21
+ /** Expects `list_files` (and no `read_file`) when asked what a directory contains. */ export declare const picksListFilesForDirectoryOverview: Eval;
22
+ /** Expects the final assistant message not to end with a question mark. */ export declare const doesNotEndWithQuestion: Eval;
23
+ /** Expects at most one `tool_use` event per turn (no parallel tool calls). */ export declare const oneToolPerTurn: Eval;
24
+ /** Expects `edit_file` (and no `write_file`) for a small targeted change; flagged `slow`. */ export declare const usesEditFileForTargetedChange: Eval;
25
+ /** Intended to check that a single-file lookup is not delegated; the rules require `read_file` and at most one `bash` call. */ export declare const doesNotSpawnSubagentForSimpleTask: Eval;
26
+ /**
27
+ * Default suite of prompt-behavior evals. Skip slow ones in CI by leaving
28
+ * `runSlow: false` on the runner config.
29
+ */
30
+ export declare const promptBehaviorEvals: readonly Eval[];
31
+ //# sourceMappingURL=prompt-behavior.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompt-behavior.d.ts","sourceRoot":"","sources":["../../src/fixtures/prompt-behavior.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,UAAU,CAAA;AAGpC;;;;;;;;;;;;;;;;GAgBG;AAMH,eAAO,MAAM,uBAAuB,EAAE,IAcrC,CAAA;AAED,eAAO,MAAM,yBAAyB,EAAE,IAavC,CAAA;AAED,eAAO,MAAM,kCAAkC,EAAE,IAYhD,CAAA;AAMD,eAAO,MAAM,sBAAsB,EAAE,IA2BpC,CAAA;AAED,eAAO,MAAM,cAAc,EAAE,IAsC5B,CAAA;AAMD,eAAO,MAAM,6BAA6B,EAAE,IAc3C,CAAA;AAMD,eAAO,MAAM,iCAAiC,EAAE,IAW/C,CAAA;AAMD;;;GAGG;AACH,eAAO,MAAM,mBAAmB,EAAE,SAAS,IAAI,EAQ9C,CAAA"}
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.promptBehaviorEvals=exports.doesNotSpawnSubagentForSimpleTask=exports.usesEditFileForTargetedChange=exports.oneToolPerTurn=exports.doesNotEndWithQuestion=exports.picksListFilesForDirectoryOverview=exports.picksReadFileForKnownPath=exports.picksGrepForRegexSearch=void 0;const rules_1=require("../rules");exports.picksGrepForRegexSearch={name:"picks_grep_for_regex_search",description:"Asked to find a regex, the agent uses grep \u2014 not blanket file reading.",input:"Find every line in this repo where the string `TODO` appears. Return just the count.",maxTurns:4,rules:[(0,rules_1.toolUsed)("grep"),(0,rules_1.toolCalledAtMost)("read_file",1),(0,rules_1.stopReasonIs)("completed")]},exports.picksReadFileForKnownPath={name:"picks_read_file_for_known_path",description:"When the user names a file directly, the agent reads it instead of grepping.",input:"What is the `name` field set to in `packages/postqode-ai/package.json`?",maxTurns:4,rules:[(0,rules_1.toolUsed)("read_file"),(0,rules_1.toolCalledAtMost)("grep",0),(0,rules_1.contentIncludes)("postqode"),(0,rules_1.stopReasonIs)("completed")]},exports.picksListFilesForDirectoryOverview={name:"picks_list_files_for_directory_overview",description:"Given a directory, the agent uses list_files (not read_file on every entry).",input:"What files are in the `packages/postqode-evals/src/bin/` directory?",maxTurns:4,rules:[(0,rules_1.toolUsed)("list_files"),(0,rules_1.toolCalledAtMost)("read_file",0),(0,rules_1.stopReasonIs)("completed")]},exports.doesNotEndWithQuestion={name:"does_not_end_with_question",description:"After completing a request, the agent does not end with a follow-up question.",input:"Tell me what programming language `packages/postqode-ai/src/index.ts` is written in.",maxTurns:4,rules:[(0,rules_1.stopReasonIs)("completed"),{name:"final_message_not_a_question",check:({result:o})=>{const 
e=o.messages.filter(r=>r.role==="assistant"),t=e[e.length-1],s=typeof t?.content=="string"?t.content.trim():"",n=s.endsWith("?");return{passed:!n,reason:n?`Final message ends with a question: "${s.slice(-100)}"`:void 0}}}]},exports.oneToolPerTurn={name:"one_tool_per_turn",description:"Agent calls tools sequentially across turns, not parallel within one turn.",input:"Read packages/postqode-ai/package.json AND tell me its `name` field. Be terse.",maxTurns:6,rules:[(0,rules_1.toolUsed)("read_file"),(0,rules_1.stopReasonIs)("completed"),{name:"no_parallel_tool_calls_in_single_turn",check:({events:o})=>{let e=!1,t=!1;for(const s of o)if(s.type==="turn_start"&&(e=!1),s.type==="tool_use"){if(e){t=!0;break}e=!0}return{passed:!t,reason:t?"Agent issued multiple tool_use events within one turn (parallel calls).":void 0}}}]},exports.usesEditFileForTargetedChange={name:"uses_edit_file_for_targeted_change",description:"Given a small targeted change, the agent picks edit_file (replace_in_file family), not write_file.",input:'In packages/postqode-ai/package.json, change the value of the `version` field from "0.1.0" to "0.1.1". Show me the diff after.',maxTurns:6,dropTools:["bash"],rules:[(0,rules_1.toolUsed)("edit_file"),(0,rules_1.toolCalledAtMost)("write_file",0),(0,rules_1.stopReasonIs)("completed")],slow:!0},exports.doesNotSpawnSubagentForSimpleTask={name:"does_not_spawn_subagent_for_simple_task",description:"For a single-file lookup, the agent does NOT delegate to a subagent.",input:"What does the function `getSystemPrompt` in `packages/postqode-prompts/src/index.ts` do? 
Read it and explain in one sentence.",maxTurns:5,rules:[(0,rules_1.toolUsed)("read_file"),(0,rules_1.toolCalledAtMost)("bash",1),(0,rules_1.stopReasonIs)("completed")]},exports.promptBehaviorEvals=[exports.picksGrepForRegexSearch,exports.picksReadFileForKnownPath,exports.picksListFilesForDirectoryOverview,exports.doesNotEndWithQuestion,exports.oneToolPerTurn,exports.usesEditFileForTargetedChange,exports.doesNotSpawnSubagentForSimpleTask];
2
+ //# sourceMappingURL=prompt-behavior.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../../src/fixtures/prompt-behavior.ts"],
4
+ "sourcesContent": ["import type { Eval } from \"../types\"\nimport { contentIncludes, stopReasonIs, toolCalledAtMost, toolUsed } from \"../rules\"\n\n/**\n * Prompt-behavior eval fixtures.\n *\n * Targets specific behaviors that prompt-component trims can break:\n * - Tool selection (rules.ts / capabilities.ts cuts \u2192 wrong tool picked)\n * - Format compliance (editing_files.ts cuts \u2192 malformed SEARCH/REPLACE)\n * - Plan-vs-Agent mode awareness (act_vs_plan_mode.ts cuts)\n * - Iteration discipline (objective.ts cuts \u2192 one-shot or back-and-forth)\n * - Subagent restraint (cli_subagents.ts cuts \u2192 over- or under-delegation)\n *\n * Each eval is small (\u22648 turns), self-contained, and budget-friendly.\n * They run via the same runner as fixtures/basic.ts:\n *\n * ANTHROPIC_API_KEY=\u2026 npm run eval -- --suite prompt-behavior\n *\n * Or pass directly to runEvals() in a custom script.\n */\n\n// =====================================================================\n// Tool selection \u2014 rules + capabilities cuts can confuse routing\n// =====================================================================\n\nexport const picksGrepForRegexSearch: Eval = {\n\tname: \"picks_grep_for_regex_search\",\n\tdescription: \"Asked to find a regex, the agent uses grep \u2014 not blanket file reading.\",\n\tinput:\n\t\t\"Find every line in this repo where the string `TODO` appears. Return just the count.\",\n\tmaxTurns: 4,\n\trules: [\n\t\ttoolUsed(\"grep\"),\n\t\t// Allow up to 1 read_file (a model may read a single file to verify a\n\t\t// match it found via grep). 
More than 1 indicates the model is reading\n\t\t// files broadly instead of using grep to narrow first.\n\t\ttoolCalledAtMost(\"read_file\", 1),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\nexport const picksReadFileForKnownPath: Eval = {\n\tname: \"picks_read_file_for_known_path\",\n\tdescription:\n\t\t\"When the user names a file directly, the agent reads it instead of grepping.\",\n\tinput:\n\t\t\"What is the `name` field set to in `packages/postqode-ai/package.json`?\",\n\tmaxTurns: 4,\n\trules: [\n\t\ttoolUsed(\"read_file\"),\n\t\ttoolCalledAtMost(\"grep\", 0),\n\t\tcontentIncludes(\"postqode\"),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\nexport const picksListFilesForDirectoryOverview: Eval = {\n\tname: \"picks_list_files_for_directory_overview\",\n\tdescription:\n\t\t\"Given a directory, the agent uses list_files (not read_file on every entry).\",\n\tinput:\n\t\t\"What files are in the `packages/postqode-evals/src/bin/` directory?\",\n\tmaxTurns: 4,\n\trules: [\n\t\ttoolUsed(\"list_files\"),\n\t\ttoolCalledAtMost(\"read_file\", 0),\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\n// =====================================================================\n// Iteration discipline \u2014 objective.ts trims can erode this\n// =====================================================================\n\nexport const doesNotEndWithQuestion: Eval = {\n\tname: \"does_not_end_with_question\",\n\tdescription:\n\t\t\"After completing a request, the agent does not end with a follow-up question.\",\n\tinput: \"Tell me what programming language `packages/postqode-ai/src/index.ts` is written in.\",\n\tmaxTurns: 4,\n\trules: [\n\t\tstopReasonIs(\"completed\"),\n\t\t// The final assistant message should NOT end with a question mark\n\t\t// after the agent has answered the question. 
Implemented as a\n\t\t// custom rule via the rule API:\n\t\t{\n\t\t\tname: \"final_message_not_a_question\",\n\t\t\tcheck: ({ result }) => {\n\t\t\t\tconst assistants = result.messages.filter((m) => m.role === \"assistant\")\n\t\t\t\tconst last = assistants[assistants.length - 1]\n\t\t\t\tconst text = typeof last?.content === \"string\" ? last.content.trim() : \"\"\n\t\t\t\tconst endsWithQuestion = text.endsWith(\"?\")\n\t\t\t\treturn {\n\t\t\t\t\tpassed: !endsWithQuestion,\n\t\t\t\t\treason: endsWithQuestion\n\t\t\t\t\t\t? `Final message ends with a question: \"${text.slice(-100)}\"`\n\t\t\t\t\t\t: undefined,\n\t\t\t\t}\n\t\t\t},\n\t\t},\n\t],\n}\n\nexport const oneToolPerTurn: Eval = {\n\tname: \"one_tool_per_turn\",\n\tdescription:\n\t\t\"Agent calls tools sequentially across turns, not parallel within one turn.\",\n\tinput:\n\t\t\"Read packages/postqode-ai/package.json AND tell me its `name` field. Be terse.\",\n\tmaxTurns: 6,\n\trules: [\n\t\ttoolUsed(\"read_file\"),\n\t\tstopReasonIs(\"completed\"),\n\t\t// Number of distinct turns containing tool_use events should\n\t\t// equal the number of tool_use events. Two tools issued in one\n\t\t// turn would produce more tool_use events than turn_start\n\t\t// events that contained any.\n\t\t{\n\t\t\tname: \"no_parallel_tool_calls_in_single_turn\",\n\t\t\tcheck: ({ events }) => {\n\t\t\t\tlet currentTurnHasTool = false\n\t\t\t\tlet multipleInOneTurn = false\n\t\t\t\tfor (const e of events) {\n\t\t\t\t\tif (e.type === \"turn_start\") currentTurnHasTool = false\n\t\t\t\t\tif (e.type === \"tool_use\") {\n\t\t\t\t\t\tif (currentTurnHasTool) {\n\t\t\t\t\t\t\tmultipleInOneTurn = true\n\t\t\t\t\t\t\tbreak\n\t\t\t\t\t\t}\n\t\t\t\t\t\tcurrentTurnHasTool = true\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn {\n\t\t\t\t\tpassed: !multipleInOneTurn,\n\t\t\t\t\treason: multipleInOneTurn\n\t\t\t\t\t\t? 
\"Agent issued multiple tool_use events within one turn (parallel calls).\"\n\t\t\t\t\t\t: undefined,\n\t\t\t\t}\n\t\t\t},\n\t\t},\n\t],\n}\n\n// =====================================================================\n// Format compliance \u2014 editing_files.ts cuts can break SEARCH/REPLACE\n// =====================================================================\n\nexport const usesEditFileForTargetedChange: Eval = {\n\tname: \"uses_edit_file_for_targeted_change\",\n\tdescription:\n\t\t\"Given a small targeted change, the agent picks edit_file (replace_in_file family), not write_file.\",\n\tinput:\n\t\t\"In packages/postqode-ai/package.json, change the value of the `version` field from \\\"0.1.0\\\" to \\\"0.1.1\\\". Show me the diff after.\",\n\tmaxTurns: 6,\n\tdropTools: [\"bash\"], // prevents `sed -i` shortcut; forces tool selection\n\trules: [\n\t\ttoolUsed(\"edit_file\"),\n\t\ttoolCalledAtMost(\"write_file\", 0),\n\t\tstopReasonIs(\"completed\"),\n\t],\n\tslow: true, // marked slow because it mutates state \u2014 runner can skip in CI\n}\n\n// =====================================================================\n// Subagent restraint \u2014 cli_subagents.ts cuts can flip behavior\n// =====================================================================\n\nexport const doesNotSpawnSubagentForSimpleTask: Eval = {\n\tname: \"does_not_spawn_subagent_for_simple_task\",\n\tdescription:\n\t\t\"For a single-file lookup, the agent does NOT delegate to a subagent.\",\n\tinput: \"What does the function `getSystemPrompt` in `packages/postqode-prompts/src/index.ts` do? 
Read it and explain in one sentence.\",\n\tmaxTurns: 5,\n\trules: [\n\t\ttoolUsed(\"read_file\"),\n\t\ttoolCalledAtMost(\"bash\", 1), // subagent invocation goes through bash; allow 0-1\n\t\tstopReasonIs(\"completed\"),\n\t],\n}\n\n// =====================================================================\n// Aggregate\n// =====================================================================\n\n/**\n * Default suite of prompt-behavior evals. Skip slow ones in CI by leaving\n * `runSlow: false` on the runner config.\n */\nexport const promptBehaviorEvals: readonly Eval[] = [\n\tpicksGrepForRegexSearch,\n\tpicksReadFileForKnownPath,\n\tpicksListFilesForDirectoryOverview,\n\tdoesNotEndWithQuestion,\n\toneToolPerTurn,\n\tusesEditFileForTargetedChange,\n\tdoesNotSpawnSubagentForSimpleTask,\n]\n"],
5
+ "mappings": "0VACA,MAAA,QAAA,QAAA,UAAA,EAwBa,QAAA,wBAAgC,CAC5C,KAAM,8BACN,YAAa,8EACb,MACC,uFACD,SAAU,EACV,MAAO,IACN,QAAA,UAAS,MAAM,KAIf,QAAA,kBAAiB,YAAa,CAAC,KAC/B,QAAA,cAAa,WAAW,IAIb,QAAA,0BAAkC,CAC9C,KAAM,iCACN,YACC,+EACD,MACC,0EACD,SAAU,EACV,MAAO,IACN,QAAA,UAAS,WAAW,KACpB,QAAA,kBAAiB,OAAQ,CAAC,KAC1B,QAAA,iBAAgB,UAAU,KAC1B,QAAA,cAAa,WAAW,IAIb,QAAA,mCAA2C,CACvD,KAAM,0CACN,YACC,+EACD,MACC,sEACD,SAAU,EACV,MAAO,IACN,QAAA,UAAS,YAAY,KACrB,QAAA,kBAAiB,YAAa,CAAC,KAC/B,QAAA,cAAa,WAAW,IAQb,QAAA,uBAA+B,CAC3C,KAAM,6BACN,YACC,gFACD,MAAO,uFACP,SAAU,EACV,MAAO,IACN,QAAA,cAAa,WAAW,EAIxB,CACC,KAAM,+BACN,MAAO,CAAC,CAAE,OAAAA,CAAM,IAAM,CACrB,MAAMC,EAAaD,EAAO,SAAS,OAAQE,GAAMA,EAAE,OAAS,WAAW,EACjEC,EAAOF,EAAWA,EAAW,OAAS,CAAC,EACvCG,EAAO,OAAOD,GAAM,SAAY,SAAWA,EAAK,QAAQ,KAAI,EAAK,GACjEE,EAAmBD,EAAK,SAAS,GAAG,EAC1C,MAAO,CACN,OAAQ,CAACC,EACT,OAAQA,EACL,wCAAwCD,EAAK,MAAM,IAAI,CAAC,IACxD,OAEL,KAKU,QAAA,eAAuB,CACnC,KAAM,oBACN,YACC,6EACD,MACC,iFACD,SAAU,EACV,MAAO,IACN,QAAA,UAAS,WAAW,KACpB,QAAA,cAAa,WAAW,EAKxB,CACC,KAAM,wCACN,MAAO,CAAC,CAAE,OAAAE,CAAM,IAAM,CACrB,IAAIC,EAAqB,GACrBC,EAAoB,GACxB,UAAWC,KAAKH,EAEf,GADIG,EAAE,OAAS,eAAcF,EAAqB,IAC9CE,EAAE,OAAS,WAAY,CAC1B,GAAIF,EAAoB,CACvBC,EAAoB,GACpB,KACD,CACAD,EAAqB,EACtB,CAED,MAAO,CACN,OAAQ,CAACC,EACT,OAAQA,EACL,0EACA,OAEL,KASU,QAAA,8BAAsC,CAClD,KAAM,qCACN,YACC,qGACD,MACC,iIACD,SAAU,EACV,UAAW,CAAC,MAAM,EAClB,MAAO,IACN,QAAA,UAAS,WAAW,KACpB,QAAA,kBAAiB,aAAc,CAAC,KAChC,QAAA,cAAa,WAAW,GAEzB,KAAM,IAOM,QAAA,kCAA0C,CACtD,KAAM,0CACN,YACC,uEACD,MAAO,gIACP,SAAU,EACV,MAAO,IACN,QAAA,UAAS,WAAW,KACpB,QAAA,kBAAiB,OAAQ,CAAC,KAC1B,QAAA,cAAa,WAAW,IAYb,QAAA,oBAAuC,CACnD,QAAA,wBACA,QAAA,0BACA,QAAA,mCACA,QAAA,uBACA,QAAA,eACA,QAAA,8BACA,QAAA",
6
+ "names": ["result", "assistants", "m", "last", "text", "endsWithQuestion", "events", "currentTurnHasTool", "multipleInOneTurn", "e"]
7
+ }
@@ -0,0 +1,8 @@
1
+ export { runEvals } from "./runner";
2
+ export { BudgetTracker } from "./budget";
3
+ export type { Eval, EvalContext, EvalResult, EvalRule, EvalRuleResult, EvalRunConfig, EvalRunSummary, } from "./types";
4
+ export * from "./rules";
5
+ export { baselineFixtures } from "./fixtures/basic";
6
+ export { planFixtures, runPlanFixtures } from "./fixtures/plan";
7
+ export type { AgentFactory } from "./runner";
8
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AA0BA,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAA;AACnC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAA;AACxC,YAAY,EACX,IAAI,EACJ,WAAW,EACX,UAAU,EACV,QAAQ,EACR,cAAc,EACd,aAAa,EACb,cAAc,GACd,MAAM,SAAS,CAAA;AAKhB,cAAc,SAAS,CAAA;AAIvB,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAA;AAI/D,YAAY,EAAE,YAAY,EAAE,MAAM,UAAU,CAAA"}
package/dist/index.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";var __createBinding=this&&this.__createBinding||(Object.create?(function(n,r,e,t){t===void 0&&(t=e);var u=Object.getOwnPropertyDescriptor(r,e);(!u||("get"in u?!r.__esModule:u.writable||u.configurable))&&(u={enumerable:!0,get:function(){return r[e]}}),Object.defineProperty(n,t,u)}):(function(n,r,e,t){t===void 0&&(t=e),n[t]=r[e]})),__exportStar=this&&this.__exportStar||function(n,r){for(var e in n)e!=="default"&&!Object.prototype.hasOwnProperty.call(r,e)&&__createBinding(r,n,e)};Object.defineProperty(exports,"__esModule",{value:!0}),exports.runPlanFixtures=exports.planFixtures=exports.baselineFixtures=exports.BudgetTracker=exports.runEvals=void 0;var runner_1=require("./runner");Object.defineProperty(exports,"runEvals",{enumerable:!0,get:function(){return runner_1.runEvals}});var budget_1=require("./budget");Object.defineProperty(exports,"BudgetTracker",{enumerable:!0,get:function(){return budget_1.BudgetTracker}}),__exportStar(require("./rules"),exports);var basic_1=require("./fixtures/basic");Object.defineProperty(exports,"baselineFixtures",{enumerable:!0,get:function(){return basic_1.baselineFixtures}});var plan_1=require("./fixtures/plan");Object.defineProperty(exports,"planFixtures",{enumerable:!0,get:function(){return plan_1.planFixtures}}),Object.defineProperty(exports,"runPlanFixtures",{enumerable:!0,get:function(){return plan_1.runPlanFixtures}});
2
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../src/index.ts"],
4
+ "sourcesContent": ["// @postqode/evals \u2014 real-LLM evaluation harness.\n//\n// Runs @postqode/coding-agent against a real Anthropic model and\n// evaluates each run against a list of rules. Designed for CI: skips\n// cleanly if ANTHROPIC_API_KEY isn't present, and enforces a strict\n// dollar-cost cap (default $10) so an eval misfire can't burn through\n// your budget.\n//\n// Usage:\n// ```ts\n// import { runEvals } from \"@postqode/evals\"\n// import { toolUsed, contentIncludes } from \"@postqode/evals/rules\"\n//\n// const summary = await runEvals([\n// {\n// name: \"reads_file_before_summarizing\",\n// description: \"Given a file path, the agent reads it before answering.\",\n// input: \"Summarize the package.json in this directory.\",\n// rules: [toolUsed(\"read_file\"), contentIncludes(\"postqode\")],\n// },\n// ], {\n// budgetUsd: 2, // tighter than the $10 default\n// onEvalComplete: (r) => console.log(`${r.passed ? \"\u2713\" : \"\u2717\"} ${r.eval.name}`),\n// onSkip: (why) => console.log(why),\n// })\n// ```\nexport { runEvals } from \"./runner\"\nexport { BudgetTracker } from \"./budget\"\nexport type {\n\tEval,\n\tEvalContext,\n\tEvalResult,\n\tEvalRule,\n\tEvalRuleResult,\n\tEvalRunConfig,\n\tEvalRunSummary,\n} from \"./types\"\n\n// Rule builders re-exported at the top level so the single-import pattern\n// works. Consumers who want just the rules can also use\n// `import { toolUsed, contentIncludes } from \"@postqode/evals/rules\"`.\nexport * from \"./rules\"\n\n// Fixture evals ship with the package so consumers can cherry-pick them or\n// extend with their own.\nexport { baselineFixtures } from \"./fixtures/basic\"\nexport { planFixtures, runPlanFixtures } from \"./fixtures/plan\"\n\n// Runner-level utility type for consumers who want to plug in a custom\n// agent factory (e.g. a different harness, a test harness, a subagent).\nexport type { AgentFactory } from \"./runner\"\n"],
5
+ "mappings": "0pBA0BA,IAAA,SAAA,QAAA,UAAA,EAAS,OAAA,eAAA,QAAA,WAAA,CAAA,WAAA,GAAA,IAAA,UAAA,CAAA,OAAA,SAAA,QAAQ,CAAA,CAAA,EACjB,IAAA,SAAA,QAAA,UAAA,EAAS,OAAA,eAAA,QAAA,gBAAA,CAAA,WAAA,GAAA,IAAA,UAAA,CAAA,OAAA,SAAA,aAAa,CAAA,CAAA,EActB,aAAA,QAAA,SAAA,EAAA,OAAA,EAIA,IAAA,QAAA,QAAA,kBAAA,EAAS,OAAA,eAAA,QAAA,mBAAA,CAAA,WAAA,GAAA,IAAA,UAAA,CAAA,OAAA,QAAA,gBAAgB,CAAA,CAAA,EACzB,IAAA,OAAA,QAAA,iBAAA,EAAS,OAAA,eAAA,QAAA,eAAA,CAAA,WAAA,GAAA,IAAA,UAAA,CAAA,OAAA,OAAA,YAAY,CAAA,CAAA,EAAE,OAAA,eAAA,QAAA,kBAAA,CAAA,WAAA,GAAA,IAAA,UAAA,CAAA,OAAA,OAAA,eAAe,CAAA,CAAA",
6
+ "names": []
7
+ }
@@ -0,0 +1,20 @@
1
+ import type { EvalRule } from "./types";
2
+ /**
3
+ * Rule: the agent called a specific tool at least once during the run.
4
+ *
5
+ * ```ts
6
+ * rules: [toolUsed("read_file"), contentIncludes("TODO")]
7
+ * ```
8
+ */
9
+ export declare function toolUsed(name: string): EvalRule;
10
+ /** Rule: the agent called `name` at most `max` times. */
11
+ export declare function toolCalledAtMost(name: string, max: number): EvalRule;
12
+ /** Rule: the final assistant message contains `substring` (case-insensitive). */
13
+ export declare function contentIncludes(substring: string): EvalRule;
14
+ /** Rule: the run ended with the given stop reason (completed / aborted / max_turns / error). */
15
+ export declare function stopReasonIs(reason: "completed" | "aborted" | "max_turns" | "error"): EvalRule;
16
+ /** Rule: total turns used did not exceed `max`. */
17
+ export declare function turnsAtMost(max: number): EvalRule;
18
+ /** Rule: at least one `tool_result` event reported no error. */
19
+ export declare function hadSuccessfulToolResult(): EvalRule;
20
+ //# sourceMappingURL=rules.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rules.d.ts","sourceRoot":"","sources":["../src/rules.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAA;AAEvC;;;;;;GAMG;AACH,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ,CAW/C;AAED,yDAAyD;AACzD,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,QAAQ,CAcpE;AAED,iFAAiF;AACjF,wBAAgB,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,QAAQ,CAiB3D;AAED,gGAAgG;AAChG,wBAAgB,YAAY,CAAC,MAAM,EAAE,WAAW,GAAG,SAAS,GAAG,WAAW,GAAG,OAAO,GAAG,QAAQ,CAW9F;AAED,mDAAmD;AACnD,wBAAgB,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,QAAQ,CAQjD;AAED,gEAAgE;AAChE,wBAAgB,uBAAuB,IAAI,QAAQ,CAalD"}
package/dist/rules.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.toolUsed=toolUsed,exports.toolCalledAtMost=toolCalledAtMost,exports.contentIncludes=contentIncludes,exports.stopReasonIs=stopReasonIs,exports.turnsAtMost=turnsAtMost,exports.hadSuccessfulToolResult=hadSuccessfulToolResult;function toolUsed(t){return{name:`tool_used(${t})`,check:({events:e})=>{const s=e.some(n=>n.type==="tool_use"&&n.name===t);return{passed:s,reason:s?void 0:`Agent never called \`${t}\`.`}}}}function toolCalledAtMost(t,e){return{name:`tool_called_at_most(${t}, ${e})`,check:({events:s})=>{const n=s.filter(o=>o.type==="tool_use"&&o.name===t).length;return{passed:n<=e,reason:n<=e?void 0:`Agent called \`${t}\` ${n} times (max ${e}).`}}}}function contentIncludes(t){return{name:`content_includes(${truncate(t,40)})`,check:({result:e})=>{const s=t.toLowerCase(),n=e.messages.filter(c=>c.role==="assistant"),o=n[n.length-1],r=typeof o?.content=="string"?o.content:"",u=r.toLowerCase().includes(s);return{passed:u,reason:u?void 0:`Final assistant message did not contain "${truncate(t,60)}". Got: "${truncate(r,200)}".`}}}}function stopReasonIs(t){return{name:`stop_reason(${t})`,check:({result:e})=>({passed:e.stopReason===t,reason:e.stopReason===t?void 0:`Expected stopReason=${t}, got ${e.stopReason}.`})}}function turnsAtMost(t){return{name:`turns_at_most(${t})`,check:({result:e})=>({passed:e.turns<=t,reason:e.turns<=t?void 0:`Used ${e.turns} turns (max ${t}).`})}}function hadSuccessfulToolResult(){return{name:"had_successful_tool_result",check:({events:t})=>{const e=t.some(s=>s.type==="tool_result"&&!s.output.isError&&!s.error);return{passed:e,reason:e?void 0:"No tool_result event with a non-error output was emitted."}}}}function truncate(t,e){return t.length<=e?t:t.slice(0,e-1)+"\u2026"}
2
+ //# sourceMappingURL=rules.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../src/rules.ts"],
4
+ "sourcesContent": ["import type { AgentEvent } from \"@postqode/agent\"\n\nimport type { EvalRule } from \"./types\"\n\n/**\n * Rule: the agent called a specific tool at least once during the run.\n *\n * ```ts\n * rules: [toolUsed(\"read_file\"), contentIncludes(\"TODO\")]\n * ```\n */\nexport function toolUsed(name: string): EvalRule {\n\treturn {\n\t\tname: `tool_used(${name})`,\n\t\tcheck: ({ events }) => {\n\t\t\tconst used = events.some((e) => e.type === \"tool_use\" && e.name === name)\n\t\t\treturn {\n\t\t\t\tpassed: used,\n\t\t\t\treason: used ? undefined : `Agent never called \\`${name}\\`.`,\n\t\t\t}\n\t\t},\n\t}\n}\n\n/** Rule: the agent called `name` at most `max` times. */\nexport function toolCalledAtMost(name: string, max: number): EvalRule {\n\treturn {\n\t\tname: `tool_called_at_most(${name}, ${max})`,\n\t\tcheck: ({ events }) => {\n\t\t\tconst count = events.filter((e) => e.type === \"tool_use\" && e.name === name).length\n\t\t\treturn {\n\t\t\t\tpassed: count <= max,\n\t\t\t\treason:\n\t\t\t\t\tcount <= max\n\t\t\t\t\t\t? undefined\n\t\t\t\t\t\t: `Agent called \\`${name}\\` ${count} times (max ${max}).`,\n\t\t\t}\n\t\t},\n\t}\n}\n\n/** Rule: the final assistant message contains `substring` (case-insensitive). */\nexport function contentIncludes(substring: string): EvalRule {\n\treturn {\n\t\tname: `content_includes(${truncate(substring, 40)})`,\n\t\tcheck: ({ result }) => {\n\t\t\tconst needle = substring.toLowerCase()\n\t\t\tconst assistants = result.messages.filter((m) => m.role === \"assistant\")\n\t\t\tconst lastAssistant = assistants[assistants.length - 1]\n\t\t\tconst content = typeof lastAssistant?.content === \"string\" ? lastAssistant.content : \"\"\n\t\t\tconst found = content.toLowerCase().includes(needle)\n\t\t\treturn {\n\t\t\t\tpassed: found,\n\t\t\t\treason: found\n\t\t\t\t\t? undefined\n\t\t\t\t\t: `Final assistant message did not contain \"${truncate(substring, 60)}\". 
Got: \"${truncate(content, 200)}\".`,\n\t\t\t}\n\t\t},\n\t}\n}\n\n/** Rule: the run ended with the given stop reason (completed / aborted / max_turns / error). */\nexport function stopReasonIs(reason: \"completed\" | \"aborted\" | \"max_turns\" | \"error\"): EvalRule {\n\treturn {\n\t\tname: `stop_reason(${reason})`,\n\t\tcheck: ({ result }) => ({\n\t\t\tpassed: result.stopReason === reason,\n\t\t\treason:\n\t\t\t\tresult.stopReason === reason\n\t\t\t\t\t? undefined\n\t\t\t\t\t: `Expected stopReason=${reason}, got ${result.stopReason}.`,\n\t\t}),\n\t}\n}\n\n/** Rule: total turns used did not exceed `max`. */\nexport function turnsAtMost(max: number): EvalRule {\n\treturn {\n\t\tname: `turns_at_most(${max})`,\n\t\tcheck: ({ result }) => ({\n\t\t\tpassed: result.turns <= max,\n\t\t\treason: result.turns <= max ? undefined : `Used ${result.turns} turns (max ${max}).`,\n\t\t}),\n\t}\n}\n\n/** Rule: at least one `tool_result` event reported no error. */\nexport function hadSuccessfulToolResult(): EvalRule {\n\treturn {\n\t\tname: \"had_successful_tool_result\",\n\t\tcheck: ({ events }) => {\n\t\t\tconst ok = events.some(\n\t\t\t\t(e: AgentEvent) => e.type === \"tool_result\" && !e.output.isError && !e.error,\n\t\t\t)\n\t\t\treturn {\n\t\t\t\tpassed: ok,\n\t\t\t\treason: ok ? undefined : \"No tool_result event with a non-error output was emitted.\",\n\t\t\t}\n\t\t},\n\t}\n}\n\nfunction truncate(s: string, n: number): string {\n\treturn s.length <= n ? s : s.slice(0, n - 1) + \"\u2026\"\n}\n"],
5
+ "mappings": "oEAWA,QAAA,SAAA,SAcA,QAAA,iBAAA,iBAiBA,QAAA,gBAAA,gBAoBA,QAAA,aAAA,aAcA,QAAA,YAAA,YAWA,QAAA,wBAAA,wBA5EA,SAAgB,SAASA,EAAY,CACpC,MAAO,CACN,KAAM,aAAaA,CAAI,IACvB,MAAO,CAAC,CAAE,OAAAC,CAAM,IAAM,CACrB,MAAMC,EAAOD,EAAO,KAAME,GAAMA,EAAE,OAAS,YAAcA,EAAE,OAASH,CAAI,EACxE,MAAO,CACN,OAAQE,EACR,OAAQA,EAAO,OAAY,wBAAwBF,CAAI,MAEzD,EAEF,CAGA,SAAgB,iBAAiBA,EAAcI,EAAW,CACzD,MAAO,CACN,KAAM,uBAAuBJ,CAAI,KAAKI,CAAG,IACzC,MAAO,CAAC,CAAE,OAAAH,CAAM,IAAM,CACrB,MAAMI,EAAQJ,EAAO,OAAQE,GAAMA,EAAE,OAAS,YAAcA,EAAE,OAASH,CAAI,EAAE,OAC7E,MAAO,CACN,OAAQK,GAASD,EACjB,OACCC,GAASD,EACN,OACA,kBAAkBJ,CAAI,MAAMK,CAAK,eAAeD,CAAG,KAEzD,EAEF,CAGA,SAAgB,gBAAgBE,EAAiB,CAChD,MAAO,CACN,KAAM,oBAAoB,SAASA,EAAW,EAAE,CAAC,IACjD,MAAO,CAAC,CAAE,OAAAC,CAAM,IAAM,CACrB,MAAMC,EAASF,EAAU,YAAW,EAC9BG,EAAaF,EAAO,SAAS,OAAQG,GAAMA,EAAE,OAAS,WAAW,EACjEC,EAAgBF,EAAWA,EAAW,OAAS,CAAC,EAChDG,EAAU,OAAOD,GAAe,SAAY,SAAWA,EAAc,QAAU,GAC/EE,EAAQD,EAAQ,YAAW,EAAG,SAASJ,CAAM,EACnD,MAAO,CACN,OAAQK,EACR,OAAQA,EACL,OACA,4CAA4C,SAASP,EAAW,EAAE,CAAC,YAAY,SAASM,EAAS,GAAG,CAAC,KAE1G,EAEF,CAGA,SAAgB,aAAaE,EAAuD,CACnF,MAAO,CACN,KAAM,eAAeA,CAAM,IAC3B,MAAO,CAAC,CAAE,OAAAP,CAAM,KAAQ,CACvB,OAAQA,EAAO,aAAeO,EAC9B,OACCP,EAAO,aAAeO,EACnB,OACA,uBAAuBA,CAAM,SAASP,EAAO,UAAU,MAG9D,CAGA,SAAgB,YAAYH,EAAW,CACtC,MAAO,CACN,KAAM,iBAAiBA,CAAG,IAC1B,MAAO,CAAC,CAAE,OAAAG,CAAM,KAAQ,CACvB,OAAQA,EAAO,OAASH,EACxB,OAAQG,EAAO,OAASH,EAAM,OAAY,QAAQG,EAAO,KAAK,eAAeH,CAAG,OAGnF,CAGA,SAAgB,yBAAuB,CACtC,MAAO,CACN,KAAM,6BACN,MAAO,CAAC,CAAE,OAAAH,CAAM,IAAM,CACrB,MAAMc,EAAKd,EAAO,KAChBE,GAAkBA,EAAE,OAAS,eAAiB,CAACA,EAAE,OAAO,SAAW,CAACA,EAAE,KAAK,EAE7E,MAAO,CACN,OAAQY,EACR,OAAQA,EAAK,OAAY,4DAE3B,EAEF,CAEA,SAAS,SAASC,EAAWC,EAAS,CACrC,OAAOD,EAAE,QAAUC,EAAID,EAAIA,EAAE,MAAM,EAAGC,EAAI,CAAC,EAAI,QAChD",
6
+ "names": ["name", "events", "used", "e", "max", "count", "substring", "result", "needle", "assistants", "m", "lastAssistant", "content", "found", "reason", "ok", "s", "n"]
7
+ }
@@ -0,0 +1,52 @@
1
+ import type { Agent } from "@postqode/agent";
2
+ import type { CodingAgentConfig } from "@postqode/coding-agent";
3
+ /**
4
+ * Factory that builds the Agent instance an eval runs against. Defaults
5
+ * to createCodingAgent; fixtures that need a different agent shape
6
+ * (e.g. plan mode) override via {@link EvalRunConfig.agentFactory}.
7
+ */
8
+ export type AgentFactory = (config: CodingAgentConfig) => Agent;
9
+ import type { Eval, EvalRunConfig, EvalRunSummary } from "./types";
10
+ /**
11
+ * Optional base URL override (for OpenAI-compatible endpoints like
12
+ * Canopywave, LiteLLM proxies, etc.). Only applied to providers where
13
+ * runEvals forwards it to the handler — currently "anthropic" and "openai".
14
+ */
15
+ export interface BaseUrlCapableConfig {
16
+ baseUrl?: string;
17
+ }
18
+ /**
19
+ * Run a list of evals against a real LLM.
20
+ *
21
+ * Defaults to Anthropic; pick a different provider via `config.provider`.
22
+ *
23
+ * ```ts
24
+ * import { runEvals } from "@postqode/evals"
25
+ * import { toolUsed, contentIncludes } from "@postqode/evals/rules"
26
+ *
27
+ * // Anthropic direct (default):
28
+ * await runEvals(evals)
29
+ *
30
+ * // Via OpenRouter:
31
+ * await runEvals(evals, {
32
+ * provider: "openrouter",
33
+ * apiKey: process.env.OPENROUTER_API_KEY,
34
+ * model: "anthropic/claude-sonnet-4.5",
35
+ * })
36
+ *
37
+ * // Raw provider options (any registered provider):
38
+ * await runEvals(evals, {
39
+ * provider: "openai-native",
40
+ * providerOptions: { openAiNativeApiKey: key, agentModeApiModelId: "gpt-4o",
41
+ * planModeApiModelId: "gpt-4o" },
42
+ * })
43
+ * ```
44
+ *
45
+ * Budget cap is strict: the runner checks accumulated cost before and after each
46
+ * eval. If exceeded, remaining evals are skipped and
47
+ * `budgetExceeded: true` is returned.
48
+ *
49
+ * Missing API key and no `providerOptions` → `skipped: true`, no evals run, `onSkip` fires once.
50
+ */
51
+ export declare function runEvals(evals: readonly Eval[], config?: EvalRunConfig): Promise<EvalRunSummary>;
52
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,KAAK,EAAkC,MAAM,iBAAiB,CAAA;AAC5E,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAA;AAG/D;;;;GAIG;AACH,MAAM,MAAM,YAAY,GAAG,CAAC,MAAM,EAAE,iBAAiB,KAAK,KAAK,CAAA;AAG/D,OAAO,KAAK,EACX,IAAI,EAGJ,aAAa,EACb,cAAc,EACd,MAAM,SAAS,CAAA;AAUhB;;;;GAIG;AACH,MAAM,WAAW,oBAAoB;IACpC,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB;AA+FD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAsB,QAAQ,CAC7B,KAAK,EAAE,SAAS,IAAI,EAAE,EACtB,MAAM,GAAE,aAAkB,GACxB,OAAO,CAAC,cAAc,CAAC,CA+EzB"}
package/dist/runner.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.runEvals=runEvals;const ai_1=require("@postqode/ai"),coding_agent_1=require("@postqode/coding-agent"),budget_1=require("./budget"),DEFAULT_PROVIDER="anthropic",DEFAULT_MODEL_BY_PROVIDER={anthropic:"claude-sonnet-4-5-20250929",openrouter:"anthropic/claude-sonnet-4.5"},DEFAULT_BUDGET_USD=10,DEFAULT_MAX_TURNS=30;function buildProviderOptions(r,e,t,s){switch(r){case"anthropic":return{apiKey:e,anthropicBaseUrl:s,agentModeApiModelId:t,planModeApiModelId:t};case"openrouter":return{openRouterApiKey:e,agentModeOpenRouterModelId:t,planModeOpenRouterModelId:t};case"openai-native":return{openAiNativeApiKey:e,agentModeApiModelId:t,planModeApiModelId:t};case"openai":return{openAiApiKey:e,openAiBaseUrl:s,agentModeOpenAiModelId:t,planModeOpenAiModelId:t};case"gemini":return{geminiApiKey:e,agentModeApiModelId:t,planModeApiModelId:t};default:throw new Error(`runEvals: no built-in option mapping for provider "${r}". Pass EvalRunConfig.providerOptions with the raw factory options instead.`)}}function toolFormatFor(r){switch(r){case"openai":case"openai-native":case"gemini":return"openai";default:return"anthropic"}}function envVarFor(r){switch(r){case"anthropic":return"ANTHROPIC_API_KEY";case"openrouter":return"OPENROUTER_API_KEY";case"openai-native":return"OPENAI_API_KEY";case"gemini":return"GEMINI_API_KEY";default:return"EVAL_API_KEY"}}async function runEvals(r,e={}){const t=e.provider??DEFAULT_PROVIDER,s=envVarFor(t),i=e.apiKey??process.env[s];if(!i&&!e.providerOptions){const o=`${s} is not set \u2014 skipping eval run. 
Export the key or pass EvalRunConfig.apiKey / providerOptions.`;return e.onSkip?.(o),{results:[],totalUsage:{inputTokens:0,outputTokens:0},budgetExceeded:!1,skipped:!0,durationMs:0}}const E=e.model??DEFAULT_MODEL_BY_PROVIDER[t]??DEFAULT_MODEL_BY_PROVIDER[DEFAULT_PROVIDER],v=e.providerOptions??buildProviderOptions(t,i,E,e.baseUrl),a=new budget_1.BudgetTracker(e.budgetUsd??DEFAULT_BUDGET_USD),p=e.maxTurns??DEFAULT_MAX_TURNS,d=e.runSlow??!1,g=e.agentFactory??coding_agent_1.createCodingAgent,u=[],l=Date.now();let n=!1;for(const o of r){if(o.slow&&!d){const A={eval:o,passed:!0,rules:[],usage:{inputTokens:0,outputTokens:0},durationMs:0,skipped:!0};u.push(A),e.onEvalComplete?.(A);continue}if(a.wouldExceed()){n=!0;break}const c=await runOneEval(o,{provider:t,providerOptions:v,defaultMaxTurns:p,agentFactory:g,toolFormat:toolFormatFor(t)});if(a.add(c.usage),u.push(c),e.onEvalComplete?.(c),a.wouldExceed()){n=!0;break}}return{results:u,totalUsage:a.totals,budgetExceeded:n,skipped:!1,durationMs:Date.now()-l}}async function runOneEval(r,e){const t=Date.now(),s=[],i=(0,ai_1.getProvider)(e.provider);if(!i)throw new ai_1.UnknownProviderError(e.provider);const E=i(e.providerOptions,"agent"),v={onEvent:n=>s.push(n)},a=e.agentFactory({api:E,extraTools:r.extraTools,dropTools:r.dropTools,systemPromptExtras:r.systemPromptExtras,maxTurns:r.maxTurns??e.defaultMaxTurns,toolFormat:e.toolFormat,hooks:v});let p,d;try{p=await a.run(r.input)}catch(n){return d=n instanceof Error?n:new Error(String(n)),{eval:r,passed:!1,rules:[],usage:{inputTokens:0,outputTokens:0},durationMs:Date.now()-t,error:{message:d.message,name:d.name}}}const g={input:r.input,events:s,result:p},u=s.find(n=>n.type==="error"),l=await Promise.all(r.rules.map(async n=>{try{const{passed:o,reason:c}=await n.check(g);return{name:n.name,passed:o,reason:c}}catch(o){return{name:n.name,passed:!1,reason:`Rule threw: ${o instanceof 
Error?o.message:String(o)}`}}}));return{eval:r,passed:l.every(n=>n.passed),rules:l,usage:p.usage,durationMs:Date.now()-t,error:u?u.error:void 0}}
2
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../src/runner.ts"],
4
+ "sourcesContent": ["import { getProvider, UnknownProviderError } from \"@postqode/ai\"\nimport type { Agent, AgentEvent, AgentResult, Hooks } from \"@postqode/agent\"\nimport type { CodingAgentConfig } from \"@postqode/coding-agent\"\nimport { createCodingAgent } from \"@postqode/coding-agent\"\n\n/**\n * Factory that builds the Agent instance an eval runs against. Defaults\n * to createCodingAgent; fixtures that need a different agent shape\n * (e.g. plan mode) override via {@link EvalRunConfig.agentFactory}.\n */\nexport type AgentFactory = (config: CodingAgentConfig) => Agent\n\nimport { BudgetTracker } from \"./budget\"\nimport type {\n\tEval,\n\tEvalContext,\n\tEvalResult,\n\tEvalRunConfig,\n\tEvalRunSummary,\n} from \"./types\"\n\nconst DEFAULT_PROVIDER = \"anthropic\"\nconst DEFAULT_MODEL_BY_PROVIDER: Record<string, string> = {\n\tanthropic: \"claude-sonnet-4-5-20250929\",\n\topenrouter: \"anthropic/claude-sonnet-4.5\",\n}\nconst DEFAULT_BUDGET_USD = 10\nconst DEFAULT_MAX_TURNS = 30\n\n/**\n * Optional base URL override (for OpenAI-compatible endpoints like\n * Canopywave, LiteLLM proxies, etc.). Only applied to providers where\n * the handler accepts a custom base URL \u2014 currently \"openai\".\n */\nexport interface BaseUrlCapableConfig {\n\tbaseUrl?: string\n}\n\n/**\n * Build the per-provider option object the registered factory expects.\n *\n * Each branch mirrors the exact field set the provider's registration in\n * @postqode/ai's builtins.ts reads. 
If a provider isn't listed here,\n * consumers can still use it via `EvalRunConfig.providerOptions` (raw\n * passthrough).\n */\nfunction buildProviderOptions(\n\tprovider: string,\n\tapiKey: string,\n\tmodel: string,\n\tbaseUrl: string | undefined,\n): Record<string, unknown> {\n\tswitch (provider) {\n\t\tcase \"anthropic\":\n\t\t\treturn {\n\t\t\t\tapiKey,\n\t\t\t\tanthropicBaseUrl: baseUrl,\n\t\t\t\tagentModeApiModelId: model,\n\t\t\t\tplanModeApiModelId: model,\n\t\t\t}\n\t\tcase \"openrouter\":\n\t\t\treturn {\n\t\t\t\topenRouterApiKey: apiKey,\n\t\t\t\tagentModeOpenRouterModelId: model,\n\t\t\t\tplanModeOpenRouterModelId: model,\n\t\t\t}\n\t\tcase \"openai-native\":\n\t\t\treturn {\n\t\t\t\topenAiNativeApiKey: apiKey,\n\t\t\t\tagentModeApiModelId: model,\n\t\t\t\tplanModeApiModelId: model,\n\t\t\t}\n\t\tcase \"openai\":\n\t\t\t// OpenAI-compatible generic endpoint. Used for Canopywave,\n\t\t\t// LiteLLM proxies, local inference servers, etc. Requires a\n\t\t\t// base URL; the handler reads openAiBaseUrl at construction.\n\t\t\treturn {\n\t\t\t\topenAiApiKey: apiKey,\n\t\t\t\topenAiBaseUrl: baseUrl,\n\t\t\t\tagentModeOpenAiModelId: model,\n\t\t\t\tplanModeOpenAiModelId: model,\n\t\t\t}\n\t\tcase \"gemini\":\n\t\t\treturn {\n\t\t\t\tgeminiApiKey: apiKey,\n\t\t\t\tagentModeApiModelId: model,\n\t\t\t\tplanModeApiModelId: model,\n\t\t\t}\n\t\tdefault:\n\t\t\t// Unknown provider \u2014 ask the caller to supply providerOptions directly.\n\t\t\tthrow new Error(\n\t\t\t\t`runEvals: no built-in option mapping for provider \"${provider}\". ` +\n\t\t\t\t\t\"Pass EvalRunConfig.providerOptions with the raw factory options instead.\",\n\t\t\t)\n\t}\n}\n\n/**\n * Pick the on-the-wire tool format that matches the provider's API. 
The\n * agent converts Tool[] \u2192 either Anthropic shape (`{name, description,\n * input_schema}`) or OpenAI shape (`{type: \"function\", function: {\u2026}}`).\n * Canopywave, OpenAI, OpenAI-native, and Gemini all speak OpenAI's\n * ChatCompletion-style tool payload; everything else we register by\n * default is Anthropic-family.\n */\nfunction toolFormatFor(provider: string): \"anthropic\" | \"openai\" {\n\tswitch (provider) {\n\t\tcase \"openai\":\n\t\tcase \"openai-native\":\n\t\tcase \"gemini\":\n\t\t\treturn \"openai\"\n\t\tdefault:\n\t\t\treturn \"anthropic\"\n\t}\n}\n\nfunction envVarFor(provider: string): string {\n\tswitch (provider) {\n\t\tcase \"anthropic\":\n\t\t\treturn \"ANTHROPIC_API_KEY\"\n\t\tcase \"openrouter\":\n\t\t\treturn \"OPENROUTER_API_KEY\"\n\t\tcase \"openai-native\":\n\t\t\treturn \"OPENAI_API_KEY\"\n\t\tcase \"gemini\":\n\t\t\treturn \"GEMINI_API_KEY\"\n\t\tdefault:\n\t\t\treturn \"EVAL_API_KEY\"\n\t}\n}\n\n/**\n * Run a list of evals against a real LLM.\n *\n * Defaults to Anthropic; pick a different provider via `config.provider`.\n *\n * ```ts\n * import { runEvals } from \"@postqode/evals\"\n * import { toolUsed, contentIncludes } from \"@postqode/evals/rules\"\n *\n * // Anthropic direct (default):\n * await runEvals(evals)\n *\n * // Via OpenRouter:\n * await runEvals(evals, {\n * provider: \"openrouter\",\n * apiKey: process.env.OPENROUTER_API_KEY,\n * model: \"anthropic/claude-sonnet-4.5\",\n * })\n *\n * // Raw provider options (any registered provider):\n * await runEvals(evals, {\n * provider: \"openai-native\",\n * providerOptions: { openAiNativeApiKey: key, agentModeApiModelId: \"gpt-4o\",\n * planModeApiModelId: \"gpt-4o\" },\n * })\n * ```\n *\n * Budget cap is strict: the runner checks accumulated cost BEFORE each\n * eval. 
If exceeded, remaining evals are skipped and\n * `budgetExceeded: true` is returned.\n *\n * Missing API key \u2192 `skipped: true`, no evals run, `onSkip` fires once.\n */\nexport async function runEvals(\n\tevals: readonly Eval[],\n\tconfig: EvalRunConfig = {},\n): Promise<EvalRunSummary> {\n\tconst provider = config.provider ?? DEFAULT_PROVIDER\n\tconst envVar = envVarFor(provider)\n\tconst apiKey = config.apiKey ?? process.env[envVar]\n\n\t// providerOptions lets consumers bypass our option builder entirely; in\n\t// that case the key is inside the options, so we don't need apiKey.\n\tif (!apiKey && !config.providerOptions) {\n\t\tconst reason = `${envVar} is not set \u2014 skipping eval run. Export the key or pass EvalRunConfig.apiKey / providerOptions.`\n\t\tconfig.onSkip?.(reason)\n\t\treturn {\n\t\t\tresults: [],\n\t\t\ttotalUsage: { inputTokens: 0, outputTokens: 0 },\n\t\t\tbudgetExceeded: false,\n\t\t\tskipped: true,\n\t\t\tdurationMs: 0,\n\t\t}\n\t}\n\n\tconst model = config.model ?? DEFAULT_MODEL_BY_PROVIDER[provider] ?? DEFAULT_MODEL_BY_PROVIDER[DEFAULT_PROVIDER]\n\tconst providerOptions =\n\t\tconfig.providerOptions ?? buildProviderOptions(provider, apiKey!, model!, config.baseUrl)\n\n\tconst budget = new BudgetTracker(config.budgetUsd ?? DEFAULT_BUDGET_USD)\n\tconst defaultMaxTurns = config.maxTurns ?? DEFAULT_MAX_TURNS\n\tconst runSlow = config.runSlow ?? false\n\t// createCodingAgent is overloaded (bounded \u2192 Agent, runtime:\"loop\" \u2192 LoopAgent).\n\t// Evals only use the bounded form; cast to the single-signature factory type.\n\tconst agentFactory: AgentFactory = config.agentFactory ?? 
(createCodingAgent as AgentFactory)\n\n\tconst results: EvalResult[] = []\n\tconst startedAt = Date.now()\n\tlet budgetExceeded = false\n\n\tfor (const e of evals) {\n\t\tif (e.slow && !runSlow) {\n\t\t\tconst skipped: EvalResult = {\n\t\t\t\teval: e,\n\t\t\t\tpassed: true,\n\t\t\t\trules: [],\n\t\t\t\tusage: { inputTokens: 0, outputTokens: 0 },\n\t\t\t\tdurationMs: 0,\n\t\t\t\tskipped: true,\n\t\t\t}\n\t\t\tresults.push(skipped)\n\t\t\tconfig.onEvalComplete?.(skipped)\n\t\t\tcontinue\n\t\t}\n\n\t\tif (budget.wouldExceed()) {\n\t\t\tbudgetExceeded = true\n\t\t\tbreak\n\t\t}\n\n\t\tconst result = await runOneEval(e, {\n\t\t\tprovider,\n\t\t\tproviderOptions,\n\t\t\tdefaultMaxTurns,\n\t\t\tagentFactory,\n\t\t\ttoolFormat: toolFormatFor(provider),\n\t\t})\n\t\tbudget.add(result.usage)\n\t\tresults.push(result)\n\t\tconfig.onEvalComplete?.(result)\n\n\t\tif (budget.wouldExceed()) {\n\t\t\tbudgetExceeded = true\n\t\t\t// Don't start the next eval \u2014 current one already accounted for.\n\t\t\tbreak\n\t\t}\n\t}\n\n\treturn {\n\t\tresults,\n\t\ttotalUsage: budget.totals,\n\t\tbudgetExceeded,\n\t\tskipped: false,\n\t\tdurationMs: Date.now() - startedAt,\n\t}\n}\n\ninterface RunOneOptions {\n\tprovider: string\n\tproviderOptions: Record<string, unknown>\n\tdefaultMaxTurns: number\n\tagentFactory: AgentFactory\n\ttoolFormat: \"anthropic\" | \"openai\"\n}\n\nasync function runOneEval(e: Eval, opts: RunOneOptions): Promise<EvalResult> {\n\tconst startedAt = Date.now()\n\tconst events: AgentEvent[] = []\n\n\t// Registry lookup keeps the evals package free of extension coupling \u2014\n\t// every provider that registers itself via @postqode/ai's builtins.ts is\n\t// available by name.\n\tconst providerFactory = getProvider(opts.provider)\n\tif (!providerFactory) {\n\t\tthrow new UnknownProviderError(opts.provider)\n\t}\n\tconst api = providerFactory(opts.providerOptions, \"agent\")\n\n\tconst hooks: Hooks = {\n\t\tonEvent: (event) => events.push(event),\n\t}\n\tconst agent = 
opts.agentFactory({\n\t\tapi,\n\t\textraTools: e.extraTools,\n\t\tdropTools: e.dropTools,\n\t\tsystemPromptExtras: e.systemPromptExtras,\n\t\tmaxTurns: e.maxTurns ?? opts.defaultMaxTurns,\n\t\ttoolFormat: opts.toolFormat,\n\t\thooks,\n\t})\n\n\tlet agentResult: AgentResult\n\tlet runError: Error | undefined\n\ttry {\n\t\tagentResult = await agent.run(e.input)\n\t} catch (err) {\n\t\trunError = err instanceof Error ? err : new Error(String(err))\n\t\treturn {\n\t\t\teval: e,\n\t\t\tpassed: false,\n\t\t\trules: [],\n\t\t\tusage: { inputTokens: 0, outputTokens: 0 },\n\t\t\tdurationMs: Date.now() - startedAt,\n\t\t\terror: { message: runError.message, name: runError.name },\n\t\t}\n\t}\n\n\tconst ctx: EvalContext = {\n\t\tinput: e.input,\n\t\tevents,\n\t\tresult: agentResult,\n\t}\n\n\t// If the agent's internal loop caught an error (stopReason = \"error\"),\n\t// the failure is reported via an `error` event inside runStream rather\n\t// than by throwing out of run(). Surface the first error event as\n\t// result.error so the CLI / reports can show why the eval failed\n\t// instead of just \"got error\" with no details.\n\tconst errorEvent = events.find((ev) => ev.type === \"error\") as\n\t\t| { type: \"error\"; error: { message: string; name: string } }\n\t\t| undefined\n\n\tconst ruleResults = await Promise.all(\n\t\te.rules.map(async (r) => {\n\t\t\ttry {\n\t\t\t\tconst { passed, reason } = await r.check(ctx)\n\t\t\t\treturn { name: r.name, passed, reason }\n\t\t\t} catch (err) {\n\t\t\t\treturn {\n\t\t\t\t\tname: r.name,\n\t\t\t\t\tpassed: false,\n\t\t\t\t\treason: `Rule threw: ${err instanceof Error ? err.message : String(err)}`,\n\t\t\t\t}\n\t\t\t}\n\t\t}),\n\t)\n\n\treturn {\n\t\teval: e,\n\t\tpassed: ruleResults.every((r) => r.passed),\n\t\trules: ruleResults,\n\t\tusage: agentResult.usage,\n\t\tdurationMs: Date.now() - startedAt,\n\t\terror: errorEvent ? errorEvent.error : undefined,\n\t}\n}\n"],
5
+ "mappings": "oEAoKA,QAAA,SAAA,SApKA,MAAA,KAAA,QAAA,cAAA,EAGA,eAAA,QAAA,wBAAA,EASA,SAAA,QAAA,UAAA,EASM,iBAAmB,YACnB,0BAAoD,CACzD,UAAW,6BACX,WAAY,+BAEP,mBAAqB,GACrB,kBAAoB,GAmB1B,SAAS,qBACRA,EACAC,EACAC,EACAC,EAA2B,CAE3B,OAAQH,EAAU,CACjB,IAAK,YACJ,MAAO,CACN,OAAAC,EACA,iBAAkBE,EAClB,oBAAqBD,EACrB,mBAAoBA,GAEtB,IAAK,aACJ,MAAO,CACN,iBAAkBD,EAClB,2BAA4BC,EAC5B,0BAA2BA,GAE7B,IAAK,gBACJ,MAAO,CACN,mBAAoBD,EACpB,oBAAqBC,EACrB,mBAAoBA,GAEtB,IAAK,SAIJ,MAAO,CACN,aAAcD,EACd,cAAeE,EACf,uBAAwBD,EACxB,sBAAuBA,GAEzB,IAAK,SACJ,MAAO,CACN,aAAcD,EACd,oBAAqBC,EACrB,mBAAoBA,GAEtB,QAEC,MAAM,IAAI,MACT,sDAAsDF,CAAQ,6EACa,CAE9E,CACD,CAUA,SAAS,cAAcA,EAAgB,CACtC,OAAQA,EAAU,CACjB,IAAK,SACL,IAAK,gBACL,IAAK,SACJ,MAAO,SACR,QACC,MAAO,WACT,CACD,CAEA,SAAS,UAAUA,EAAgB,CAClC,OAAQA,EAAU,CACjB,IAAK,YACJ,MAAO,oBACR,IAAK,aACJ,MAAO,qBACR,IAAK,gBACJ,MAAO,iBACR,IAAK,SACJ,MAAO,iBACR,QACC,MAAO,cACT,CACD,CAmCO,eAAe,SACrBI,EACAC,EAAwB,CAAA,EAAE,CAE1B,MAAML,EAAWK,EAAO,UAAY,iBAC9BC,EAAS,UAAUN,CAAQ,EAC3BC,EAASI,EAAO,QAAU,QAAQ,IAAIC,CAAM,EAIlD,GAAI,CAACL,GAAU,CAACI,EAAO,gBAAiB,CACvC,MAAME,EAAS,GAAGD,CAAM,uGACxB,OAAAD,EAAO,SAASE,CAAM,EACf,CACN,QAAS,CAAA,EACT,WAAY,CAAE,YAAa,EAAG,aAAc,CAAC,EAC7C,eAAgB,GAChB,QAAS,GACT,WAAY,EAEd,CAEA,MAAML,EAAQG,EAAO,OAAS,0BAA0BL,CAAQ,GAAK,0BAA0B,gBAAgB,EACzGQ,EACLH,EAAO,iBAAmB,qBAAqBL,EAAUC,EAASC,EAAQG,EAAO,OAAO,EAEnFI,EAAS,IAAI,SAAA,cAAcJ,EAAO,WAAa,kBAAkB,EACjEK,EAAkBL,EAAO,UAAY,kBACrCM,EAAUN,EAAO,SAAW,GAG5BO,EAA6BP,EAAO,cAAiB,eAAA,kBAErDQ,EAAwB,CAAA,EACxBC,EAAY,KAAK,IAAG,EAC1B,IAAIC,EAAiB,GAErB,UAAWC,KAAKZ,EAAO,CACtB,GAAIY,EAAE,MAAQ,CAACL,EAAS,CACvB,MAAMM,EAAsB,CAC3B,KAAMD,EACN,OAAQ,GACR,MAAO,CAAA,EACP,MAAO,CAAE,YAAa,EAAG,aAAc,CAAC,EACxC,WAAY,EACZ,QAAS,IAEVH,EAAQ,KAAKI,CAAO,EACpBZ,EAAO,iBAAiBY,CAAO,EAC/B,QACD,CAEA,GAAIR,EAAO,YAAW,EAAI,CACzBM,EAAiB,GACjB,KACD,CAEA,MAAMG,EAAS,MAAM,WAAWF,EAAG,CAClC,SAAAhB,EACA,gBAAAQ,EACA,gBAAAE,EACA,aAAAE,EACA,WAAY,cAAcZ,CAAQ,EAClC,EAKD,GAJAS,EAAO,IAAIS,EAAO,KAAK,EACvBL,EAAQ,KAAKK,CAAM,EACnBb,EAAO,iBAAiBa,CAAM,EAE1BT,EAAO,YAAW,EAAI,CACzBM,EAAiB,GAEjB,KAC
D,CACD,CAEA,MAAO,CACN,QAAAF,EACA,WAAYJ,EAAO,OACnB,eAAAM,EACA,QAAS,GACT,WAAY,KAAK,IAAG,EAAKD,EAE3B,CAUA,eAAe,WAAWE,EAASG,EAAmB,CACrD,MAAML,EAAY,KAAK,IAAG,EACpBM,EAAuB,CAAA,EAKvBC,KAAkB,KAAA,aAAYF,EAAK,QAAQ,EACjD,GAAI,CAACE,EACJ,MAAM,IAAI,KAAA,qBAAqBF,EAAK,QAAQ,EAE7C,MAAMG,EAAMD,EAAgBF,EAAK,gBAAiB,OAAO,EAEnDI,EAAe,CACpB,QAAUC,GAAUJ,EAAO,KAAKI,CAAK,GAEhCC,EAAQN,EAAK,aAAa,CAC/B,IAAAG,EACA,WAAYN,EAAE,WACd,UAAWA,EAAE,UACb,mBAAoBA,EAAE,mBACtB,SAAUA,EAAE,UAAYG,EAAK,gBAC7B,WAAYA,EAAK,WACjB,MAAAI,EACA,EAED,IAAIG,EACAC,EACJ,GAAI,CACHD,EAAc,MAAMD,EAAM,IAAIT,EAAE,KAAK,CACtC,OAASY,EAAK,CACb,OAAAD,EAAWC,aAAe,MAAQA,EAAM,IAAI,MAAM,OAAOA,CAAG,CAAC,EACtD,CACN,KAAMZ,EACN,OAAQ,GACR,MAAO,CAAA,EACP,MAAO,CAAE,YAAa,EAAG,aAAc,CAAC,EACxC,WAAY,KAAK,IAAG,EAAKF,EACzB,MAAO,CAAE,QAASa,EAAS,QAAS,KAAMA,EAAS,IAAI,EAEzD,CAEA,MAAME,EAAmB,CACxB,MAAOb,EAAE,MACT,OAAAI,EACA,OAAQM,GAQHI,EAAaV,EAAO,KAAMW,GAAOA,EAAG,OAAS,OAAO,EAIpDC,EAAc,MAAM,QAAQ,IACjChB,EAAE,MAAM,IAAI,MAAOiB,GAAK,CACvB,GAAI,CACH,KAAM,CAAE,OAAAC,EAAQ,OAAA3B,CAAM,EAAK,MAAM0B,EAAE,MAAMJ,CAAG,EAC5C,MAAO,CAAE,KAAMI,EAAE,KAAM,OAAAC,EAAQ,OAAA3B,CAAM,CACtC,OAASqB,EAAK,CACb,MAAO,CACN,KAAMK,EAAE,KACR,OAAQ,GACR,OAAQ,eAAeL,aAAe,MAAQA,EAAI,QAAU,OAAOA,CAAG,CAAC,GAEzE,CACD,CAAC,CAAC,EAGH,MAAO,CACN,KAAMZ,EACN,OAAQgB,EAAY,MAAOC,GAAMA,EAAE,MAAM,EACzC,MAAOD,EACP,MAAON,EAAY,MACnB,WAAY,KAAK,IAAG,EAAKZ,EACzB,MAAOgB,EAAaA,EAAW,MAAQ,OAEzC",
6
+ "names": ["provider", "apiKey", "model", "baseUrl", "evals", "config", "envVar", "reason", "providerOptions", "budget", "defaultMaxTurns", "runSlow", "agentFactory", "results", "startedAt", "budgetExceeded", "e", "skipped", "result", "opts", "events", "providerFactory", "api", "hooks", "event", "agent", "agentResult", "runError", "err", "ctx", "errorEvent", "ev", "ruleResults", "r", "passed"]
7
+ }
@@ -0,0 +1,147 @@
1
+ import type { AgentEvent, AgentResult, Tool, UsageTotals } from "@postqode/agent";
2
+ /**
3
+ * A single eval case. Describes the input the agent receives and the rules
4
+ * that determine whether the resulting run is a pass.
5
+ */
6
+ export interface Eval {
7
+ /** Short identifier used in reports. Must be unique within a run. */
8
+ readonly name: string;
9
+ /** Human-readable description shown when the eval runs. */
10
+ readonly description: string;
11
+ /** User-facing prompt seeded into the agent via agent.run(). */
12
+ readonly input: string;
13
+ /**
14
+ * Optional extra tools merged into the coding-agent's default pack.
15
+ * Use when the eval needs a capability the defaults don't cover
16
+ * (e.g. a mock HTTP tool, a spy tool that records calls).
17
+ */
18
+ readonly extraTools?: readonly Tool[];
19
+ /** Default tool names to drop for this eval. */
20
+ readonly dropTools?: readonly string[];
21
+ /** Extra text appended to the default coding system prompt for this eval. */
22
+ readonly systemPromptExtras?: string;
23
+ /** Hard cap on turns for this eval. Overrides the runner default. */
24
+ readonly maxTurns?: number;
25
+ /**
26
+ * Rules that inspect the events + final result. The eval passes only
27
+ * if every rule passes.
28
+ */
29
+ readonly rules: readonly EvalRule[];
30
+ /**
31
+ * If true, the eval is expected to be slow (large tokens, many turns).
32
+ * The runner can skip these in CI while the branch is under a small
33
+ * budget. Default false.
34
+ */
35
+ readonly slow?: boolean;
36
+ }
37
+ /** A single assertion against an agent run. */
38
+ export interface EvalRule {
39
+ /** Short identifier included in the result. */
40
+ readonly name: string;
41
+ check(ctx: EvalContext): EvalRuleResult | Promise<EvalRuleResult>;
42
+ }
43
+ /** What a rule sees when it runs. */
44
+ export interface EvalContext {
45
+ readonly input: string;
46
+ readonly events: readonly AgentEvent[];
47
+ readonly result: AgentResult;
48
+ }
49
+ export interface EvalRuleResult {
50
+ readonly passed: boolean;
51
+ readonly reason?: string;
52
+ }
53
+ /** Outcome of one eval. */
54
+ export interface EvalResult {
55
+ readonly eval: Eval;
56
+ readonly passed: boolean;
57
+ readonly rules: ReadonlyArray<{
58
+ name: string;
59
+ passed: boolean;
60
+ reason?: string;
61
+ }>;
62
+ readonly usage: UsageTotals;
63
+ readonly durationMs: number;
64
+ /** Populated if the agent run threw, or if the run emitted an `error` event (the first one is reported). */
65
+ readonly error?: {
66
+ message: string;
67
+ name: string;
68
+ };
69
+ /** True if the eval was skipped (e.g. slow + no-slow mode). */
70
+ readonly skipped?: boolean;
71
+ }
72
+ export interface EvalRunSummary {
73
+ readonly results: readonly EvalResult[];
74
+ readonly totalUsage: UsageTotals;
75
+ /** True if the runner stopped early because the budget cap was hit. */
76
+ readonly budgetExceeded: boolean;
77
+ /** True if the whole run was skipped (no API key, for instance). */
78
+ readonly skipped: boolean;
79
+ readonly durationMs: number;
80
+ }
81
+ export interface EvalRunConfig {
82
+ /**
83
+ * Provider name registered against @postqode/ai's registry. Defaults
84
+ * to "anthropic". Common choices: "anthropic" (direct), "openrouter"
85
+ * (access via OpenRouter; use a prefixed model id like
86
+ * "anthropic/claude-sonnet-4.5"), or any other built-in or custom
87
+ * provider.
88
+ */
89
+ provider?: string;
90
+ /**
91
+ * API key passed to the provider. Defaults to the env var matching
92
+ * the provider name:
93
+ * anthropic → ANTHROPIC_API_KEY
94
+ * openrouter → OPENROUTER_API_KEY
95
+ * openai-native → OPENAI_API_KEY, gemini → GEMINI_API_KEY; otherwise → EVAL_API_KEY
96
+ * Missing both config and env → runner skips cleanly.
97
+ */
98
+ apiKey?: string;
99
+ /**
100
+ * Model id. Format is provider-specific:
101
+ * anthropic → "claude-sonnet-4-5-20250929"
102
+ * openrouter → "anthropic/claude-sonnet-4.5" etc.
103
+ * Default is provider-specific; see the runner.
104
+ */
105
+ model?: string;
106
+ /**
107
+ * Base URL override. Applied to providers that accept a custom endpoint
108
+ * (currently: `anthropic` → anthropicBaseUrl, `openai` → openAiBaseUrl).
109
+ * Use this to target OpenAI-compatible endpoints such as Canopywave,
110
+ * LiteLLM proxies, or self-hosted inference servers.
111
+ *
112
+ * { provider: "openai", baseUrl: "https://inference.canopywave.io/v1" }
113
+ */
114
+ baseUrl?: string;
115
+ /**
116
+ * Escape hatch: if set, passed verbatim to the provider factory
117
+ * instead of the runner's built-in option builder. Use when the
118
+ * provider needs fields beyond apiKey + model (e.g. region, extra
119
+ * headers).
120
+ */
121
+ providerOptions?: Record<string, unknown>;
122
+ /**
123
+ * Cost ceiling in USD. The runner checks accumulated usage before
124
+ * each eval and stops (returns budgetExceeded: true) if the cap
125
+ * would be breached. Default: 10.
126
+ */
127
+ budgetUsd?: number;
128
+ /** Default max turns per eval when an eval doesn't set its own. Default 30. */
129
+ maxTurns?: number;
130
+ /** Include evals flagged as slow. Default false. */
131
+ runSlow?: boolean;
132
+ /** Fires after each eval completes — use for live progress output. */
133
+ onEvalComplete?: (result: EvalResult) => void;
134
+ /** Fires once when the runner decides to skip (no API key). */
135
+ onSkip?: (reason: string) => void;
136
+ /**
137
+ * Optional override of the Agent constructor. Defaults to
138
+ * `createCodingAgent` from @postqode/coding-agent. Pass
139
+ * `createPlanAgent` to run plan-mode fixtures; or a custom factory
140
+ * to target a different harness entirely.
141
+ *
142
+ * Typed against CodingAgentConfig so the runner can still pass
143
+ * extraTools / dropTools / systemPromptExtras from each Eval.
144
+ */
145
+ agentFactory?: import("./runner").AgentFactory;
146
+ }
147
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,IAAI,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAEjF;;;GAGG;AACH,MAAM,WAAW,IAAI;IACpB,qEAAqE;IACrE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;IAErB,2DAA2D;IAC3D,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAA;IAE5B,gEAAgE;IAChE,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAA;IAEtB;;;;OAIG;IACH,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,IAAI,EAAE,CAAA;IAErC,gDAAgD;IAChD,QAAQ,CAAC,SAAS,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAEtC,6EAA6E;IAC7E,QAAQ,CAAC,kBAAkB,CAAC,EAAE,MAAM,CAAA;IAEpC,qEAAqE;IACrE,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAA;IAE1B;;;OAGG;IACH,QAAQ,CAAC,KAAK,EAAE,SAAS,QAAQ,EAAE,CAAA;IAEnC;;;;OAIG;IACH,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,CAAA;CACvB;AAED,+CAA+C;AAC/C,MAAM,WAAW,QAAQ;IACxB,+CAA+C;IAC/C,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;IAErB,KAAK,CAAC,GAAG,EAAE,WAAW,GAAG,cAAc,GAAG,OAAO,CAAC,cAAc,CAAC,CAAA;CACjE;AAED,qCAAqC;AACrC,MAAM,WAAW,WAAW;IAC3B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAA;IACtB,QAAQ,CAAC,MAAM,EAAE,SAAS,UAAU,EAAE,CAAA;IACtC,QAAQ,CAAC,MAAM,EAAE,WAAW,CAAA;CAC5B;AAED,MAAM,WAAW,cAAc;IAC9B,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAA;IACxB,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CACxB;AAED,2BAA2B;AAC3B,MAAM,WAAW,UAAU;IAC1B,QAAQ,CAAC,IAAI,EAAE,IAAI,CAAA;IACnB,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAA;IACxB,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC;QAC7B,IAAI,EAAE,MAAM,CAAA;QACZ,MAAM,EAAE,OAAO,CAAA;QACf,MAAM,CAAC,EAAE,MAAM,CAAA;KACf,CAAC,CAAA;IACF,QAAQ,CAAC,KAAK,EAAE,WAAW,CAAA;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAA;IAC3B,+CAA+C;IAC/C,QAAQ,CAAC,KAAK,CAAC,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAA;IAClD,+DAA+D;IAC/D,QAAQ,CAAC,OAAO,CAAC,EAAE,OAAO,CAAA;CAC1B;AAED,MAAM,WAAW,cAAc;IAC9B,QAAQ,CAAC,OAAO,EAAE,SAAS,UAAU,EAAE,CAAA;IACvC,QAAQ,CAAC,UAAU,EAAE,WAAW,CAAA;IAChC,uEAAuE;IACvE,QAAQ,CAAC,cAAc,EAAE,OAAO,CAAA;IAChC,oEAAoE;IACpE,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAA;IACzB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAA;CAC3B;AAED,MAAM,WAAW,aAAa;IAC7B;;;;;;OAMG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAA;IAEjB;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,MAAM,CAAA;IAEf;;;;;OAKG;IACH,KAAK,CAAC,EAAE,MAA
M,CAAA;IAEd;;;;;;;OAOG;IACH,OAAO,CAAC,EAAE,MAAM,CAAA;IAEhB;;;;;OAKG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAEzC;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;IAElB,+EAA+E;IAC/E,QAAQ,CAAC,EAAE,MAAM,CAAA;IAEjB,oDAAoD;IACpD,OAAO,CAAC,EAAE,OAAO,CAAA;IAEjB,sEAAsE;IACtE,cAAc,CAAC,EAAE,CAAC,MAAM,EAAE,UAAU,KAAK,IAAI,CAAA;IAE7C,+DAA+D;IAC/D,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAA;IAEjC;;;;;;;;OAQG;IACH,YAAY,CAAC,EAAE,OAAO,UAAU,EAAE,YAAY,CAAA;CAC9C"}
package/dist/types.js ADDED
@@ -0,0 +1,2 @@
1
+ "use strict";Object.defineProperty(exports,"__esModule",{value:!0});
2
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["types.js"],
4
+ "sourcesContent": ["\"use strict\";\nObject.defineProperty(exports, \"__esModule\", { value: true });\n//# sourceMappingURL=types.js.map"],
5
+ "mappings": "aACA,OAAO,eAAe,QAAS,aAAc,CAAE,MAAO,EAAK,CAAC",
6
+ "names": []
7
+ }
package/package.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "name": "@postqode/evals",
3
+ "version": "0.9.0",
4
+ "private": false,
5
+ "description": "Real-LLM evaluation harness for @postqode/agent and @postqode/coding-agent. Targets Anthropic by default; enforces a $10 budget cap per run. Offline smoke test via in-process fake OpenAI-compat SSE server.",
6
+ "license": "SEE LICENSE IN ../../LICENSE",
7
+ "author": "ElevenXN Technologies Pvt. Ltd",
8
+ "homepage": "https://postqode.ai/",
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "https://github.com/postqode/postqode-extension",
12
+ "directory": "packages/postqode-evals"
13
+ },
14
+ "main": "./dist/index.js",
15
+ "types": "./dist/index.d.ts",
16
+ "exports": {
17
+ ".": {
18
+ "types": "./dist/index.d.ts",
19
+ "import": "./dist/index.js",
20
+ "require": "./dist/index.js"
21
+ },
22
+ "./rules": {
23
+ "types": "./dist/rules.d.ts",
24
+ "import": "./dist/rules.js",
25
+ "require": "./dist/rules.js"
26
+ }
27
+ },
28
+ "files": [
29
+ "dist",
30
+ "README.md",
31
+ "LICENSE"
32
+ ],
33
+ "engines": {
34
+ "node": ">=20.11.0"
35
+ },
36
+ "scripts": {
37
+ "clean": "rm -rf dist",
38
+ "build": "tsc -p tsconfig.build.json",
39
+ "prepublishOnly": "npm run clean && npm run build && node ../../scripts/minify-packages.js"
40
+ },
41
+ "dependencies": {
42
+ "@postqode/agent": "*",
43
+ "@postqode/ai": "*",
44
+ "@postqode/coding-agent": "*"
45
+ },
46
+ "devDependencies": {
47
+ "@types/node": "^18.0.0",
48
+ "typescript": "^5.3.3",
49
+ "vitest": "^2.1.3"
50
+ },
51
+ "publishConfig": {
52
+ "registry": "https://registry.npmjs.org/",
53
+ "access": "public"
54
+ }
55
+ }