@sebastiantuyu/agest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,127 @@
1
+ # Agest
2
+
3
+ A quantitative testing library for agents using a Jest-like syntax.
4
+ Batteries included.
5
+
6
+ Its main purpose is to provide useful benchmarks through a minimal API, enabling quick iteration on and evaluation of
7
+ different system prompts, models, and tools by measuring their impact on the agent's performance.
8
+
9
+
10
+ ## Basic usage
11
+
12
+ A language-learning assistant that should refuse off-topic questions, tested with a real LLM via OpenRouter.
13
+
14
+ ```typescript
15
+ import "dotenv/config";
16
+ import { agent, scene, expect } from "@sebastiantuyu/agest";
+ import { langchain } from "@sebastiantuyu/agest/adapters";
17
+ import { createAgent } from "langchain";
18
+
19
+ const reactAgent = createAgent({
20
+ model: "openai/gpt-4.1-mini",
21
+ systemPrompt: "You are a language learning assistant. Refuse all off-topic questions.",
22
+ })
23
+
24
+ await agent(langchain(reactAgent), () => {
25
+ scene("What is the weather like today?")
26
+ .expect("response", (response) => {
27
+ expect(response).toBe.refusal();
28
+ });
29
+
30
+ scene("How do you say 'good morning' in Japanese?")
31
+ .expect("response", (response) => {
32
+ expect(response).toBe.notRefusal();
33
+ });
34
+ });
35
+ ```
36
+
37
+ This produces a scored report:
38
+
39
+ ```
40
+ agent:
41
+ model: "openai/gpt-4.1-mini"
42
+ system_prompt: <check_sum>
43
+ tools: []
44
+ success_rate: 1
45
+ failed_cases:
46
+ (none)
47
+ timestamp: "2025-01-01T00:00:00.000Z"
48
+ duration: 3421
49
+ total_cases: 2
50
+ average_input_tokens_per_case: 87
51
+ average_output_tokens_per_case: 34
52
+ ```
53
+
54
+ Reports from multiple runs can be aggregated into a side-by-side comparison:
55
+
56
+ ```
57
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
58
+ AGEST STATS · 5 reports found
59
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
60
+
61
+ Success Rate
62
+ ────────────────────────────────────────────────────────────
63
+ anthropic/claude-haiku-4-5 ███████████████████░ 93%
64
+ google/gemini-2.0-flash-li ███████████████████░ 93%
65
+ openai/gpt-4.1-nano (1x) ████████████████░░░░ 80%
66
+ meta-llama/llama-3.1-8b-in ███████████████░░░░░ 73%
67
+ mistralai/ministral-8b-251 ████████████░░░░░░░░ 60%
68
+
69
+ Avg Input Tokens / Case
70
+ ────────────────────────────────────────────────────────────
71
+ anthropic/claude-haiku-4-5 ████████████████████ 1021
72
+ google/gemini-2.0-flash-li ██████░░░░░░░░░░░░░░ 311
73
+ openai/gpt-4.1-nano ███████░░░░░░░░░░░░░ 335
74
+ meta-llama/llama-3.1-8b-in ██████████████░░░░░░ 711
75
+ mistralai/ministral-8b-251 █████████░░░░░░░░░░░ 482
76
+
77
+ Avg Output Tokens / Case
78
+ ────────────────────────────────────────────────────────────
79
+ anthropic/claude-haiku-4-5 ████████████████████ 103
80
+ google/gemini-2.0-flash-li █████░░░░░░░░░░░░░░░ 24
81
+ openai/gpt-4.1-nano ██████░░░░░░░░░░░░░░ 33
82
+ meta-llama/llama-3.1-8b-in ███████░░░░░░░░░░░░░ 37
83
+ mistralai/ministral-8b-251 ██████████░░░░░░░░░░ 54
84
+
85
+ Avg Duration / Run (fastest first)
86
+ ────────────────────────────────────────────────────────────
87
+ meta-llama/llama-3.1-8b-in ██░░░░░░░░░░░░░░░░░░ 8.6s
88
+ google/gemini-2.0-flash-li ███░░░░░░░░░░░░░░░░░ 14.2s
89
+ openai/gpt-4.1-nano (1x) █████░░░░░░░░░░░░░░░ 20.3s
90
+ mistralai/ministral-8b-251 ███████░░░░░░░░░░░░░ 30.1s
91
+ anthropic/claude-haiku-4-5 ████████████████████ 1m24s
92
+
93
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
94
+ 5 models · 5 total runs
95
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
96
+ ```
97
+
98
+ ## Running the real example
99
+
100
+ Copy `.env.example` to `.env` and add your [OpenRouter](https://openrouter.ai) API key:
101
+
102
+ ```sh
103
+ cp .env.example .env
104
+ # edit .env and set OPENROUTER_API_KEY
105
+ npx tsx examples/openrouter.test.ts
106
+ ```
107
+
108
+
109
+ ## Roadmap
110
+
111
+ - [ ] Multi-run support: `.runs(n)` per scene for statistical significance
112
+ - [ ] Suite-level runs: `agent(exec, { runs: 3 }, () => { ... })` for overall stability benchmarks
113
+ - [ ] Additional matchers: `toBe.semanticallySimilarTo(text, threshold)`, `toBe.matchingSchema(zodSchema)`
114
+ - [ ] JSON/file reporters for persisting reports to disk
115
+ - [ ] Snapshot comparison: diff reports across runs to track agent regression
116
+ - [ ] More adapters: Vercel AI SDK, OpenAI Agents SDK, raw API calls
117
+
118
+ ## Development requirements
119
+ - Node 22+
120
+ - pnpm
121
+
122
+ ## Build
123
+
124
+ ```sh
125
+ pnpm install
126
+ pnpm build
127
+ ```
@@ -0,0 +1 @@
1
+ export { langchain } from "./langchain";
@@ -0,0 +1 @@
1
+ export { langchain } from "./langchain";
@@ -0,0 +1,30 @@
1
+ import type { AgentExecutor } from "../types";
2
/** Minimal shape shared by every supported input: an object exposing an async `invoke`. */
type Runnable = {
    invoke: (input: any) => Promise<any>;
};
/** LangGraph graph; the adapter recognises it by `lg_is_pregel === true`. */
type LangGraphGraph = Runnable & {
    lg_is_pregel: true;
    /** Graph nodes; tool names are read from `nodes.tools.bound.tools` when present. */
    nodes?: Record<string, any>;
};
/** Agent object carrying an `options` bag from which model, tools and prompt are read. */
type LangChainReactAgent = Runnable & {
    options: {
        /** Model id string, or a model object exposing `modelName` / `model`. */
        model?: string | any;
        tools?: any[];
        /** Preferred over `prompt` when both are set. */
        systemPrompt?: string;
        prompt?: string;
    };
};
/** Plain chain; its (possibly nested) `steps` are scanned for model and system prompt. */
type SimpleChain = Runnable & {
    steps?: any[];
};
/**
 * Adapter for LangChain runnables and agents.
 *
 * Supported inputs:
 * - `createAgent(...)` from `langchain` — meta extracted from `agent.options`
 * - `createReactAgent(...)` from `@langchain/langgraph/prebuilt` — tools from
 *   `graph.nodes.tools`, model from response_metadata
 * - Simple chain (`prompt.pipe(model)`) — meta extracted from `steps[]`
 */
export declare function langchain(runnable: LangGraphGraph | LangChainReactAgent | SimpleChain): AgentExecutor;
30
+ export {};
@@ -0,0 +1,155 @@
1
/**
 * Wraps a LangChain runnable in an agest executor.
 *
 * The adapter is chosen by inspecting the runnable, in this order:
 * - LangGraph graph (`lg_is_pregel === true`), e.g. `createReactAgent(...)` from
 *   `@langchain/langgraph/prebuilt` — tools from `graph.nodes.tools`, model
 *   from response_metadata
 * - object with an `options` bag, e.g. `createAgent(...)` from `langchain`
 *   — meta extracted from `agent.options`
 * - anything else is handled as a simple chain (`prompt.pipe(model)`)
 *   — meta extracted from `steps[]`
 */
export function langchain(runnable) {
    const buildExecutor = isLangGraphGraph(runnable)
        ? langGraphAdapter
        : isReactAgent(runnable)
            ? reactAgentAdapter
            : chainAdapter;
    return buildExecutor(runnable);
}
19
/**
 * Builds an executor for a LangGraph graph.
 *
 * Tool names are read once from the graph; the model name is taken from the
 * last returned message's `response_metadata.model_name`.
 *
 * @param graph LangGraph graph exposing `invoke`.
 * @returns Async executor resolving to `{ text, metadata }`, or to
 *   `{ text: "", executionError, metadata }` when the invocation throws.
 */
function langGraphAdapter(graph) {
    const staticTools = extractGraphTools(graph);
    return async (input) => {
        let result;
        try {
            const { HumanMessage } = await import("@langchain/core/messages");
            result = await graph.invoke({ messages: [new HumanMessage(input)] });
        }
        catch (err) {
            // Non-Error throws (strings, plain objects) have no `.message`; always
            // report a non-empty string so the failure is not silently dropped.
            const executionError = err instanceof Error && err.message ? err.message : String(err);
            return { text: "", executionError, metadata: { tools: staticTools } };
        }
        // A result without a `messages` array falls back to the stringified result
        // instead of crashing on `messages.length`.
        const messages = Array.isArray(result?.messages) ? result.messages : [];
        const last = messages[messages.length - 1];
        const text = typeof last?.content === "string"
            ? last.content
            : JSON.stringify(last?.content ?? result) ?? "";
        const model = last?.response_metadata?.model_name;
        return {
            text,
            metadata: { model, tools: staticTools, tokens: extractTokensFromMessage(last) },
        };
    };
}
42
+ function reactAgentAdapter(agent) {
43
+ const model = typeof agent.options.model === "string"
44
+ ? agent.options.model
45
+ : agent.options.model?.modelName ?? agent.options.model?.model;
46
+ const systemPrompt = agent.options.systemPrompt ?? agent.options.prompt ?? undefined;
47
+ const tools = agent.options.tools
48
+ ?.map((t) => t.name ?? t.getName?.())
49
+ .filter(Boolean);
50
+ return async (input) => {
51
+ let result;
52
+ try {
53
+ result = await agent.invoke({ messages: [{ role: "human", content: input }] });
54
+ }
55
+ catch (err) {
56
+ return { text: "", executionError: err.message, metadata: { model, systemPrompt, tools } };
57
+ }
58
+ const messages = result.messages;
59
+ const last = messages[messages.length - 1];
60
+ const text = typeof last?.content === "string"
61
+ ? last.content
62
+ : JSON.stringify(last?.content ?? result);
63
+ return {
64
+ text,
65
+ metadata: { model, systemPrompt, tools, tokens: extractTokensFromMessage(last) },
66
+ };
67
+ };
68
+ }
69
/**
 * Builds an executor for a plain chain invoked as `chain.invoke({ input })`.
 *
 * The response text is the result itself when it is a string, otherwise its
 * string `output`, then its string `content`, then the JSON-serialised result.
 *
 * @param chain Chain exposing `invoke` and, optionally, `steps`.
 * @returns Async executor resolving to `{ text, metadata }`, or to
 *   `{ text: "", executionError, metadata }` when the invocation throws.
 */
function chainAdapter(chain) {
    const { model, systemPrompt } = extractChainMeta(chain);
    return async (input) => {
        let result;
        try {
            result = await chain.invoke({ input });
        }
        catch (err) {
            // Non-Error throws (strings, plain objects) have no `.message`; always
            // report a non-empty string so the failure is not silently dropped.
            const executionError = err instanceof Error && err.message ? err.message : String(err);
            return { text: "", executionError, metadata: { model, systemPrompt } };
        }
        // Optional chaining keeps a null/undefined result from throwing here.
        const text = typeof result === "string"
            ? result
            : typeof result?.output === "string"
                ? result.output
                : typeof result?.content === "string"
                    ? result.content
                    : JSON.stringify(result) ?? "";
        return {
            text,
            metadata: {
                model: model ?? result?.metadata?.model,
                systemPrompt,
                tokens: result == null ? undefined : extractTokens(result),
            },
        };
    };
}
96
/** True when the runnable carries the `lg_is_pregel === true` marker. */
function isLangGraphGraph(r) {
    const { lg_is_pregel: pregelFlag } = r;
    return pregelFlag === true;
}
99
/**
 * True when the runnable exposes a non-null, non-array `options` object.
 *
 * `null` is rejected explicitly: `typeof null` is "object", and the adapter
 * dereferences `options.model` right after this check.
 */
function isReactAgent(r) {
    const { options } = r;
    return typeof options === "object" && options !== null && !Array.isArray(options);
}
102
/**
 * Collects tool names from `graph.nodes.tools.bound.tools`.
 *
 * @returns The truthy tool names (`name`, else `getName()`), or `undefined`
 *   when the tool list is missing, not an array, or empty.
 */
function extractGraphTools(graph) {
    const toolNode = graph.nodes?.["tools"];
    const boundTools = toolNode?.bound?.tools;
    if (Array.isArray(boundTools) && boundTools.length > 0) {
        const names = [];
        for (const tool of boundTools) {
            const toolName = tool.name ?? tool.getName?.();
            if (toolName) {
                names.push(toolName);
            }
        }
        return names;
    }
    return undefined;
}
108
/**
 * Walks `chain.steps` (recursing into nested `steps`) looking for a model name
 * and a system prompt template.
 *
 * For each step, in order: its own `modelName`/`model`, then the first entry of
 * `promptMessages` whose constructor name contains "system", then nested steps.
 *
 * @returns `{ model, systemPrompt }`; either may be `undefined`.
 */
function extractChainMeta(chain) {
    const isSystemMessage = (msg) => {
        const ctorName = msg?.constructor?.name ?? "";
        return ctorName.toLowerCase().includes("system");
    };
    const scan = (steps) => {
        const meta = { model: undefined, systemPrompt: undefined };
        for (const step of steps ?? []) {
            const declaresModel = step.modelName || step.model;
            if (!meta.model && declaresModel) {
                meta.model = step.modelName ?? step.model;
            }
            if (!meta.systemPrompt && Array.isArray(step.promptMessages)) {
                const systemMsg = step.promptMessages.find(isSystemMessage);
                // Only the first system-like message is considered, even if it has no template.
                if (systemMsg !== undefined) {
                    meta.systemPrompt = systemMsg?.prompt?.template ?? systemMsg?.template;
                }
            }
            if (step.steps) {
                const inner = scan(step.steps);
                meta.model ??= inner.model;
                meta.systemPrompt ??= inner.systemPrompt;
            }
        }
        return meta;
    };
    return scan(chain.steps ?? []);
}
135
/**
 * Reads token usage from a chain result.
 *
 * Usage is looked up in `usage_metadata`, `metadata.tokenUsage`,
 * `metadata.usage`, then `llmOutput.tokenUsage`; the first non-nullish wins.
 *
 * @returns `{ input, output }` (each defaulting to 0), or `undefined` when no
 *   usage object is found.
 */
function extractTokens(result) {
    const candidates = [
        result.usage_metadata,
        result.metadata?.tokenUsage,
        result.metadata?.usage,
        result.llmOutput?.tokenUsage,
    ];
    const usage = candidates.find((candidate) => candidate != null);
    if (!usage) {
        return undefined;
    }
    const firstDefined = (...keys) => {
        for (const key of keys) {
            if (usage[key] != null) {
                return usage[key];
            }
        }
        return 0;
    };
    return {
        input: firstDefined("input_tokens", "promptTokens", "prompt_tokens"),
        output: firstDefined("output_tokens", "completionTokens", "completion_tokens"),
    };
}
147
/**
 * Reads token usage from a message's `usage_metadata`, falling back to
 * `response_metadata.usage`.
 *
 * @returns `{ input, output }` (each defaulting to 0), or `undefined` when the
 *   message is missing or carries no usage.
 */
function extractTokensFromMessage(msg) {
    const usage = msg?.usage_metadata ?? msg?.response_metadata?.usage;
    if (usage) {
        const input = usage.input_tokens ?? usage.prompt_tokens ?? 0;
        const output = usage.output_tokens ?? usage.completion_tokens ?? 0;
        return { input, output };
    }
    return undefined;
}
@@ -0,0 +1,10 @@
1
/** Matchers available under `expect(value).toBe`; each throws an `Error` when it does not hold. */
export interface AgentMatchers {
    /** Passes when the value is detected as a refusal. */
    refusal(): void;
    /** Passes when the value is not detected as a refusal. */
    notRefusal(): void;
    /** Passes when the stringified value contains `text`, compared case-insensitively. */
    containing(text: string): void;
    /** Passes when `regex` matches the stringified value. */
    matchingPattern(regex: RegExp): void;
}
/** Result of `expect(value)`. */
export interface AgentExpectation {
    readonly toBe: AgentMatchers;
}
/** Wraps a value so it can be checked with the matchers under `.toBe`. */
export declare function expect(value: unknown): AgentExpectation;
@@ -0,0 +1,37 @@
1
+ import { isRefusal } from "./refusal";
2
/**
 * Wraps a value with Jest-like matchers under `.toBe`.
 *
 * Each matcher returns nothing on success and throws an `Error` whose message
 * includes up to 100 characters of the actual value on failure.
 *
 * @param value Value under test, typically the agent's response text.
 * @returns Object exposing the `toBe` matcher set.
 */
export function expect(value) {
    // Short printable preview. `JSON.stringify` returns `undefined` for
    // `undefined`/functions and throws on cycles or BigInt, so fall back to
    // `String` rather than letting the preview itself raise a TypeError.
    const preview = () => {
        if (typeof value === "string") {
            return value.slice(0, 100);
        }
        let serialized;
        try {
            serialized = JSON.stringify(value);
        }
        catch {
            serialized = undefined;
        }
        return (serialized ?? String(value)).slice(0, 100);
    };
    const asText = () => (typeof value === "string" ? value : String(value));
    return {
        get toBe() {
            return {
                refusal() {
                    if (!isRefusal(value)) {
                        throw new Error(`Expected a refusal but got: "${preview()}"`);
                    }
                },
                notRefusal() {
                    if (isRefusal(value)) {
                        throw new Error(`Expected a non-refusal response but got: "${preview()}"`);
                    }
                },
                containing(text) {
                    const actual = asText();
                    if (!actual.toLowerCase().includes(text.toLowerCase())) {
                        throw new Error(`Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
                    }
                },
                matchingPattern(regex) {
                    const actual = asText();
                    // Global/sticky regexes resume from `lastIndex`; reset it so a
                    // regex reused across scenes always matches from the start.
                    regex.lastIndex = 0;
                    const matched = regex.test(actual);
                    regex.lastIndex = 0;
                    if (!matched) {
                        throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
                    }
                },
            };
        },
    };
}
@@ -0,0 +1,17 @@
1
+ import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
2
/** Fluent builder collecting the assertions attached to one prompt. */
export declare class SceneBuilder {
    private _prompt;
    private _assertions;
    constructor(_prompt: string);
    /** Registers an assertion against a response field and returns the builder for chaining. */
    expect(field: string, fn: (value: any) => void): SceneBuilder;
    /** Returns the prompt together with a copy of the registered assertions. */
    toDefinition(): SceneDefinition;
}
/** Holds the executor and the scenes registered for a single `agent()` run. */
export declare class AgentContext {
    private _executor;
    private _scenes;
    constructor(_executor: AgentExecutor);
    /** Creates a scene for `prompt`, stores it and returns its builder. */
    registerScene(prompt: string): SceneBuilder;
    /** Runs every registered scene and resolves to the aggregated report. */
    execute(): Promise<AgentReport>;
}
/** Sets, or clears with `null`, the context that `getContext()` returns. */
export declare function setContext(ctx: AgentContext | null): void;
/** Returns the active context; throws when none is set. */
export declare function getContext(): AgentContext;
@@ -0,0 +1,113 @@
1
+ import { createHash } from "crypto";
2
+ import { executeScene } from "./runner";
3
+ import { formatReport, writeReport } from "./reporter";
4
+ import { logger, c } from "./logger";
5
/**
 * Fluent builder for one scene: a prompt plus the assertions to run against
 * the agent's response to it.
 */
export class SceneBuilder {
    _prompt;
    _assertions = [];
    constructor(_prompt) {
        this._prompt = _prompt;
    }
    /** Queues an assertion on the given response field; returns `this` for chaining. */
    expect(field, fn) {
        const assertion = { field, fn };
        this._assertions.push(assertion);
        return this;
    }
    /** Snapshot of the scene; the assertions array is a copy, not the live list. */
    toDefinition() {
        const assertions = this._assertions.slice();
        return { prompt: this._prompt, assertions };
    }
}
19
/**
 * Collects the scenes registered during an `agent()` callback and runs them
 * against a single executor, producing an aggregated report.
 */
export class AgentContext {
    _executor;
    _scenes = [];
    constructor(_executor) {
        this._executor = _executor;
    }
    /** Creates a builder for `prompt`, stores it for execution and returns it. */
    registerScene(prompt) {
        const builder = new SceneBuilder(prompt);
        this._scenes.push(builder);
        return builder;
    }
    /**
     * Runs every registered scene one after another, logs progress, builds the
     * report, logs it and writes it through `writeReport`.
     *
     * @returns The report object, including the per-scene results.
     */
    async execute() {
        const definitions = this._scenes.map((s) => s.toDefinition());
        const results = [];
        let totalDuration = 0;
        const total = definitions.length;
        logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}...\n`));
        // Scenes are awaited sequentially so the progress line and its verdict stay paired.
        for (let i = 0; i < definitions.length; i++) {
            const scene = definitions[i];
            // Prompts longer than 60 characters are shortened to 57 plus an ellipsis for the progress line.
            const label = scene.prompt.length > 60
                ? scene.prompt.slice(0, 57) + "..."
                : scene.prompt;
            logger.write(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... `);
            const result = await executeScene(this._executor, scene);
            results.push(result);
            totalDuration += result.duration;
            const ms = result.duration.toFixed(0);
            if (result.passed) {
                logger.info(c.green(`PASS`) + c.dim(` (${ms}ms)`));
            }
            else {
                logger.info(c.red(`FAIL`) + c.dim(` (${ms}ms)`));
                if (result.error) {
                    logger.info(` ${c.red(result.error)}`);
                }
            }
            logger.debug(` response: ${result.response.text?.slice(0, 120)}`);
        }
        logger.info("");
        const failedResults = results.filter((r) => !r.passed);
        const failedCases = failedResults.map((r) => r.prompt);
        // Errors are keyed by prompt, so two failing scenes with the same prompt share one entry (last wins).
        const failedCaseErrors = {};
        for (const r of failedResults) {
            if (r.error)
                failedCaseErrors[r.prompt] = r.error;
        }
        // Fraction of passing scenes, rounded to two decimals; 0 when no scene ran.
        const successRate = results.length > 0
            ? Number((results.filter((r) => r.passed).length / results.length).toFixed(2))
            : 0;
        const tokensAvailable = results.some((r) => r.response.metadata?.tokens != null);
        let averageInputTokensPerCase;
        let averageOutputTokensPerCase;
        if (tokensAvailable) {
            // Averages cover only the results that reported token usage.
            const withTokens = results.filter((r) => r.response.metadata?.tokens != null);
            averageInputTokensPerCase = Math.round(withTokens.reduce((sum, r) => sum + (r.response.metadata.tokens.input ?? 0), 0) / withTokens.length);
            averageOutputTokensPerCase = Math.round(withTokens.reduce((sum, r) => sum + (r.response.metadata.tokens.output ?? 0), 0) / withTokens.length);
        }
        // Model, prompt hash and tools come from the first result that has metadata.
        const firstMeta = results.find((r) => r.response.metadata)?.response
            .metadata;
        const report = {
            model: firstMeta?.model,
            systemPromptHash: firstMeta?.systemPrompt
                ? hashPrompt(firstMeta.systemPrompt)
                : undefined,
            tools: firstMeta?.tools,
            successRate,
            failedCases,
            failedCaseErrors,
            timestamp: new Date().toISOString(),
            duration: Math.round(totalDuration),
            totalCases: results.length,
            averageInputTokensPerCase,
            averageOutputTokensPerCase,
            results,
        };
        const formatted = formatReport(report);
        logger.info(formatted);
        const filepath = await writeReport(formatted, report.timestamp);
        logger.info(`\n${c.dim("Report saved to:")} ${c.cyan(filepath)}`);
        return report;
    }
}
101
/** Returns the first 12 hex characters of the prompt's SHA-256 digest. */
function hashPrompt(prompt) {
    const hasher = createHash("sha256");
    hasher.update(prompt);
    const hex = hasher.digest("hex");
    return hex.substring(0, 12);
}
104
// Module-level slot for the context of the `agent()` callback currently running.
let currentContext = null;
/** Installs `ctx` as the active context; pass `null` to clear it. */
export function setContext(ctx) {
    currentContext = ctx;
}
/**
 * Returns the active context.
 *
 * @throws Error when no context is set.
 */
export function getContext() {
    if (currentContext) {
        return currentContext;
    }
    throw new Error("scene() must be called inside an agent() callback");
}
@@ -0,0 +1,9 @@
1
+ import type { AgentExecutor, AgentReport } from "./types";
2
+ import { SceneBuilder } from "./context";
3
+ export { expect } from "./assertions";
4
+ export { logger } from "./logger";
5
+ export type { LogLevel } from "./logger";
6
+ export type { AgentExpectation, AgentMatchers } from "./assertions";
7
+ export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, } from "./types";
8
/** Registers a scene for `prompt` on the active agent context and returns its builder. */
export declare function scene(prompt: string): SceneBuilder;
/**
 * Runs `fn` to collect scenes, then executes them against `executor` and
 * resolves to the resulting report.
 */
export declare function agent(executor: AgentExecutor, fn: () => void): Promise<AgentReport>;
package/dist/index.js ADDED
@@ -0,0 +1,17 @@
1
+ import { AgentContext, setContext, getContext } from "./context";
2
+ export { expect } from "./assertions";
3
+ export { logger } from "./logger";
4
/**
 * Registers a scene for `prompt` on the active agent context.
 *
 * @returns The scene's builder, used to attach assertions.
 */
export function scene(prompt) {
    const activeContext = getContext();
    return activeContext.registerScene(prompt);
}
7
/**
 * Collects the scenes declared inside `fn` and runs them against `executor`.
 *
 * The context is installed only while `fn` runs, so `scene()` is valid only
 * inside the callback. A synchronous `fn` behaves exactly as before; if `fn`
 * returns a thenable (an `async` callback), it is awaited before the context
 * is cleared, so scenes declared after an `await` are still registered.
 *
 * @param executor Function that sends a prompt to the agent under test.
 * @param fn Callback that declares scenes via `scene(...)`.
 * @returns The report produced by executing every registered scene.
 */
export async function agent(executor, fn) {
    const ctx = new AgentContext(executor);
    setContext(ctx);
    try {
        const pending = fn();
        if (pending != null && typeof pending.then === "function") {
            await pending;
        }
    }
    finally {
        // Always clear the global slot, even when the callback throws.
        setContext(null);
    }
    return ctx.execute();
}
@@ -0,0 +1,21 @@
1
/** Verbosity levels, from no output (`silent`) to everything (`verbose`). */
export type LogLevel = "silent" | "normal" | "verbose";
/** ANSI styling helpers; each wraps the string in an escape sequence followed by a reset. */
export declare const c: {
    reset: (s: string) => string;
    bold: (s: string) => string;
    dim: (s: string) => string;
    green: (s: string) => string;
    red: (s: string) => string;
    yellow: (s: string) => string;
    cyan: (s: string) => string;
    gray: (s: string) => string;
};
declare class Logger {
    private _level;
    setLevel(level: LogLevel): void;
    getLevel(): LogLevel;
    /** Printed unless the level is `silent`. */
    info(msg: string): void;
    /** Printed only at the `verbose` level. */
    debug(msg: string): void;
    /** Written without a trailing newline unless the level is `silent`. */
    write(msg: string): void;
}
/** Shared logger instance. */
export declare const logger: Logger;
21
+ export {};
package/dist/logger.js ADDED
@@ -0,0 +1,45 @@
1
// Numeric rank of each log level; a message is shown when the current rank
// is at least the rank the message requires.
const LEVELS = { silent: 0, normal: 1, verbose: 2 };
const ESC = "\x1b";
const RESET = `${ESC}[0m`;
// Builds a styler that wraps text in the given SGR code and a trailing reset.
const sgr = (code) => (s) => `${ESC}[${code}m${s}${RESET}`;
/** ANSI styling helpers for terminal output. */
export const c = {
    reset: sgr(0),
    bold: sgr(1),
    dim: sgr(2),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    gray: sgr(90),
};
18
/** Level-gated console logger; starts at the "normal" level. */
class Logger {
    _level = "normal";
    setLevel(level) {
        this._level = level;
    }
    getLevel() {
        return this._level;
    }
    /** Line output, shown at "normal" and "verbose". */
    info(msg) {
        const enabled = LEVELS[this._level] >= LEVELS.normal;
        if (enabled) {
            console.log(msg);
        }
    }
    /** Gray line output, shown only at "verbose". */
    debug(msg) {
        const enabled = LEVELS[this._level] >= LEVELS.verbose;
        if (enabled) {
            console.log(c.gray(msg));
        }
    }
    /** Raw stdout write without a newline, shown at "normal" and "verbose". */
    write(msg) {
        const enabled = LEVELS[this._level] >= LEVELS.normal;
        if (enabled) {
            process.stdout.write(msg);
        }
    }
}
export const logger = new Logger();
@@ -0,0 +1,5 @@
1
/**
 * Heuristic refusal detector; this is likely to be replaced by an LLM acting as a judge.
 *
 * @returns `true` when the value is considered a refusal.
 */
export declare function isRefusal(value: unknown): boolean;
@@ -0,0 +1,38 @@
1
// Case-insensitive phrases that typically signal a refusal or a redirection
// back to the agent's allowed domain. Any single match counts as a refusal.
const REFUSAL_PATTERNS = [
    // Explicit inability
    /\bi can'?t\b/i,
    /\bi cannot\b/i,
    /\bi'?m (sorry|unable|not able)\b/i,
    /\bunable to (help|assist|answer|address|respond)\b/i,
    /\bwon'?t be able\b/i,
    // Declination
    /\brefuse\b/i,
    /\bdecline\b/i,
    /\bnot (allowed|permitted|able) to\b/i,
    // Domain restriction (the most common pattern from real LLMs given a scoped system prompt)
    /\bonly (able|here|designed|meant|built|intended) to\b/i,
    /\bcan only (help|assist|answer|address|respond|discuss)\b/i,
    /\bnot (designed|built|meant|here|intended) to\b/i,
    /\bspecializ(e|es|ed|ing)\b/i,
    /\bmy (focus|purpose|role|expertise|area|specialty) (is|are)\b/i,
    /\bfocus(ed)? on (language|languages)\b/i,
    /\boutside (of )?my (scope|capabilities|abilities|expertise|focus|area)\b/i,
    /\bbeyond (my|the) (scope|expertise|capabilities)\b/i,
    /\boff[- ]topic\b/i,
    /\bnot (something|a topic) I can\b/i,
    /\bnot (related|relevant) to\b/i,
    // Polite steering
    /\bstick(ing)? to\b/i,
    /\bhere to (help|assist) with\b/i,
];
/**
 * Heuristic refusal detector; this is likely to be replaced by an LLM acting as a judge.
 *
 * An object with a `refusal` property is judged solely by `refusal === true`.
 * Any other value is stringified and tested against `REFUSAL_PATTERNS`.
 * Typographic apostrophes (’ ‘ ʼ) are folded to `'` first, because the
 * patterns are written with the ASCII apostrophe and would otherwise miss
 * "I can’t" or "I’m sorry".
 *
 * @param value Response text, or an object carrying a `refusal` flag.
 * @returns `true` when the value is considered a refusal.
 */
export function isRefusal(value) {
    if (typeof value === "object" && value !== null && "refusal" in value) {
        return value.refusal === true;
    }
    const raw = typeof value === "string" ? value : String(value);
    const text = raw.replace(/[\u2018\u2019\u02BC]/g, "'");
    return REFUSAL_PATTERNS.some((p) => p.test(text));
}
@@ -0,0 +1,3 @@
1
+ import type { AgentReport } from "./types";
2
/** Renders a report as YAML-style text. */
export declare function formatReport(report: AgentReport): string;
/** Writes `content` into the `reports` directory under the current working directory and resolves to the file path. */
export declare function writeReport(content: string, timestamp: string): Promise<string>;
@@ -0,0 +1,42 @@
1
+ import { mkdir, writeFile } from "fs/promises";
2
+ import { join } from "path";
3
/**
 * Renders a report as YAML-style text under a top-level `agent:` key.
 *
 * Quoted values (model, failed prompts, failure reasons, timestamp) are
 * emitted with `JSON.stringify`. Ordinary strings look exactly as before, while
 * embedded quotes, backslashes and newlines are escaped, so each value stays on
 * one line and the output remains parseable.
 *
 * @param report Aggregated report of one run.
 * @returns The formatted text, with lines joined by "\n".
 */
export function formatReport(report) {
    const quote = (value) => JSON.stringify(String(value));
    const lines = [
        "agent:",
        ` model: ${quote(report.model ?? "unknown")}`,
        ` system_prompt: ${report.systemPromptHash ?? "<unknown>"}`,
        ` tools: ${JSON.stringify(report.tools ?? [])}`,
        ` success_rate: ${report.successRate}`,
        ` failed_cases_count: ${report.failedCases.length}`,
        ` failed_cases:`,
    ];
    if (report.failedCases.length === 0) {
        lines.push(" (none)");
    }
    else {
        for (const failedPrompt of report.failedCases) {
            lines.push(` - ${quote(failedPrompt)}`);
            const reason = report.failedCaseErrors[failedPrompt];
            if (reason) {
                lines.push(` reason: ${quote(reason)}`);
            }
        }
    }
    lines.push(` timestamp: ${quote(report.timestamp)}`, ` duration: ${report.duration}`, ` total_cases: ${report.totalCases}`);
    // Token averages are optional and omitted when the run reported no usage.
    if (report.averageInputTokensPerCase != null) {
        lines.push(` average_input_tokens_per_case: ${report.averageInputTokensPerCase}`);
    }
    if (report.averageOutputTokensPerCase != null) {
        lines.push(` average_output_tokens_per_case: ${report.averageOutputTokensPerCase}`);
    }
    return lines.join("\n");
}
34
/**
 * Saves a formatted report as `reports/report-<timestamp>.yaml` under the
 * current working directory, creating the directory when needed.
 *
 * @param content Report text to write (UTF-8).
 * @param timestamp Timestamp used in the file name; `:` and `.` become `-`.
 * @returns The path of the written file.
 */
export async function writeReport(content, timestamp) {
    const reportsDir = join(process.cwd(), "reports");
    await mkdir(reportsDir, { recursive: true });
    const safestamp = timestamp.split(/[:.]/).join("-");
    const filepath = join(reportsDir, `report-${safestamp}.yaml`);
    await writeFile(filepath, content, "utf-8");
    return filepath;
}
@@ -0,0 +1,3 @@
1
+ import type { AgentExecutor, AgentResponse, SceneDefinition, SceneResult } from "./types";
2
/**
 * Resolves the value an assertion targets: "response" is the text, "metadata"
 * and "refusal" are the matching response properties, and any other name is
 * looked up inside `metadata`.
 */
export declare function extractField(response: AgentResponse, field: string): unknown;
/** Runs one scene: calls the executor with the prompt, then runs the scene's assertions on the response. */
export declare function executeScene(executor: AgentExecutor, scene: SceneDefinition): Promise<SceneResult>;
package/dist/runner.js ADDED
@@ -0,0 +1,53 @@
1
/**
 * Resolves the part of a response that an assertion targets.
 *
 * "response" is the text, "metadata" the whole metadata object, "refusal" the
 * refusal flag; any other name is read from `response.metadata`.
 */
export function extractField(response, field) {
    if (field === "response") {
        return response.text;
    }
    if (field === "metadata") {
        return response.metadata;
    }
    if (field === "refusal") {
        return response.refusal;
    }
    return response.metadata?.[field];
}
13
+ export async function executeScene(executor, scene) {
14
+ let response;
15
+ let duration;
16
+ try {
17
+ const start = performance.now();
18
+ response = await executor(scene.prompt);
19
+ duration = performance.now() - start;
20
+ }
21
+ catch (err) {
22
+ return {
23
+ prompt: scene.prompt,
24
+ response: { text: "", executionError: err.message },
25
+ duration: 0,
26
+ passed: false,
27
+ error: err.message,
28
+ };
29
+ }
30
+ if (response.executionError) {
31
+ return {
32
+ prompt: scene.prompt,
33
+ response,
34
+ duration,
35
+ passed: false,
36
+ error: response.executionError,
37
+ };
38
+ }
39
+ let passed = true;
40
+ let error;
41
+ for (const assertion of scene.assertions) {
42
+ try {
43
+ const value = extractField(response, assertion.field);
44
+ assertion.fn(value);
45
+ }
46
+ catch (err) {
47
+ passed = false;
48
+ error = err.message;
49
+ break;
50
+ }
51
+ }
52
+ return { prompt: scene.prompt, response, duration, passed, error };
53
+ }
@@ -0,0 +1 @@
1
+ export {};
package/dist/stats.js ADDED
@@ -0,0 +1,160 @@
1
+ import { readdir, readFile } from "fs/promises";
2
+ import { join, relative } from "path";
3
/**
 * Reads the value of a `key: value` line from report text.
 *
 * The key may be indented by any number of spaces or tabs. The value must be
 * on the same line as the key: an empty value yields `undefined` rather than
 * capturing the following line. Surrounding whitespace is trimmed before one
 * pair of enclosing double quotes is stripped.
 *
 * @param content Full report text.
 * @param key Field name; regex metacharacters in it are matched literally.
 * @returns The raw string value, or `undefined` when the key is absent.
 */
function extractField(content, key) {
    const safeKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const regex = new RegExp(`^[ \\t]*${safeKey}:[ \\t]*(.+)$`, "m");
    const match = content.match(regex);
    if (!match)
        return undefined;
    return match[1].trim().replace(/^"|"$/g, "");
}
10
/**
 * Turns report text into a summary record.
 *
 * Numeric fields default to 0 when missing; token averages stay `undefined`
 * when the report has none.
 *
 * @param content Report text.
 * @param source Label stored in the record's `source` field.
 */
function parseReport(content, source) {
    const read = (key) => extractField(content, key);
    const readNumber = (key, fallback = 0) => parseFloat(read(key) ?? String(fallback));
    const optionalNumber = (raw) => (raw != null ? parseFloat(raw) : undefined);
    const avgIn = read("average_input_tokens_per_case");
    const avgOut = read("average_output_tokens_per_case");
    return {
        model: read("model") ?? "unknown",
        successRate: readNumber("success_rate"),
        totalCases: readNumber("total_cases"),
        duration: readNumber("duration"),
        timestamp: read("timestamp") ?? "",
        averageInputTokensPerCase: optionalNumber(avgIn),
        averageOutputTokensPerCase: optionalNumber(avgOut),
        source,
    };
}
25
/**
 * Recursively collects `.yaml` / `.yml` files that sit directly inside any
 * directory named `reports` under `dir`.
 *
 * Entries whose name starts with "." and the directories in `SKIP` are
 * ignored; recursion stops below depth 6. Directories that cannot be read,
 * including a `reports` directory itself, are skipped instead of aborting the
 * scan.
 *
 * @param dir Directory to scan.
 * @param depth Current recursion depth (callers normally omit it).
 * @returns Paths of the report files found.
 */
async function findReports(dir, depth = 0) {
    if (depth > 6)
        return [];
    const SKIP = new Set(["node_modules", "dist", ".git", ".pnpm"]);
    const results = [];
    let entries;
    try {
        entries = await readdir(dir, { withFileTypes: true });
    }
    catch {
        return [];
    }
    for (const entry of entries) {
        if (entry.name.startsWith(".") || SKIP.has(entry.name))
            continue;
        if (!entry.isDirectory())
            continue;
        const fullPath = join(dir, entry.name);
        if (entry.name !== "reports") {
            results.push(...(await findReports(fullPath, depth + 1)));
            continue;
        }
        let files;
        try {
            files = await readdir(fullPath);
        }
        catch {
            // Same best-effort policy as for the parent directory: skip it.
            continue;
        }
        for (const f of files) {
            if (f.endsWith(".yaml") || f.endsWith(".yml")) {
                results.push(join(fullPath, f));
            }
        }
    }
    return results;
}
57
/** Arithmetic mean of `nums`, or `undefined` for an empty array. */
function avg(nums) {
    if (nums.length === 0) {
        return undefined;
    }
    let total = 0;
    nums.forEach((n) => {
        total += n;
    });
    return total / nums.length;
}
62
/**
 * Renders a fixed-width horizontal bar of filled (█) and empty (░) cells.
 *
 * The filled share is `value / max`, rounded to whole cells and clamped to
 * `[0, width]`, so out-of-range or NaN values can never make `repeat()` throw
 * or change the bar's width. A non-positive or NaN `max` gives an empty bar.
 *
 * @param value Quantity to plot.
 * @param max Quantity that corresponds to a full bar.
 * @param width Total number of cells (default 20).
 */
function bar(value, max, width = 20) {
    const ratio = max > 0 ? value / max : 0;
    const rounded = Math.round(ratio * width);
    const filled = Number.isNaN(rounded) ? 0 : Math.min(width, Math.max(0, rounded));
    return "█".repeat(filled) + "░".repeat(width - filled);
}
68
// Width, in characters, of the horizontal rules in the stats output.
const W = 62;
/**
 * Prints one chart section: a title, a rule, then one bar line per row.
 *
 * @param title Section heading.
 * @param rows Items with `label`, numeric `value` and preformatted `display`.
 * @param max Value that corresponds to a full bar.
 */
function printSection(title, rows, max) {
    const rule = " " + "─".repeat(W - 2);
    console.log(`\n ${title}`);
    console.log(rule);
    for (const { label: rawLabel, value, display } of rows) {
        // Labels are cut or padded to exactly 26 characters so the bars line up.
        const label = rawLabel.slice(0, 26).padEnd(26);
        const cells = bar(value, max);
        console.log(` ${label} ${cells} ${display}`);
    }
}
78
/**
 * Formats a millisecond duration as "###ms", "#.#s" or "#m##s".
 *
 * Minutes and seconds are derived from the duration rounded to whole seconds,
 * so the seconds part can never display as "60" (119600 ms is "2m00s", not
 * "1m60s"). A sub-minute value that would round to "60.0s" is shown as
 * "1m00s".
 *
 * @param ms Duration in milliseconds.
 */
function formatDuration(ms) {
    if (ms < 1000)
        return `${ms.toFixed(0)}ms`;
    if (ms < 60_000) {
        const seconds = (ms / 1000).toFixed(1);
        if (seconds !== "60.0")
            return `${seconds}s`;
    }
    const totalSeconds = Math.round(ms / 1000);
    const m = Math.floor(totalSeconds / 60);
    const s = String(totalSeconds % 60).padStart(2, "0");
    return `${m}m${s}s`;
}
87
/**
 * Entry point of the stats script: finds report files under the current
 * working directory, aggregates them per model and prints comparison charts
 * for success rate, token usage (when present) and duration.
 */
async function main() {
    const cwd = process.cwd();
    const files = await findReports(cwd);
    if (files.length === 0) {
        console.log("\n No reports found. Run some agent tests first.\n");
        return;
    }
    const reports = await Promise.all(files.map(async (f) => {
        const content = await readFile(f, "utf-8");
        return parseReport(content, relative(cwd, f));
    }));
    console.log("\n" + "━".repeat(W));
    console.log(` AGEST STATS · ${reports.length} report${reports.length !== 1 ? "s" : ""} found`);
    console.log("━".repeat(W));
    // Aggregate by model
    const byModel = new Map();
    for (const r of reports) {
        const arr = byModel.get(r.model) ?? [];
        arr.push(r);
        byModel.set(r.model, arr);
    }
    const agg = [...byModel.entries()].map(([model, reps]) => {
        // Only reports that carry token averages contribute to the token means.
        const inputNums = reps.flatMap((r) => r.averageInputTokensPerCase != null ? [r.averageInputTokensPerCase] : []);
        const outputNums = reps.flatMap((r) => r.averageOutputTokensPerCase != null ? [r.averageOutputTokensPerCase] : []);
        return {
            model,
            runs: reps.length,
            avgSuccessRate: avg(reps.map((r) => r.successRate)),
            avgDuration: avg(reps.map((r) => r.duration)),
            avgInputTokens: avg(inputNums),
            avgOutputTokens: avg(outputNums),
        };
    });
    // Best success rate first.
    agg.sort((a, b) => b.avgSuccessRate - a.avgSuccessRate);
    // Success rate (always shown)
    printSection("Success Rate", agg.map((a) => ({
        label: `${a.model} (${a.runs}x)`,
        value: a.avgSuccessRate,
        display: `${(a.avgSuccessRate * 100).toFixed(0).padStart(3)}%`,
    })), 1);
    // Token charts (only when data is present)
    const withTokens = agg.filter((a) => a.avgInputTokens != null && a.avgOutputTokens != null);
    if (withTokens.length > 0) {
        // Bars are scaled relative to the largest value in each chart.
        const maxIn = Math.max(...withTokens.map((a) => a.avgInputTokens));
        printSection("Avg Input Tokens / Case", withTokens.map((a) => ({
            label: a.model,
            value: a.avgInputTokens,
            display: String(Math.round(a.avgInputTokens)).padStart(5),
        })), maxIn);
        const maxOut = Math.max(...withTokens.map((a) => a.avgOutputTokens));
        printSection("Avg Output Tokens / Case", withTokens.map((a) => ({
            label: a.model,
            value: a.avgOutputTokens,
            display: String(Math.round(a.avgOutputTokens)).padStart(5),
        })), maxOut);
    }
    // Duration chart — sorted fastest first (ascending)
    const byDuration = [...agg].sort((a, b) => a.avgDuration - b.avgDuration);
    const maxDuration = Math.max(...byDuration.map((a) => a.avgDuration));
    printSection("Avg Duration / Run (fastest first)", byDuration.map((a) => ({
        label: `${a.model} (${a.runs}x)`,
        value: a.avgDuration,
        display: formatDuration(a.avgDuration).padStart(8),
    })), maxDuration);
    console.log("\n" +
        "━".repeat(W) +
        `\n ${agg.length} model${agg.length !== 1 ? "s" : ""} · ${reports.length} total runs\n` +
        "━".repeat(W) +
        "\n");
}
// Run immediately; any failure is printed and the process exits with status 1.
main().catch((err) => {
    console.error("Error:", err.message);
    process.exit(1);
});
@@ -0,0 +1,44 @@
1
/** Function that sends one prompt to the agent under test and resolves to its response. */
export type AgentExecutor = (input: string) => Promise<AgentResponse>;
/** Normalised response returned by an executor. */
export interface AgentResponse {
    /** Response text; this is the value exposed to assertions on the "response" field. */
    text: string;
    /** Explicit refusal flag, exposed to assertions on the "refusal" field. */
    refusal?: boolean;
    /** When set, the scene fails with this message and no assertion is run. */
    executionError?: string;
    /** Details about the run; unknown assertion field names are looked up here. */
    metadata?: {
        model?: string;
        tokens?: {
            input: number;
            output: number;
        };
        tools?: string[];
        systemPrompt?: string;
        [key: string]: unknown;
    };
}
/** One prompt and the assertions to run against its response. */
export interface SceneDefinition {
    prompt: string;
    assertions: Array<{
        /** Response field the assertion receives (e.g. "response", "metadata", "refusal"). */
        field: string;
        /** Assertion callback; throwing marks the scene as failed. */
        fn: (value: any) => void;
    }>;
}
/** Outcome of running a single scene. */
export interface SceneResult {
    prompt: string;
    response: AgentResponse;
    /** Executor call time in milliseconds; 0 when the executor itself threw. */
    duration: number;
    passed: boolean;
    /** Failure reason: the execution error or the first failing assertion's message. */
    error?: string;
}
/** Aggregated result of one `agent()` run. */
export interface AgentReport {
    model?: string;
    /** First 12 hex characters of the system prompt's SHA-256 digest. */
    systemPromptHash?: string;
    tools?: string[];
    /** Fraction of passing scenes, between 0 and 1, rounded to two decimals. */
    successRate: number;
    /** Prompts of the scenes that failed. */
    failedCases: string[];
    /** Failure reason per failed prompt. */
    failedCaseErrors: Record<string, string>;
    /** ISO 8601 timestamp taken when the report was built. */
    timestamp: string;
    /** Sum of the scene durations in milliseconds, rounded. */
    duration: number;
    totalCases: number;
    /** Mean input tokens over the scenes that reported token usage. */
    averageInputTokensPerCase?: number;
    /** Mean output tokens over the scenes that reported token usage. */
    averageOutputTokensPerCase?: number;
    results: SceneResult[];
}
package/dist/types.js ADDED
@@ -0,0 +1 @@
1
+ export {};
package/package.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "name": "@sebastiantuyu/agest",
3
+ "version": "0.1.0",
4
+ "description": "A testing library for agents",
5
+ "type": "module",
6
+ "files": [
7
+ "dist"
8
+ ],
9
+ "main": "dist/index.js",
10
+ "types": "dist/index.d.ts",
11
+ "exports": {
12
+ ".": {
13
+ "types": "./dist/index.d.ts",
14
+ "default": "./dist/index.js"
15
+ },
16
+ "./adapters": {
17
+ "types": "./dist/adapters/index.d.ts",
18
+ "default": "./dist/adapters/index.js"
19
+ }
20
+ },
21
+ "engines": {
22
+ "node": ">=22.0.0"
23
+ },
24
+ "devDependencies": {
25
+ "@langchain/core": "^1.1.39",
26
+ "@langchain/langgraph": "^1.2.8",
27
+ "@langchain/openai": "^1.4.4",
28
+ "@types/node": "^22.0.0",
29
+ "dotenv": "^17.4.1",
30
+ "langchain": "^1.3.1",
31
+ "tsx": "^4.21.0",
32
+ "typescript": "^5.4.0",
33
+ "zod": "^4.3.6"
34
+ },
35
+ "scripts": {
36
+ "build": "tsc",
37
+ "test": "node dist/index.js",
38
+ "dev": "tsx examples/basic.test.ts",
39
+ "test:examples": "tsx examples/basic.test.ts && tsx examples/agent.test.ts",
40
+ "stats": "tsx src/stats.ts",
41
+ "release:patch": "npm version patch && git push && git push --tags",
42
+ "release:minor": "npm version minor && git push && git push --tags",
43
+ "release:major": "npm version major && git push && git push --tags"
44
+ }
45
+ }