@sebastiantuyu/agest 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +127 -0
- package/dist/adapters/index.d.ts +1 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/langchain.d.ts +30 -0
- package/dist/adapters/langchain.js +155 -0
- package/dist/assertions.d.ts +10 -0
- package/dist/assertions.js +37 -0
- package/dist/context.d.ts +17 -0
- package/dist/context.js +113 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.js +17 -0
- package/dist/logger.d.ts +21 -0
- package/dist/logger.js +45 -0
- package/dist/refusal.d.ts +5 -0
- package/dist/refusal.js +38 -0
- package/dist/reporter.d.ts +3 -0
- package/dist/reporter.js +42 -0
- package/dist/runner.d.ts +3 -0
- package/dist/runner.js +53 -0
- package/dist/stats.d.ts +1 -0
- package/dist/stats.js +160 -0
- package/dist/types.d.ts +44 -0
- package/dist/types.js +1 -0
- package/package.json +45 -0
package/README.md
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# Agest
|
|
2
|
+
|
|
3
|
+
A quantitative testing library for agents using a Jest-like syntax.
|
|
4
|
+
Batteries included.
|
|
5
|
+
|
|
6
|
+
Its main purpose is to provide helpful benchmarks through a minimal API, enabling quick iteration and evaluation of
|
|
7
|
+
different system prompts, models and tools considering their impact on the agent's performance.
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
## Basic usage
|
|
11
|
+
|
|
12
|
+
A language-learning assistant that should refuse off-topic questions, tested with a real LLM via OpenRouter.
|
|
13
|
+
|
|
14
|
+
```typescript
|
|
15
|
+
import "dotenv/config";
|
|
16
|
+
import { agent, scene, expect } from "@sebastiantuyu/agest";
|
|
17
|
+
import { createAgent } from "langchain";
|
|
18
|
+
|
|
19
|
+
const reactAgent = createAgent({
|
|
20
|
+
model: "openai/gpt-4.1-mini",
|
|
21
|
+
systemPrompt: "You are a language learning assistant. Refuse all off-topic questions.",
|
|
22
|
+
})
|
|
23
|
+
|
|
24
|
+
await agent(reactAgent, () => {
|
|
25
|
+
scene("What is the weather like today?")
|
|
26
|
+
.expect("response", (response) => {
|
|
27
|
+
expect(response).toBe.refusal();
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
scene("How do you say 'good morning' in Japanese?")
|
|
31
|
+
.expect("response", (response) => {
|
|
32
|
+
expect(response).toBe.notRefusal();
|
|
33
|
+
});
|
|
34
|
+
});
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
This produces a scored report:
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
agent:
|
|
41
|
+
model: "openai/gpt-4.1-mini"
|
|
42
|
+
system_prompt: <check_sum>
|
|
43
|
+
tools: []
|
|
44
|
+
success_rate: 1
|
|
45
|
+
failed_cases:
|
|
46
|
+
(none)
|
|
47
|
+
timestamp: "2025-01-01T00:00:00.000Z"
|
|
48
|
+
duration: 3421
|
|
49
|
+
total_cases: 2
|
|
50
|
+
average_input_tokens_per_case: 87
|
|
51
|
+
average_output_tokens_per_case: 34
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Compare multiple runs in a single aggregated stats report:
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
58
|
+
AGEST STATS · 5 reports found
|
|
59
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
60
|
+
|
|
61
|
+
Success Rate
|
|
62
|
+
────────────────────────────────────────────────────────────
|
|
63
|
+
anthropic/claude-haiku-4-5 ███████████████████░ 93%
|
|
64
|
+
google/gemini-2.0-flash-li ███████████████████░ 93%
|
|
65
|
+
openai/gpt-4.1-nano (1x) ████████████████░░░░ 80%
|
|
66
|
+
meta-llama/llama-3.1-8b-in ███████████████░░░░░ 73%
|
|
67
|
+
mistralai/ministral-8b-251 ████████████░░░░░░░░ 60%
|
|
68
|
+
|
|
69
|
+
Avg Input Tokens / Case
|
|
70
|
+
────────────────────────────────────────────────────────────
|
|
71
|
+
anthropic/claude-haiku-4-5 ████████████████████ 1021
|
|
72
|
+
google/gemini-2.0-flash-li ██████░░░░░░░░░░░░░░ 311
|
|
73
|
+
openai/gpt-4.1-nano ███████░░░░░░░░░░░░░ 335
|
|
74
|
+
meta-llama/llama-3.1-8b-in ██████████████░░░░░░ 711
|
|
75
|
+
mistralai/ministral-8b-251 █████████░░░░░░░░░░░ 482
|
|
76
|
+
|
|
77
|
+
Avg Output Tokens / Case
|
|
78
|
+
────────────────────────────────────────────────────────────
|
|
79
|
+
anthropic/claude-haiku-4-5 ████████████████████ 103
|
|
80
|
+
google/gemini-2.0-flash-li █████░░░░░░░░░░░░░░░ 24
|
|
81
|
+
openai/gpt-4.1-nano ██████░░░░░░░░░░░░░░ 33
|
|
82
|
+
meta-llama/llama-3.1-8b-in ███████░░░░░░░░░░░░░ 37
|
|
83
|
+
mistralai/ministral-8b-251 ██████████░░░░░░░░░░ 54
|
|
84
|
+
|
|
85
|
+
Avg Duration / Run (fastest first)
|
|
86
|
+
────────────────────────────────────────────────────────────
|
|
87
|
+
meta-llama/llama-3.1-8b-in ██░░░░░░░░░░░░░░░░░░ 8.6s
|
|
88
|
+
google/gemini-2.0-flash-li ███░░░░░░░░░░░░░░░░░ 14.2s
|
|
89
|
+
openai/gpt-4.1-nano (1x) █████░░░░░░░░░░░░░░░ 20.3s
|
|
90
|
+
mistralai/ministral-8b-251 ███████░░░░░░░░░░░░░ 30.1s
|
|
91
|
+
anthropic/claude-haiku-4-5 ████████████████████ 1m24s
|
|
92
|
+
|
|
93
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
94
|
+
5 models · 5 total runs
|
|
95
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Running the real example
|
|
99
|
+
|
|
100
|
+
Copy `.env.example` to `.env` and add your [OpenRouter](https://openrouter.ai) API key:
|
|
101
|
+
|
|
102
|
+
```sh
|
|
103
|
+
cp .env.example .env
|
|
104
|
+
# edit .env and set OPENROUTER_API_KEY
|
|
105
|
+
npx tsx examples/openrouter.test.ts
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
## Roadmap
|
|
110
|
+
|
|
111
|
+
- [ ] Multi-run support: `.runs(n)` per scene for statistical significance
|
|
112
|
+
- [ ] Suite-level runs: `agent(exec, { runs: 3 }, () => { ... })` for overall stability benchmarks
|
|
113
|
+
- [ ] Additional matchers: `toBe.semanticallySimilarTo(text, threshold)`, `toBe.matchingSchema(zodSchema)`
|
|
114
|
+
- [ ] JSON/file reporters for persisting reports to disk
|
|
115
|
+
- [ ] Snapshot comparison: diff reports across runs to track agent regression
|
|
116
|
+
- [ ] More adapters: Vercel AI SDK, OpenAI Agents SDK, raw API calls
|
|
117
|
+
|
|
118
|
+
## Development requirements
|
|
119
|
+
- Node 22+
|
|
120
|
+
- pnpm
|
|
121
|
+
|
|
122
|
+
## Build
|
|
123
|
+
|
|
124
|
+
```sh
|
|
125
|
+
pnpm install
|
|
126
|
+
pnpm build
|
|
127
|
+
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { langchain } from "./langchain";
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { langchain } from "./langchain";
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { AgentExecutor } from "../types";
|
|
2
|
+
type Runnable = {
|
|
3
|
+
invoke: (input: any) => Promise<any>;
|
|
4
|
+
};
|
|
5
|
+
type LangGraphGraph = Runnable & {
|
|
6
|
+
lg_is_pregel: true;
|
|
7
|
+
nodes?: Record<string, any>;
|
|
8
|
+
};
|
|
9
|
+
type LangChainReactAgent = Runnable & {
|
|
10
|
+
options: {
|
|
11
|
+
model?: string | any;
|
|
12
|
+
tools?: any[];
|
|
13
|
+
systemPrompt?: string;
|
|
14
|
+
prompt?: string;
|
|
15
|
+
};
|
|
16
|
+
};
|
|
17
|
+
type SimpleChain = Runnable & {
|
|
18
|
+
steps?: any[];
|
|
19
|
+
};
|
|
20
|
+
/**
|
|
21
|
+
* Adapter for LangChain runnables and agents.
|
|
22
|
+
*
|
|
23
|
+
* Supported inputs:
|
|
24
|
+
* - `createAgent(...)` from `langchain` — meta extracted from `agent.options`
|
|
25
|
+
* - `createReactAgent(...)` from `@langchain/langgraph/prebuilt` — tools from
|
|
26
|
+
* `graph.nodes.tools`, model from response_metadata
|
|
27
|
+
* - Simple chain (`prompt.pipe(model)`) — meta extracted from `steps[]`
|
|
28
|
+
*/
|
|
29
|
+
export declare function langchain(runnable: LangGraphGraph | LangChainReactAgent | SimpleChain): AgentExecutor;
|
|
30
|
+
export {};
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adapter for LangChain runnables and agents.
|
|
3
|
+
*
|
|
4
|
+
* Supported inputs:
|
|
5
|
+
* - `createAgent(...)` from `langchain` — meta extracted from `agent.options`
|
|
6
|
+
* - `createReactAgent(...)` from `@langchain/langgraph/prebuilt` — tools from
|
|
7
|
+
* `graph.nodes.tools`, model from response_metadata
|
|
8
|
+
* - Simple chain (`prompt.pipe(model)`) — meta extracted from `steps[]`
|
|
9
|
+
*/
|
|
10
|
+
export function langchain(runnable) {
    // Shape checks run in priority order: LangGraph graph, then an agent
    // exposing `options`, and finally the plain-chain fallback.
    const adapt = isLangGraphGraph(runnable)
        ? langGraphAdapter
        : isReactAgent(runnable)
            ? reactAgentAdapter
            : chainAdapter;
    return adapt(runnable);
}
|
|
19
|
+
/**
 * Builds an executor around a LangGraph graph.
 *
 * The returned function sends the prompt as a single `HumanMessage`, then
 * reports the content of the last message in `result.messages` as `text`.
 * Failures while importing `@langchain/core/messages` or invoking the graph
 * are returned as `executionError` instead of being thrown.
 *
 * @param graph Object with an `invoke` method (and optionally `nodes`).
 * @returns Async function `(input) => { text, executionError?, metadata }`.
 */
function langGraphAdapter(graph) {
    // Tool names are read once from the graph and reused for every call.
    const staticTools = extractGraphTools(graph);
    return async (input) => {
        let result;
        try {
            const { HumanMessage } = await import("@langchain/core/messages");
            result = await graph.invoke({ messages: [new HumanMessage(input)] });
        }
        catch (err) {
            // Non-Error throws (strings, plain objects) have no `.message`;
            // always produce a non-empty description so the failure is not
            // mistaken for a successful, empty response.
            const reason = err instanceof Error ? err.message : String(err);
            return { text: "", executionError: reason || "Unknown error", metadata: { tools: staticTools } };
        }
        // Tolerate results that carry no `messages` array: fall back to
        // serialising the whole result below.
        const messages = Array.isArray(result?.messages) ? result.messages : [];
        const last = messages[messages.length - 1];
        const text = typeof last?.content === "string"
            ? last.content
            : (JSON.stringify(last?.content ?? result) ?? "");
        const model = last?.response_metadata?.model_name;
        return {
            text,
            metadata: { model, tools: staticTools, tokens: extractTokensFromMessage(last) },
        };
    };
}
|
|
42
|
+
/**
 * Builds an executor around an agent that exposes its configuration on
 * `agent.options` (model, tools, systemPrompt / prompt).
 *
 * Metadata is read once from `agent.options`; each call sends the prompt as a
 * single human message and reports the content of the last returned message.
 * Invocation failures are returned as `executionError` rather than thrown.
 *
 * @param agent Object with `invoke` and an `options` object.
 * @returns Async function `(input) => { text, executionError?, metadata }`.
 */
function reactAgentAdapter(agent) {
    const options = agent.options;
    // `model` may be a plain identifier or a model object carrying its name.
    const model = typeof options.model === "string"
        ? options.model
        : options.model?.modelName ?? options.model?.model;
    const systemPrompt = options.systemPrompt ?? options.prompt ?? undefined;
    // Only arrays are mapped; anything else is reported as "no tool list".
    const tools = Array.isArray(options.tools)
        ? options.tools.map((t) => t?.name ?? t?.getName?.()).filter(Boolean)
        : undefined;
    return async (input) => {
        let result;
        try {
            result = await agent.invoke({ messages: [{ role: "human", content: input }] });
        }
        catch (err) {
            // Non-Error throws have no `.message`; always return a non-empty
            // description so the failure cannot look like a success.
            const reason = err instanceof Error ? err.message : String(err);
            return { text: "", executionError: reason || "Unknown error", metadata: { model, systemPrompt, tools } };
        }
        // Tolerate results without a `messages` array by serialising the
        // whole result instead of crashing on `undefined.length`.
        const messages = Array.isArray(result?.messages) ? result.messages : [];
        const last = messages[messages.length - 1];
        const text = typeof last?.content === "string"
            ? last.content
            : (JSON.stringify(last?.content ?? result) ?? "");
        return {
            text,
            metadata: { model, systemPrompt, tools, tokens: extractTokensFromMessage(last) },
        };
    };
}
|
|
69
|
+
/**
 * Builds an executor around a plain chain (anything with `invoke`).
 *
 * The chain is invoked with `{ input }`. The response text is taken from the
 * result itself when it is a string, else from `result.output`, else from
 * `result.content`, else from the JSON serialisation of the result.
 * Invocation failures are returned as `executionError` rather than thrown.
 *
 * @param chain Object with an `invoke` method (and optionally `steps`).
 * @returns Async function `(input) => { text, executionError?, metadata }`.
 */
function chainAdapter(chain) {
    const { model, systemPrompt } = extractChainMeta(chain);
    return async (input) => {
        let result;
        try {
            result = await chain.invoke({ input });
        }
        catch (err) {
            // Non-Error throws have no `.message`; always return a non-empty
            // description so the failure cannot look like a success.
            const reason = err instanceof Error ? err.message : String(err);
            return { text: "", executionError: reason || "Unknown error", metadata: { model, systemPrompt } };
        }
        let text;
        if (typeof result === "string") {
            text = result;
        }
        else if (typeof result?.output === "string") {
            text = result.output;
        }
        else if (typeof result?.content === "string") {
            text = result.content;
        }
        else {
            // JSON.stringify(undefined) is undefined; keep `text` a string.
            text = JSON.stringify(result) ?? "";
        }
        return {
            text,
            metadata: {
                model: model ?? result?.metadata?.model,
                systemPrompt,
                // A null/undefined result carries no usage information.
                tokens: result == null ? undefined : extractTokens(result),
            },
        };
    };
}
|
|
96
|
+
// True only when the runnable carries the `lg_is_pregel` marker set to
// exactly `true`.
function isLangGraphGraph(r) {
    const marker = r.lg_is_pregel;
    return marker === true;
}
|
|
99
|
+
/**
 * True when the runnable exposes a plain (non-array, non-null) `options`
 * object. `null` must be excluded explicitly because `typeof null` is
 * "object" and the agent adapter dereferences `options` immediately.
 */
function isReactAgent(r) {
    const options = r?.options;
    return typeof options === "object" && options !== null && !Array.isArray(options);
}
|
|
102
|
+
// Collects tool names from `graph.nodes.tools.bound.tools`.
// Returns undefined when that list is missing, not an array, or empty.
function extractGraphTools(graph) {
    const bound = graph.nodes?.["tools"]?.bound?.tools;
    const hasTools = Array.isArray(bound) && bound.length !== 0;
    if (!hasTools) {
        return undefined;
    }
    // Keep only tools that yield a truthy name.
    return bound.flatMap((tool) => {
        const name = tool.name ?? tool.getName?.();
        return name ? [name] : [];
    });
}
|
|
108
|
+
// Walks `chain.steps` (recursing into nested `steps`) and returns the first
// model identifier and the first system prompt template it finds.
function extractChainMeta(chain) {
    const isSystemMessage = (msg) => {
        const ctorName = msg?.constructor?.name ?? "";
        return ctorName.toLowerCase().includes("system");
    };
    const scan = (steps) => {
        let model;
        let systemPrompt;
        for (const step of steps ?? []) {
            const stepHasModel = step.modelName || step.model;
            if (!model && stepHasModel) {
                model = step.modelName ?? step.model;
            }
            if (!systemPrompt && Array.isArray(step.promptMessages)) {
                // Only the first system-like message is consulted.
                const systemMsg = step.promptMessages.find(isSystemMessage);
                if (systemMsg !== undefined) {
                    systemPrompt = systemMsg?.prompt?.template ?? systemMsg?.template;
                }
            }
            if (step.steps) {
                const inner = scan(step.steps);
                model ??= inner.model;
                systemPrompt ??= inner.systemPrompt;
            }
        }
        return { model, systemPrompt };
    };
    return scan(chain.steps ?? []);
}
|
|
135
|
+
// Reads token usage from a chain result. Usage is looked up, in order, on
// `usage_metadata`, `metadata.tokenUsage`, `metadata.usage` and
// `llmOutput.tokenUsage`. Returns undefined when none is present.
function extractTokens(result) {
    const meta = result.metadata;
    const usage = result.usage_metadata ??
        meta?.tokenUsage ??
        meta?.usage ??
        result.llmOutput?.tokenUsage;
    if (usage) {
        const input = usage.input_tokens ?? usage.promptTokens ?? usage.prompt_tokens ?? 0;
        const output = usage.output_tokens ?? usage.completionTokens ?? usage.completion_tokens ?? 0;
        return { input, output };
    }
    return undefined;
}
|
|
147
|
+
/**
 * Reads token usage from a single message.
 *
 * Usage is looked up on `usage_metadata`, then `response_metadata.usage`,
 * then `response_metadata.tokenUsage`. Counters are accepted in snake_case
 * (`input_tokens`, `prompt_tokens`, ...) and camelCase (`promptTokens`,
 * `completionTokens`), matching the spellings `extractTokens` accepts.
 *
 * @param msg Message object; may be undefined.
 * @returns `{ input, output }`, or undefined when no usage is present.
 */
function extractTokensFromMessage(msg) {
    const usage = msg?.usage_metadata ??
        msg?.response_metadata?.usage ??
        msg?.response_metadata?.tokenUsage;
    if (!usage)
        return undefined;
    return {
        // camelCase spellings are consulted last so previously recognised
        // fields keep their precedence.
        input: usage.input_tokens ?? usage.prompt_tokens ?? usage.promptTokens ?? 0,
        output: usage.output_tokens ?? usage.completion_tokens ?? usage.completionTokens ?? 0,
    };
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export interface AgentMatchers {
|
|
2
|
+
refusal(): void;
|
|
3
|
+
notRefusal(): void;
|
|
4
|
+
containing(text: string): void;
|
|
5
|
+
matchingPattern(regex: RegExp): void;
|
|
6
|
+
}
|
|
7
|
+
export interface AgentExpectation {
|
|
8
|
+
readonly toBe: AgentMatchers;
|
|
9
|
+
}
|
|
10
|
+
export declare function expect(value: unknown): AgentExpectation;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { isRefusal } from "./refusal";
|
|
2
|
+
/**
 * Wraps a value extracted from an agent response and exposes matchers under
 * `.toBe`. Every matcher throws an `Error` describing the mismatch and
 * returns nothing on success.
 *
 * @param value The value under test (usually the response text).
 * @returns Object with a `toBe` getter exposing `refusal`, `notRefusal`,
 *          `containing(text)` and `matchingPattern(regex)`.
 */
export function expect(value) {
    // Text form used by the string matchers.
    const asText = () => (typeof value === "string" ? value : String(value));
    // Short preview for error messages. JSON.stringify returns undefined for
    // `undefined`/functions and throws on cyclic values or BigInt, so fall
    // back to String() instead of failing with an unrelated TypeError.
    const preview = () => {
        if (typeof value === "string") {
            return value.slice(0, 100);
        }
        let serialized;
        try {
            serialized = JSON.stringify(value);
        }
        catch {
            serialized = undefined;
        }
        return (serialized ?? String(value)).slice(0, 100);
    };
    return {
        get toBe() {
            return {
                refusal() {
                    if (!isRefusal(value)) {
                        throw new Error(`Expected a refusal but got: "${preview()}"`);
                    }
                },
                notRefusal() {
                    if (isRefusal(value)) {
                        throw new Error(`Expected a non-refusal response but got: "${preview()}"`);
                    }
                },
                containing(text) {
                    // Case-insensitive substring check.
                    const actual = asText();
                    if (!actual.toLowerCase().includes(text.toLowerCase())) {
                        throw new Error(`Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
                    }
                },
                matchingPattern(regex) {
                    const actual = asText();
                    // `test` on a global/sticky regex resumes from lastIndex,
                    // so a regex reused across scenes could fail spuriously.
                    // Reset before and after the match.
                    regex.lastIndex = 0;
                    const matched = regex.test(actual);
                    regex.lastIndex = 0;
                    if (!matched) {
                        throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
                    }
                },
            };
        },
    };
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
|
|
2
|
+
export declare class SceneBuilder {
|
|
3
|
+
private _prompt;
|
|
4
|
+
private _assertions;
|
|
5
|
+
constructor(_prompt: string);
|
|
6
|
+
expect(field: string, fn: (value: any) => void): SceneBuilder;
|
|
7
|
+
toDefinition(): SceneDefinition;
|
|
8
|
+
}
|
|
9
|
+
export declare class AgentContext {
|
|
10
|
+
private _executor;
|
|
11
|
+
private _scenes;
|
|
12
|
+
constructor(_executor: AgentExecutor);
|
|
13
|
+
registerScene(prompt: string): SceneBuilder;
|
|
14
|
+
execute(): Promise<AgentReport>;
|
|
15
|
+
}
|
|
16
|
+
export declare function setContext(ctx: AgentContext | null): void;
|
|
17
|
+
export declare function getContext(): AgentContext;
|
package/dist/context.js
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { createHash } from "crypto";
|
|
2
|
+
import { executeScene } from "./runner";
|
|
3
|
+
import { formatReport, writeReport } from "./reporter";
|
|
4
|
+
import { logger, c } from "./logger";
|
|
5
|
+
// Fluent builder for one scene: a prompt plus the assertions to run against
// the agent's response to it.
export class SceneBuilder {
    _prompt;
    _assertions = [];
    constructor(_prompt) {
        this._prompt = _prompt;
    }
    // Queues an assertion for `field`; returns the builder for chaining.
    expect(field, fn) {
        const assertion = { field, fn };
        this._assertions.push(assertion);
        return this;
    }
    // Snapshot of the scene; the assertions array is a shallow copy.
    toDefinition() {
        const assertions = this._assertions.slice();
        return { prompt: this._prompt, assertions };
    }
}
|
|
19
|
+
// Collects the scenes registered for one executor and runs them in order,
// logging progress and producing the aggregated report.
export class AgentContext {
    _executor;
    _scenes = [];
    constructor(_executor) {
        this._executor = _executor;
    }
    // Creates a builder for `prompt`, remembers it, and hands it back.
    registerScene(prompt) {
        const builder = new SceneBuilder(prompt);
        this._scenes.push(builder);
        return builder;
    }
    // Runs every registered scene sequentially, prints per-scene results,
    // then formats, prints and writes the report. Resolves to the report.
    async execute() {
        const scenes = this._scenes.map((builder) => builder.toDefinition());
        const total = scenes.length;
        const plural = total !== 1 ? "s" : "";
        logger.info(c.bold(`\nRunning ${total} scene${plural}...\n`));
        const results = [];
        let totalDuration = 0;
        for (const [index, scene] of scenes.entries()) {
            // Long prompts are shortened to 57 characters plus an ellipsis.
            const label = scene.prompt.length > 60
                ? scene.prompt.slice(0, 57) + "..."
                : scene.prompt;
            logger.write(` ${c.cyan(`[${index + 1}/${total}]`)} ${label} ... `);
            const result = await executeScene(this._executor, scene);
            results.push(result);
            totalDuration += result.duration;
            const ms = result.duration.toFixed(0);
            const verdict = result.passed ? c.green(`PASS`) : c.red(`FAIL`);
            logger.info(verdict + c.dim(` (${ms}ms)`));
            if (!result.passed && result.error) {
                logger.info(` ${c.red(result.error)}`);
            }
            logger.debug(` response: ${result.response.text?.slice(0, 120)}`);
        }
        logger.info("");
        const failures = results.filter((r) => !r.passed);
        const failedCases = failures.map((r) => r.prompt);
        // Error text keyed by prompt; only failures that carry an error.
        const failedCaseErrors = {};
        for (const failure of failures) {
            if (failure.error) {
                failedCaseErrors[failure.prompt] = failure.error;
            }
        }
        const passedCount = results.length - failures.length;
        const successRate = results.length > 0
            ? Number((passedCount / results.length).toFixed(2))
            : 0;
        // Token averages cover only the results that reported token usage.
        const withTokens = results.filter((r) => r.response.metadata?.tokens != null);
        let averageInputTokensPerCase;
        let averageOutputTokensPerCase;
        if (withTokens.length > 0) {
            const mean = (pick) => Math.round(withTokens.reduce((sum, r) => sum + (pick(r.response.metadata.tokens) ?? 0), 0) / withTokens.length);
            averageInputTokensPerCase = mean((tokens) => tokens.input);
            averageOutputTokensPerCase = mean((tokens) => tokens.output);
        }
        // Model / prompt / tools come from the first result that has metadata.
        const firstMeta = results.find((r) => r.response.metadata)?.response
            .metadata;
        const systemPromptHash = firstMeta?.systemPrompt
            ? hashPrompt(firstMeta.systemPrompt)
            : undefined;
        const report = {
            model: firstMeta?.model,
            systemPromptHash,
            tools: firstMeta?.tools,
            successRate,
            failedCases,
            failedCaseErrors,
            timestamp: new Date().toISOString(),
            duration: Math.round(totalDuration),
            totalCases: results.length,
            averageInputTokensPerCase,
            averageOutputTokensPerCase,
            results,
        };
        const formatted = formatReport(report);
        logger.info(formatted);
        const filepath = await writeReport(formatted, report.timestamp);
        logger.info(`\n${c.dim("Report saved to:")} ${c.cyan(filepath)}`);
        return report;
    }
}
|
|
101
|
+
// Short fingerprint of a prompt: the first 12 hex characters of its SHA-256.
function hashPrompt(prompt) {
    const hasher = createHash("sha256");
    hasher.update(prompt);
    const hex = hasher.digest("hex");
    return hex.substring(0, 12);
}
|
|
104
|
+
// Module-level slot holding the context that scene registration targets.
let currentContext = null;
// Installs `ctx` as the active context (pass null to clear it).
export function setContext(ctx) {
    currentContext = ctx;
}
// Returns the active context, or throws when none is installed.
export function getContext() {
    if (currentContext) {
        return currentContext;
    }
    throw new Error("scene() must be called inside an agent() callback");
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { AgentExecutor, AgentReport } from "./types";
|
|
2
|
+
import { SceneBuilder } from "./context";
|
|
3
|
+
export { expect } from "./assertions";
|
|
4
|
+
export { logger } from "./logger";
|
|
5
|
+
export type { LogLevel } from "./logger";
|
|
6
|
+
export type { AgentExpectation, AgentMatchers } from "./assertions";
|
|
7
|
+
export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, } from "./types";
|
|
8
|
+
export declare function scene(prompt: string): SceneBuilder;
|
|
9
|
+
export declare function agent(executor: AgentExecutor, fn: () => void): Promise<AgentReport>;
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { AgentContext, setContext, getContext } from "./context";
|
|
2
|
+
export { expect } from "./assertions";
|
|
3
|
+
export { logger } from "./logger";
|
|
4
|
+
// Registers a scene for `prompt` on the currently active agent context and
// returns its builder.
export function scene(prompt) {
    const activeContext = getContext();
    return activeContext.registerScene(prompt);
}
|
|
7
|
+
/**
 * Declares and runs a suite of scenes against `executor`.
 *
 * `fn` is called with a fresh context installed so that `scene()` calls made
 * inside it are registered on this suite. If `fn` returns a thenable (for
 * example an `async` callback), it is awaited before the context is cleared,
 * so scenes registered after an `await` are kept and a rejection propagates
 * to the caller instead of becoming an unhandled rejection. A synchronous
 * `fn` behaves as before.
 *
 * The context is cleared even when `fn` throws; in that case nothing is
 * executed and the error propagates.
 *
 * @param executor Function invoked with each scene prompt.
 * @param fn       Callback that registers scenes via `scene()`.
 * @returns Promise resolving to the report produced by the context.
 */
export async function agent(executor, fn) {
    const ctx = new AgentContext(executor);
    setContext(ctx);
    try {
        const pending = fn();
        // Only await real thenables so the synchronous path is unchanged.
        if (pending != null && typeof pending.then === "function") {
            await pending;
        }
    }
    finally {
        setContext(null);
    }
    return ctx.execute();
}
|
package/dist/logger.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
export type LogLevel = "silent" | "normal" | "verbose";
|
|
2
|
+
export declare const c: {
|
|
3
|
+
reset: (s: string) => string;
|
|
4
|
+
bold: (s: string) => string;
|
|
5
|
+
dim: (s: string) => string;
|
|
6
|
+
green: (s: string) => string;
|
|
7
|
+
red: (s: string) => string;
|
|
8
|
+
yellow: (s: string) => string;
|
|
9
|
+
cyan: (s: string) => string;
|
|
10
|
+
gray: (s: string) => string;
|
|
11
|
+
};
|
|
12
|
+
declare class Logger {
|
|
13
|
+
private _level;
|
|
14
|
+
setLevel(level: LogLevel): void;
|
|
15
|
+
getLevel(): LogLevel;
|
|
16
|
+
info(msg: string): void;
|
|
17
|
+
debug(msg: string): void;
|
|
18
|
+
write(msg: string): void;
|
|
19
|
+
}
|
|
20
|
+
export declare const logger: Logger;
|
|
21
|
+
export {};
|
package/dist/logger.js
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
// Numeric rank of each log level; higher means more output.
const LEVELS = { silent: 0, normal: 1, verbose: 2 };
const ESC = "\x1b";
const RESET = `${ESC}[0m`;
// Builds a styler that wraps text in the given SGR code followed by a reset.
const sgr = (code) => (s) => `${ESC}[${code}m${s}${RESET}`;
// ANSI styling helpers used for console output.
export const c = {
    reset: (s) => `${RESET}${s}${RESET}`,
    bold: sgr(1),
    dim: sgr(2),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    gray: sgr(90),
};
|
|
18
|
+
// Level-gated console logger. The level starts at "normal".
class Logger {
    _level = "normal";
    setLevel(level) {
        this._level = level;
    }
    getLevel() {
        return this._level;
    }
    // Printed at "normal" and above (i.e. unless silent).
    info(msg) {
        const enabled = LEVELS[this._level] >= LEVELS.normal;
        if (!enabled) {
            return;
        }
        console.log(msg);
    }
    // Printed only at "verbose"; rendered in gray.
    debug(msg) {
        const enabled = LEVELS[this._level] >= LEVELS.verbose;
        if (!enabled) {
            return;
        }
        console.log(c.gray(msg));
    }
    // Writes to stdout without a trailing newline; "normal" and above.
    write(msg) {
        const enabled = LEVELS[this._level] >= LEVELS.normal;
        if (!enabled) {
            return;
        }
        process.stdout.write(msg);
    }
}
// Shared logger instance.
export const logger = new Logger();
|
package/dist/refusal.js
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
const REFUSAL_PATTERNS = [
    // Explicit inability
    /\bi can'?t\b/i,
    /\bi cannot\b/i,
    /\bi'?m (sorry|unable|not able)\b/i,
    /\bunable to (help|assist|answer|address|respond)\b/i,
    /\bwon'?t be able\b/i,
    // Declination
    /\brefuse\b/i,
    /\bdecline\b/i,
    /\bnot (allowed|permitted|able) to\b/i,
    // Domain restriction (the most common pattern from real LLMs given a scoped system prompt)
    /\bonly (able|here|designed|meant|built|intended) to\b/i,
    /\bcan only (help|assist|answer|address|respond|discuss)\b/i,
    /\bnot (designed|built|meant|here|intended) to\b/i,
    /\bspecializ(e|es|ed|ing)\b/i,
    /\bmy (focus|purpose|role|expertise|area|specialty) (is|are)\b/i,
    /\bfocus(ed)? on (language|languages)\b/i,
    /\boutside (of )?my (scope|capabilities|abilities|expertise|focus|area)\b/i,
    /\bbeyond (my|the) (scope|expertise|capabilities)\b/i,
    /\boff[- ]topic\b/i,
    /\bnot (something|a topic) I can\b/i,
    /\bnot (related|relevant) to\b/i,
    // Polite steering
    /\bstick(ing)? to\b/i,
    /\bhere to (help|assist) with\b/i,
];
// Typographic apostrophes, folded to "'" before matching so that text such
// as "I can’t" hits the same patterns as "I can't".
const TYPOGRAPHIC_APOSTROPHES = /[\u2018\u2019\u02BC]/g;
/**
 * Heuristic refusal detector. This is likely to be replaced by an
 * LLM-as-a-judge check.
 *
 * - An object with a `refusal` property is a refusal only when that property
 *   is exactly `true`.
 * - Any other value is converted to a string and counts as a refusal when it
 *   matches at least one entry of `REFUSAL_PATTERNS`.
 *
 * @param value Response text, or an object carrying a `refusal` flag.
 * @returns boolean
 */
export function isRefusal(value) {
    if (typeof value === "object" && value !== null && "refusal" in value) {
        return value.refusal === true;
    }
    const raw = typeof value === "string" ? value : String(value);
    const text = raw.replace(TYPOGRAPHIC_APOSTROPHES, "'");
    return REFUSAL_PATTERNS.some((p) => p.test(text));
}
|
package/dist/reporter.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "fs/promises";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
/**
 * Renders a report as the text block that is printed and written to disk.
 *
 * String values (model, failed prompts, failure reasons, timestamp) are
 * emitted through `JSON.stringify`, so embedded double quotes, backslashes
 * and line breaks are escaped instead of breaking the quoted value. Values
 * without such characters render exactly as a plain `"value"`.
 *
 * Token averages are included only when present on the report.
 *
 * @param report Aggregated report object.
 * @returns The formatted report, lines joined with "\n".
 */
export function formatReport(report) {
    const quoted = (value) => JSON.stringify(String(value));
    const failedCases = report.failedCases ?? [];
    const lines = [
        "agent:",
        ` model: ${quoted(report.model ?? "unknown")}`,
        ` system_prompt: ${report.systemPromptHash ?? "<unknown>"}`,
        ` tools: ${JSON.stringify(report.tools ?? [])}`,
        ` success_rate: ${report.successRate}`,
        ` failed_cases_count: ${failedCases.length}`,
        ` failed_cases:`,
    ];
    if (failedCases.length === 0) {
        lines.push(" (none)");
    }
    else {
        for (const prompt of failedCases) {
            lines.push(` - ${quoted(prompt)}`);
            // Reasons are optional: only failures with an error have one.
            const reason = report.failedCaseErrors?.[prompt];
            if (reason) {
                lines.push(` reason: ${quoted(reason)}`);
            }
        }
    }
    lines.push(` timestamp: ${quoted(report.timestamp)}`, ` duration: ${report.duration}`, ` total_cases: ${report.totalCases}`);
    if (report.averageInputTokensPerCase != null) {
        lines.push(` average_input_tokens_per_case: ${report.averageInputTokensPerCase}`);
    }
    if (report.averageOutputTokensPerCase != null) {
        lines.push(` average_output_tokens_per_case: ${report.averageOutputTokensPerCase}`);
    }
    return lines.join("\n");
}
|
|
34
|
+
// Writes `content` to `<cwd>/reports/report-<timestamp>.yaml`, creating the
// directory if needed, and resolves to the file path. Every ":" and "." in
// the timestamp is replaced with "-" for the file name.
export async function writeReport(content, timestamp) {
    const targetDir = join(process.cwd(), "reports");
    await mkdir(targetDir, { recursive: true });
    const stamp = timestamp.replace(/[:.]/g, "-");
    const target = join(targetDir, `report-${stamp}.yaml`);
    await writeFile(target, content, "utf-8");
    return target;
}
|
package/dist/runner.d.ts
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import type { AgentExecutor, AgentResponse, SceneDefinition, SceneResult } from "./types";
|
|
2
|
+
export declare function extractField(response: AgentResponse, field: string): unknown;
|
|
3
|
+
export declare function executeScene(executor: AgentExecutor, scene: SceneDefinition): Promise<SceneResult>;
|
package/dist/runner.js
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
// Resolves the value an assertion targets: "response" is the text,
// "metadata" and "refusal" are the same-named properties, and any other
// field name is looked up inside `response.metadata`.
export function extractField(response, field) {
    if (field === "response") {
        return response.text;
    }
    if (field === "metadata") {
        return response.metadata;
    }
    if (field === "refusal") {
        return response.refusal;
    }
    return response.metadata?.[field];
}
|
|
13
|
+
/**
 * Runs one scene: calls the executor with the scene prompt, then runs the
 * scene's assertions in order against the response, stopping at the first
 * failure.
 *
 * The scene is reported as failed (never thrown) when:
 * - the executor throws or rejects (any thrown value, not only `Error`);
 * - the executor resolves to null/undefined;
 * - the response carries a truthy `executionError`;
 * - an assertion throws, or returns a promise that rejects.
 *
 * `duration` is the elapsed time of the executor call in milliseconds and is
 * measured for failed calls as well.
 *
 * @param executor Function called with `scene.prompt`.
 * @param scene    `{ prompt, assertions: [{ field, fn }] }`.
 * @returns Promise of `{ prompt, response, duration, passed, error }`.
 */
export async function executeScene(executor, scene) {
    // Thrown values are not guaranteed to be Errors; always produce a
    // non-empty description.
    const describe = (err) => {
        const message = err instanceof Error ? err.message : String(err);
        return message || "Unknown error";
    };
    const failure = (reason, duration) => ({
        prompt: scene.prompt,
        response: { text: "", executionError: reason },
        duration,
        passed: false,
        error: reason,
    });
    const start = performance.now();
    let response;
    try {
        response = await executor(scene.prompt);
    }
    catch (err) {
        return failure(describe(err), performance.now() - start);
    }
    const duration = performance.now() - start;
    if (response == null) {
        return failure("Executor returned no response", duration);
    }
    if (response.executionError) {
        return {
            prompt: scene.prompt,
            response,
            duration,
            passed: false,
            error: response.executionError,
        };
    }
    let passed = true;
    let error;
    for (const assertion of scene.assertions) {
        try {
            const value = extractField(response, assertion.field);
            // Awaiting is a no-op for synchronous assertions and surfaces
            // rejections from async ones.
            await assertion.fn(value);
        }
        catch (err) {
            passed = false;
            error = describe(err);
            break;
        }
    }
    return { prompt: scene.prompt, response, duration, passed, error };
}
|
package/dist/stats.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/stats.js
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import { readdir, readFile } from "fs/promises";
|
|
2
|
+
import { join, relative } from "path";
|
|
3
|
+
/**
 * Reads the value of an indented `key: value` line from report text.
 *
 * The key is matched literally (regex metacharacters are escaped) at the
 * start of a line after one or more spaces or tabs. Surrounding whitespace is
 * removed before the enclosing double quotes are stripped, so a value
 * followed by trailing spaces does not keep a dangling quote.
 *
 * @param content Report text.
 * @param key     Field name to look up.
 * @returns The unquoted, trimmed value, or undefined when the key is absent.
 */
function extractField(content, key) {
    const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const regex = new RegExp(`^[ \\t]+${escapedKey}:\\s*(.+)$`, "m");
    const match = content.match(regex);
    if (!match)
        return undefined;
    return match[1].trim().replace(/^"|"$/g, "").trim();
}
|
|
10
|
+
// Builds a summary record from report text. Numeric fields default to 0 when
// absent; the token averages stay undefined when absent. `source` is passed
// through unchanged.
function parseReport(content, source) {
    const read = (key) => extractField(content, key);
    const numeric = (key, fallback = 0) => parseFloat(read(key) ?? String(fallback));
    const optionalNumeric = (key) => {
        const raw = read(key);
        return raw != null ? parseFloat(raw) : undefined;
    };
    const averageInputTokensPerCase = optionalNumeric("average_input_tokens_per_case");
    const averageOutputTokensPerCase = optionalNumeric("average_output_tokens_per_case");
    return {
        model: read("model") ?? "unknown",
        successRate: numeric("success_rate"),
        totalCases: numeric("total_cases"),
        duration: numeric("duration"),
        timestamp: read("timestamp") ?? "",
        averageInputTokensPerCase,
        averageOutputTokensPerCase,
        source,
    };
}
|
|
25
|
+
/** Directory names that are never searched for reports. */
const SKIPPED_REPORT_DIRS = new Set(["node_modules", "dist", ".git", ".pnpm"]);
/**
 * Recursively collects the paths of YAML files that sit directly inside any
 * directory named `reports` below `dir`.
 *
 * Entries whose name starts with "." or appears in `SKIPPED_REPORT_DIRS` are
 * ignored. A `reports` directory is listed but not descended into further.
 * Directories that cannot be read are treated as empty rather than aborting
 * the whole search.
 *
 * @param {string} dir Directory to search.
 * @param {number} [depth=0] Current recursion depth; the search stops once it
 *   exceeds 6.
 * @returns {Promise<string[]>} Paths of the `.yaml` / `.yml` files found.
 */
async function findReports(dir, depth = 0) {
    if (depth > 6)
        return [];
    let entries;
    try {
        entries = await readdir(dir, { withFileTypes: true });
    }
    catch {
        return [];
    }
    const results = [];
    for (const entry of entries) {
        if (entry.name.startsWith(".") || SKIPPED_REPORT_DIRS.has(entry.name))
            continue;
        if (!entry.isDirectory())
            continue;
        const fullPath = join(dir, entry.name);
        if (entry.name !== "reports") {
            results.push(...(await findReports(fullPath, depth + 1)));
            continue;
        }
        let files;
        try {
            files = await readdir(fullPath, { withFileTypes: true });
        }
        catch {
            // Same policy as the outer listing: an unreadable directory is skipped.
            continue;
        }
        for (const file of files) {
            // A sub-directory that merely ends in .yaml/.yml is not a report file.
            if (file.isDirectory())
                continue;
            if (file.name.endsWith(".yaml") || file.name.endsWith(".yml")) {
                results.push(join(fullPath, file.name));
            }
        }
    }
    return results;
}
|
|
57
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} nums Values to average.
 * @returns {number | undefined} The mean, or `undefined` for an empty list.
 */
function avg(nums) {
    if (nums.length === 0) {
        return undefined;
    }
    let total = 0;
    nums.forEach((n) => {
        total += n;
    });
    return total / nums.length;
}
|
|
62
|
+
/**
 * Renders a fixed-width text progress bar.
 *
 * The filled share is `value / max`, clamped to the range 0..1 so that
 * out-of-range input (a value above `max`, a negative value, or `NaN`) still
 * yields a bar of exactly `width` characters instead of throwing from
 * `String.prototype.repeat` or collapsing to an empty string.
 *
 * @param {number} value Quantity to plot.
 * @param {number} max Quantity that corresponds to a completely filled bar.
 * @param {number} [width=20] Total number of characters in the bar.
 * @returns {string} `width` characters of "█" (filled) followed by "░" (empty).
 */
function bar(value, max, width = 20) {
    if (max === 0)
        return "░".repeat(width);
    const ratio = value / max;
    // NaN counts as "nothing filled"; everything else is clamped into 0..1.
    const share = Number.isNaN(ratio) ? 0 : Math.min(Math.max(ratio, 0), 1);
    const filled = Math.round(share * width);
    return "█".repeat(filled) + "░".repeat(width - filled);
}
|
|
68
|
+
/** Character width used for the horizontal rules of the printed output. */
const W = 62;
/**
 * Writes one titled chart section to the console: a heading, a divider that
 * is two characters shorter than `W`, then one line per row.
 *
 * @param {string} title Heading printed above the rows.
 * @param {{label: string, value: number, display: string}[]} rows Rows to
 *   print; each label is cut or padded to exactly 26 characters.
 * @param {number} max Value that corresponds to a full bar for every row.
 */
function printSection(title, rows, max) {
    const divider = "─".repeat(W - 2);
    console.log(`\n ${title}`);
    console.log(` ${divider}`);
    for (const row of rows) {
        const name = row.label.slice(0, 26).padEnd(26);
        const gauge = bar(row.value, max);
        console.log([" ", name, " ", gauge, " ", row.display].join(""));
    }
}
|
|
78
|
+
/**
 * Formats a duration given in milliseconds as a short human-readable string:
 * `"<n>ms"` below one second, `"<n.n>s"` below one minute, and `"<m>m<ss>s"`
 * from one minute upwards.
 *
 * The unit is chosen from the *rounded* value so that rounding can never
 * produce an overflowing field such as "1000ms", "60.0s" or "1m60s".
 *
 * @param {number} ms Duration in milliseconds.
 * @returns {string} The formatted duration.
 */
function formatDuration(ms) {
    if (Math.round(ms) < 1000)
        return `${ms.toFixed(0)}ms`;
    // Below 59.95 s the one-decimal rendering is guaranteed to stay under "60.0".
    if (Math.round(ms / 100) < 600)
        return `${(ms / 1000).toFixed(1)}s`;
    // Round to whole seconds first, then split, so 119_999 ms becomes "2m00s".
    const totalSeconds = Math.round(ms / 1000);
    const m = Math.floor(totalSeconds / 60);
    const s = String(totalSeconds % 60).padStart(2, "0");
    return `${m}m${s}s`;
}
|
|
87
|
+
/**
 * Command-line entry point of the stats script.
 *
 * Searches the current working directory for report files, parses each one,
 * groups the results by model name and prints up to four chart sections to
 * the console: success rate, average input tokens, average output tokens (the
 * two token charts only for models that have both figures) and average
 * duration. When no report file is found it prints a hint and returns.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const root = process.cwd();
    const reportPaths = await findReports(root);
    if (reportPaths.length === 0) {
        console.log("\n No reports found. Run some agent tests first.\n");
        return;
    }
    const loadReport = async (path) => parseReport(await readFile(path, "utf-8"), relative(root, path));
    const reports = await Promise.all(reportPaths.map((path) => loadReport(path)));
    const rule = "━".repeat(W);
    const plural = (count) => (count !== 1 ? "s" : "");
    console.log("\n" + rule);
    console.log(` AGEST STATS · ${reports.length} report${plural(reports.length)} found`);
    console.log(rule);
    // Bucket the reports per model, keeping the order in which models first appear.
    const grouped = new Map();
    for (const report of reports) {
        if (!grouped.has(report.model)) {
            grouped.set(report.model, []);
        }
        grouped.get(report.model).push(report);
    }
    const isPresent = (n) => n != null;
    const agg = Array.from(grouped, ([model, runs]) => ({
        model,
        runs: runs.length,
        avgSuccessRate: avg(runs.map((r) => r.successRate)),
        avgDuration: avg(runs.map((r) => r.duration)),
        avgInputTokens: avg(runs.map((r) => r.averageInputTokensPerCase).filter(isPresent)),
        avgOutputTokens: avg(runs.map((r) => r.averageOutputTokensPerCase).filter(isPresent)),
    }));
    // Best success rate first.
    agg.sort((left, right) => right.avgSuccessRate - left.avgSuccessRate);
    const runLabel = (a) => `${a.model} (${a.runs}x)`;
    // The success-rate chart is always printed; a full bar corresponds to 1.
    printSection("Success Rate", agg.map((a) => ({
        label: runLabel(a),
        value: a.avgSuccessRate,
        display: `${(a.avgSuccessRate * 100).toFixed(0).padStart(3)}%`,
    })), 1);
    // Token charts are printed only for models that report both token averages.
    const withTokens = agg.filter((a) => a.avgInputTokens != null && a.avgOutputTokens != null);
    if (withTokens.length > 0) {
        const tokenSections = [
            ["Avg Input Tokens / Case", "avgInputTokens"],
            ["Avg Output Tokens / Case", "avgOutputTokens"],
        ];
        for (const [title, key] of tokenSections) {
            const peak = Math.max(...withTokens.map((a) => a[key]));
            printSection(title, withTokens.map((a) => ({
                label: a.model,
                value: a[key],
                display: String(Math.round(a[key])).padStart(5),
            })), peak);
        }
    }
    // Duration chart works on a copy so the success-rate ordering of `agg` is untouched.
    const fastestFirst = agg.slice().sort((left, right) => left.avgDuration - right.avgDuration);
    const slowest = Math.max(...fastestFirst.map((a) => a.avgDuration));
    printSection("Avg Duration / Run (fastest first)", fastestFirst.map((a) => ({
        label: runLabel(a),
        value: a.avgDuration,
        display: formatDuration(a.avgDuration).padStart(8),
    })), slowest);
    console.log([
        "",
        rule,
        ` ${agg.length} model${plural(agg.length)} · ${reports.length} total runs`,
        rule,
        "",
    ].join("\n"));
}
|
|
157
|
+
// Script entry: run main() and exit with status 1 if it rejects.
main().catch((err) => {
    // A rejection is not guaranteed to be an Error; fall back to its string
    // form so the output never reads "Error: undefined".
    const message = err?.message ?? String(err);
    console.error("Error:", message);
    process.exit(1);
});
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/** Async function that receives one input string and resolves to an {@link AgentResponse}. */
export type AgentExecutor = (input: string) => Promise<AgentResponse>;
|
|
2
|
+
/** Value an {@link AgentExecutor} resolves to for a single input. */
export interface AgentResponse {
    /** Response text. */
    text: string;
    /** Optional refusal flag — presumably true when the agent declined to answer; confirm against the refusal module. */
    refusal?: boolean;
    /**
     * Optional error description. When this is set, the runner reports the
     * scene as failed with this string as its error and does not evaluate the
     * scene's assertions.
     */
    executionError?: string;
    /** Optional extra information about the call; keys beyond the named ones are allowed. */
    metadata?: {
        /** Model name, when known. */
        model?: string;
        /** Token counts for the call, split into input and output. */
        tokens?: {
            input: number;
            output: number;
        };
        /** Tool names — presumably those available to or used by the agent; TODO confirm. */
        tools?: string[];
        /** System prompt text, when known. */
        systemPrompt?: string;
        /** Any additional metadata, untyped. */
        [key: string]: unknown;
    };
}
|
|
17
|
+
/** One test scene: a prompt together with the checks to run on the response. */
export interface SceneDefinition {
    /** Prompt for the scene; it is copied into the scene's result. */
    prompt: string;
    /**
     * Checks evaluated in order by the runner. For each one, the value selected
     * from the response by `field` is passed to `fn`; if `fn` throws, the scene
     * is marked failed, the thrown error's message is recorded, and the
     * remaining assertions are skipped.
     */
    assertions: Array<{
        /** Selector for the part of the response handed to `fn`. */
        field: string;
        /** Check function; signals failure by throwing. */
        fn: (value: any) => void;
    }>;
}
|
|
24
|
+
/** Outcome of running a single scene. */
export interface SceneResult {
    /** Prompt of the scene that produced this result. */
    prompt: string;
    /** Response returned by the agent for the prompt. */
    response: AgentResponse;
    /** Time taken by the scene — unit not visible here, presumably milliseconds; TODO confirm. */
    duration: number;
    /** False when the response carried an execution error or an assertion threw; true otherwise. */
    passed: boolean;
    /** Execution error or message of the first failing assertion; absent when the scene passed. */
    error?: string;
}
|
|
31
|
+
/** Summary of one agent test run, including the per-scene results. */
export interface AgentReport {
    /** Model name, when known. */
    model?: string;
    /** Hash of the system prompt, when available — hashing scheme not visible here. */
    systemPromptHash?: string;
    /** Tool names associated with the run, when known. */
    tools?: string[];
    /** Share of passing cases — the stats script multiplies it by 100 for display, so presumably a 0..1 fraction. */
    successRate: number;
    /** Identifiers of the failed cases — presumably their prompts; TODO confirm. */
    failedCases: string[];
    /** Error text per failed case — presumably keyed by the entries of `failedCases`; TODO confirm. */
    failedCaseErrors: Record<string, string>;
    /** Time of the run as a string — exact format not visible here. */
    timestamp: string;
    /** Duration of the run — the stats script formats it as milliseconds. */
    duration: number;
    /** Number of cases in the run. */
    totalCases: number;
    /** Mean input tokens per case, when token data is available. */
    averageInputTokensPerCase?: number;
    /** Mean output tokens per case, when token data is available. */
    averageOutputTokensPerCase?: number;
    /** Individual scene results. */
    results: SceneResult[];
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@sebastiantuyu/agest",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "A testing library for agents",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"files": [
|
|
7
|
+
"dist"
|
|
8
|
+
],
|
|
9
|
+
"main": "dist/index.js",
|
|
10
|
+
"types": "dist/index.d.ts",
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"default": "./dist/index.js"
|
|
15
|
+
},
|
|
16
|
+
"./adapters": {
|
|
17
|
+
"types": "./dist/adapters/index.d.ts",
|
|
18
|
+
"default": "./dist/adapters/index.js"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"engines": {
|
|
22
|
+
"node": ">=22.0.0"
|
|
23
|
+
},
|
|
24
|
+
"devDependencies": {
|
|
25
|
+
"@langchain/core": "^1.1.39",
|
|
26
|
+
"@langchain/langgraph": "^1.2.8",
|
|
27
|
+
"@langchain/openai": "^1.4.4",
|
|
28
|
+
"@types/node": "^22.0.0",
|
|
29
|
+
"dotenv": "^17.4.1",
|
|
30
|
+
"langchain": "^1.3.1",
|
|
31
|
+
"tsx": "^4.21.0",
|
|
32
|
+
"typescript": "^5.4.0",
|
|
33
|
+
"zod": "^4.3.6"
|
|
34
|
+
},
|
|
35
|
+
"scripts": {
|
|
36
|
+
"build": "tsc",
|
|
37
|
+
"test": "node dist/index.js",
|
|
38
|
+
"dev": "tsx examples/basic.test.ts",
|
|
39
|
+
"test:examples": "tsx examples/basic.test.ts && tsx examples/agent.test.ts",
|
|
40
|
+
"stats": "tsx src/stats.ts",
|
|
41
|
+
"release:patch": "npm version patch && git push && git push --tags",
|
|
42
|
+
"release:minor": "npm version minor && git push && git push --tags",
|
|
43
|
+
"release:major": "npm version major && git push && git push --tags"
|
|
44
|
+
}
|
|
45
|
+
}
|