@dvina/agents 0.14.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval/index.d.mts +58 -13
- package/dist/eval/index.d.ts +58 -13
- package/dist/eval/index.js +672 -29
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/index.mjs +673 -30
- package/dist/eval/index.mjs.map +1 -1
- package/dist/index.d.mts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +75 -0
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +75 -0
- package/dist/index.mjs.map +1 -1
- package/dist/{model-resolver-DjKRXKtu.d.mts → model-resolver-DSJRvrqA.d.mts} +2 -5
- package/dist/{model-resolver-DjKRXKtu.d.ts → model-resolver-DSJRvrqA.d.ts} +2 -5
- package/package.json +1 -1
package/dist/eval/index.d.mts
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
-
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.mjs';
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DSJRvrqA.mjs';
|
|
2
2
|
import * as zod from 'zod';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
5
5
|
|
|
6
|
+
/** Optional hook applied by the eval runner to wrap every tool for tracking and stop detection. */
|
|
7
|
+
type ToolWrapper = (tools: ToolDefinition[]) => ToolDefinition[];
|
|
6
8
|
/** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
|
|
7
|
-
type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
9
|
+
type CreateTargetFn = (model: string, extraTools: ToolDefinition[],
|
|
10
|
+
/** When provided, the factory MUST apply this to the final merged tool array (built-in + extra) before creating the agent. */
|
|
11
|
+
wrapTools?: ToolWrapper) => Agent | Promise<Agent>;
|
|
8
12
|
interface EvalConfig {
|
|
9
13
|
/** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
|
|
10
14
|
modelConfig: LangchainModelConfig;
|
|
@@ -35,6 +39,20 @@ interface MockToolDef {
|
|
|
35
39
|
*/
|
|
36
40
|
response: string | ((input: Record<string, unknown>, callCount: number) => string);
|
|
37
41
|
}
|
|
42
|
+
interface EvalTargetInput {
|
|
43
|
+
systemPrompt?: string;
|
|
44
|
+
messages: Message[];
|
|
45
|
+
tools: MockToolDef[];
|
|
46
|
+
executionMode?: ExecutionMode;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
interface ToolCallExpectation {
|
|
50
|
+
name: string;
|
|
51
|
+
/** Returns `true` if the tool call input is valid. At least one call must satisfy it. */
|
|
52
|
+
validate?: (input: Record<string, unknown>) => boolean;
|
|
53
|
+
/** Minimum number of times the tool must be called. Defaults to 1. */
|
|
54
|
+
times?: number;
|
|
55
|
+
}
|
|
38
56
|
|
|
39
57
|
type EvaluatorFn = (args: {
|
|
40
58
|
outputs: Record<string, any>;
|
|
@@ -48,17 +66,33 @@ interface ResolvedExpectation {
|
|
|
48
66
|
type Expectation = (ctx: {
|
|
49
67
|
message: string;
|
|
50
68
|
}) => ResolvedExpectation;
|
|
69
|
+
/** A tool name (string) or an object with a name and input validator. */
|
|
70
|
+
type ToolExpectation = string | ToolCallExpectation;
|
|
51
71
|
/**
|
|
52
|
-
* Expect the agent to call
|
|
72
|
+
* Expect the agent to call the listed tools (superset trajectory match).
|
|
53
73
|
* Empty `[]` means the agent should answer directly without calling any tools.
|
|
74
|
+
*
|
|
75
|
+
* Each entry can be a plain tool name or an object with:
|
|
76
|
+
* - `validate` — callback that receives the tool input; at least one call must satisfy it.
|
|
77
|
+
* - `times` — minimum number of times the tool must be called.
|
|
78
|
+
* - Both can be combined.
|
|
79
|
+
*
|
|
80
|
+
* @example
|
|
81
|
+
* toolsCalled([
|
|
82
|
+
* 'list-documents',
|
|
83
|
+
* { name: 'search-tables', validate: (input) => input.query?.includes('Q4') },
|
|
84
|
+
* { name: 'list-documents', times: 2 },
|
|
85
|
+
* ])
|
|
54
86
|
*/
|
|
55
|
-
declare function toolsCalled(tools: string[]): Expectation;
|
|
87
|
+
declare function toolsCalled(tools: ToolExpectation[]): Expectation;
|
|
56
88
|
/**
|
|
57
|
-
*
|
|
58
|
-
*
|
|
89
|
+
* Use an LLM to judge the agent's final response against the given criteria.
|
|
90
|
+
* Works independently — does not require `toolsCalled` or any other expectation.
|
|
59
91
|
* Uses the globally configured evaluator model.
|
|
92
|
+
*
|
|
93
|
+
* @param criteria - Human-readable description of what the judge should evaluate.
|
|
60
94
|
*/
|
|
61
|
-
declare function llmJudge(): Expectation;
|
|
95
|
+
declare function llmJudge(criteria: string): Expectation;
|
|
62
96
|
/**
|
|
63
97
|
* Assert the agent made zero tool calls.
|
|
64
98
|
* Optionally allow specific tools via `except` — calls to those tools
|
|
@@ -94,6 +128,19 @@ interface ToolDef {
|
|
|
94
128
|
/** Auto-stringified if not a string or function. */
|
|
95
129
|
response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
|
|
96
130
|
}
|
|
131
|
+
/**
|
|
132
|
+
* Controls how the eval target executes.
|
|
133
|
+
* - `single-turn`: one model invocation + tool execution, then stop.
|
|
134
|
+
* - `stop-after-tool`: run until the listed tools have been called `count`
|
|
135
|
+
* times cumulatively, then stop. Defaults to 1 (stop on the first match).
|
|
136
|
+
*/
|
|
137
|
+
type ExecutionMode = {
|
|
138
|
+
type: 'single-turn';
|
|
139
|
+
} | {
|
|
140
|
+
type: 'stop-after-tool';
|
|
141
|
+
tools: string[];
|
|
142
|
+
count?: number;
|
|
143
|
+
};
|
|
97
144
|
interface TestCase {
|
|
98
145
|
/** Test name. Defaults to the last human message content if omitted. */
|
|
99
146
|
name?: string;
|
|
@@ -103,13 +150,11 @@ interface TestCase {
|
|
|
103
150
|
tools?: Record<string, ToolDef>;
|
|
104
151
|
/** Transforms messages before sending to target. Overrides suite-level and global hooks. */
|
|
105
152
|
prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
|
|
153
|
+
/** Controls target execution. Omit for default behavior (run until the agent stops on its own). */
|
|
154
|
+
executionMode?: ExecutionMode;
|
|
106
155
|
expect: Expectation[];
|
|
107
156
|
}
|
|
108
|
-
type TargetFn = (inputs: {
|
|
109
|
-
systemPrompt?: string;
|
|
110
|
-
messages: Message[];
|
|
111
|
-
tools: MockToolDef[];
|
|
112
|
-
}) => Promise<{
|
|
157
|
+
type TargetFn = (inputs: EvalTargetInput) => Promise<{
|
|
113
158
|
messages: BaseMessage[];
|
|
114
159
|
}>;
|
|
115
160
|
interface SuiteConfig {
|
|
@@ -150,4 +195,4 @@ declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
|
150
195
|
*/
|
|
151
196
|
declare function runEvals(): void;
|
|
152
197
|
|
|
153
|
-
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|
|
198
|
+
export { type CreateTargetFn, type EvalConfig, type ExecutionMode, type Expectation, type SuiteConfig, type TestCase, type ToolCallExpectation, type ToolDef, type ToolExpectation, type ToolWrapper, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|
package/dist/eval/index.d.ts
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
-
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.js';
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DSJRvrqA.js';
|
|
2
2
|
import * as zod from 'zod';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
5
5
|
|
|
6
|
+
/** Optional hook applied by the eval runner to wrap every tool for tracking and stop detection. */
|
|
7
|
+
type ToolWrapper = (tools: ToolDefinition[]) => ToolDefinition[];
|
|
6
8
|
/** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
|
|
7
|
-
type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
9
|
+
type CreateTargetFn = (model: string, extraTools: ToolDefinition[],
|
|
10
|
+
/** When provided, the factory MUST apply this to the final merged tool array (built-in + extra) before creating the agent. */
|
|
11
|
+
wrapTools?: ToolWrapper) => Agent | Promise<Agent>;
|
|
8
12
|
interface EvalConfig {
|
|
9
13
|
/** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
|
|
10
14
|
modelConfig: LangchainModelConfig;
|
|
@@ -35,6 +39,20 @@ interface MockToolDef {
|
|
|
35
39
|
*/
|
|
36
40
|
response: string | ((input: Record<string, unknown>, callCount: number) => string);
|
|
37
41
|
}
|
|
42
|
+
interface EvalTargetInput {
|
|
43
|
+
systemPrompt?: string;
|
|
44
|
+
messages: Message[];
|
|
45
|
+
tools: MockToolDef[];
|
|
46
|
+
executionMode?: ExecutionMode;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
interface ToolCallExpectation {
|
|
50
|
+
name: string;
|
|
51
|
+
/** Returns `true` if the tool call input is valid. At least one call must satisfy it. */
|
|
52
|
+
validate?: (input: Record<string, unknown>) => boolean;
|
|
53
|
+
/** Minimum number of times the tool must be called. Defaults to 1. */
|
|
54
|
+
times?: number;
|
|
55
|
+
}
|
|
38
56
|
|
|
39
57
|
type EvaluatorFn = (args: {
|
|
40
58
|
outputs: Record<string, any>;
|
|
@@ -48,17 +66,33 @@ interface ResolvedExpectation {
|
|
|
48
66
|
type Expectation = (ctx: {
|
|
49
67
|
message: string;
|
|
50
68
|
}) => ResolvedExpectation;
|
|
69
|
+
/** A tool name (string) or an object with a name and input validator. */
|
|
70
|
+
type ToolExpectation = string | ToolCallExpectation;
|
|
51
71
|
/**
|
|
52
|
-
* Expect the agent to call
|
|
72
|
+
* Expect the agent to call the listed tools (superset trajectory match).
|
|
53
73
|
* Empty `[]` means the agent should answer directly without calling any tools.
|
|
74
|
+
*
|
|
75
|
+
* Each entry can be a plain tool name or an object with:
|
|
76
|
+
* - `validate` — callback that receives the tool input; at least one call must satisfy it.
|
|
77
|
+
* - `times` — minimum number of times the tool must be called.
|
|
78
|
+
* - Both can be combined.
|
|
79
|
+
*
|
|
80
|
+
* @example
|
|
81
|
+
* toolsCalled([
|
|
82
|
+
* 'list-documents',
|
|
83
|
+
* { name: 'search-tables', validate: (input) => input.query?.includes('Q4') },
|
|
84
|
+
* { name: 'list-documents', times: 2 },
|
|
85
|
+
* ])
|
|
54
86
|
*/
|
|
55
|
-
declare function toolsCalled(tools: string[]): Expectation;
|
|
87
|
+
declare function toolsCalled(tools: ToolExpectation[]): Expectation;
|
|
56
88
|
/**
|
|
57
|
-
*
|
|
58
|
-
*
|
|
89
|
+
* Use an LLM to judge the agent's final response against the given criteria.
|
|
90
|
+
* Works independently — does not require `toolsCalled` or any other expectation.
|
|
59
91
|
* Uses the globally configured evaluator model.
|
|
92
|
+
*
|
|
93
|
+
* @param criteria - Human-readable description of what the judge should evaluate.
|
|
60
94
|
*/
|
|
61
|
-
declare function llmJudge(): Expectation;
|
|
95
|
+
declare function llmJudge(criteria: string): Expectation;
|
|
62
96
|
/**
|
|
63
97
|
* Assert the agent made zero tool calls.
|
|
64
98
|
* Optionally allow specific tools via `except` — calls to those tools
|
|
@@ -94,6 +128,19 @@ interface ToolDef {
|
|
|
94
128
|
/** Auto-stringified if not a string or function. */
|
|
95
129
|
response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
|
|
96
130
|
}
|
|
131
|
+
/**
|
|
132
|
+
* Controls how the eval target executes.
|
|
133
|
+
* - `single-turn`: one model invocation + tool execution, then stop.
|
|
134
|
+
* - `stop-after-tool`: run until the listed tools have been called `count`
|
|
135
|
+
* times cumulatively, then stop. Defaults to 1 (stop on the first match).
|
|
136
|
+
*/
|
|
137
|
+
type ExecutionMode = {
|
|
138
|
+
type: 'single-turn';
|
|
139
|
+
} | {
|
|
140
|
+
type: 'stop-after-tool';
|
|
141
|
+
tools: string[];
|
|
142
|
+
count?: number;
|
|
143
|
+
};
|
|
97
144
|
interface TestCase {
|
|
98
145
|
/** Test name. Defaults to the last human message content if omitted. */
|
|
99
146
|
name?: string;
|
|
@@ -103,13 +150,11 @@ interface TestCase {
|
|
|
103
150
|
tools?: Record<string, ToolDef>;
|
|
104
151
|
/** Transforms messages before sending to target. Overrides suite-level and global hooks. */
|
|
105
152
|
prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
|
|
153
|
+
/** Controls target execution. Omit for default behavior (run until the agent stops on its own). */
|
|
154
|
+
executionMode?: ExecutionMode;
|
|
106
155
|
expect: Expectation[];
|
|
107
156
|
}
|
|
108
|
-
type TargetFn = (inputs: {
|
|
109
|
-
systemPrompt?: string;
|
|
110
|
-
messages: Message[];
|
|
111
|
-
tools: MockToolDef[];
|
|
112
|
-
}) => Promise<{
|
|
157
|
+
type TargetFn = (inputs: EvalTargetInput) => Promise<{
|
|
113
158
|
messages: BaseMessage[];
|
|
114
159
|
}>;
|
|
115
160
|
interface SuiteConfig {
|
|
@@ -150,4 +195,4 @@ declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
|
150
195
|
*/
|
|
151
196
|
declare function runEvals(): void;
|
|
152
197
|
|
|
153
|
-
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|
|
198
|
+
export { type CreateTargetFn, type EvalConfig, type ExecutionMode, type Expectation, type SuiteConfig, type TestCase, type ToolCallExpectation, type ToolDef, type ToolExpectation, type ToolWrapper, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|