npm - @dvina/agents - Versions diffs - 0.14.0 → 0.17.0 - Mend

@dvina/agents 0.14.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/dist/eval/index.d.mts +58 -13
package/dist/eval/index.d.ts +58 -13
package/dist/eval/index.js +672 -29
package/dist/eval/index.js.map +1 -1
package/dist/eval/index.mjs +673 -30
package/dist/eval/index.mjs.map +1 -1
package/dist/index.d.mts +2 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +75 -0
package/dist/index.js.map +1 -1
package/dist/index.mjs +75 -0
package/dist/index.mjs.map +1 -1
package/dist/{model-resolver-DjKRXKtu.d.mts → model-resolver-DSJRvrqA.d.mts} +2 -5
package/dist/{model-resolver-DjKRXKtu.d.ts → model-resolver-DSJRvrqA.d.ts} +2 -5
package/package.json +1 -1

package/dist/eval/index.d.mts CHANGED Viewed

@@ -1,10 +1,14 @@
-import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.mjs';
+import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DSJRvrqA.mjs';
 import * as zod from 'zod';
 import { z } from 'zod';
 import { BaseMessage } from '@langchain/core/messages';
+/** Optional hook applied by the eval runner to wrap every tool for tracking and stop detection. */
+type ToolWrapper = (tools: ToolDefinition[]) => ToolDefinition[];
 /** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
-type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
+type CreateTargetFn = (model: string, extraTools: ToolDefinition[],
+/** When provided, the factory MUST apply this to the final merged tool array (built-in + extra) before creating the agent. */
+wrapTools?: ToolWrapper) => Agent | Promise<Agent>;
 interface EvalConfig {
     /** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
     modelConfig: LangchainModelConfig;
@@ -35,6 +39,20 @@ interface MockToolDef {
      */
     response: string | ((input: Record<string, unknown>, callCount: number) => string);
 }
+interface EvalTargetInput {
+    systemPrompt?: string;
+    messages: Message[];
+    tools: MockToolDef[];
+    executionMode?: ExecutionMode;
+}
+interface ToolCallExpectation {
+    name: string;
+    /** Returns `true` if the tool call input is valid. At least one call must satisfy it. */
+    validate?: (input: Record<string, unknown>) => boolean;
+    /** Minimum number of times the tool must be called. Defaults to 1. */
+    times?: number;
+}
 type EvaluatorFn = (args: {
     outputs: Record<string, any>;
@@ -48,17 +66,33 @@ interface ResolvedExpectation {
 type Expectation = (ctx: {
     message: string;
 }) => ResolvedExpectation;
+/** A tool name (string) or an object with a name plus optional `validate` and/or `times` constraints. */
+type ToolExpectation = string | ToolCallExpectation;
 /**
- * Expect the agent to call tools in order (superset trajectory match).
+ * Expect the agent to call the listed tools (superset trajectory match).
  * Empty `[]` means the agent should answer directly without calling any tools.
+ *
+ * Each entry can be a plain tool name or an object with:
+ * - `validate` — callback that receives the tool input; at least one call must satisfy it.
+ * - `times` — minimum number of times the tool must be called.
+ * - Both can be combined.
+ *
+ * @example
+ * toolsCalled([
+ *   'list-documents',
+ *   { name: 'search-tables', validate: (input) => typeof input.query === 'string' && input.query.includes('Q4') },
+ *   { name: 'list-documents', times: 2 },
+ * ])
  */
-declare function toolsCalled(tools: string[]): Expectation;
+declare function toolsCalled(tools: ToolExpectation[]): Expectation;
 /**
- * Run an LLM-as-judge evaluator on the trajectory.
- * Requires `toolsCalled` in the same expect array.
+ * Use an LLM to judge the agent's final response against the given criteria.
+ * Works independently — does not require `toolsCalled` or any other expectation.
  * Uses the globally configured evaluator model.
+ *
+ * @param criteria - Human-readable description of what the judge should evaluate.
  */
-declare function llmJudge(): Expectation;
+declare function llmJudge(criteria: string): Expectation;
 /**
  * Assert the agent made zero tool calls.
  * Optionally allow specific tools via `except` — calls to those tools
@@ -94,6 +128,19 @@ interface ToolDef {
     /** Auto-stringified if not a string or function. */
     response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
 }
+/**
+ * Controls how the eval target executes.
+ * - `single-turn`: one model invocation + tool execution, then stop.
+ * - `stop-after-tool`: run until the listed tools have been called `count`
+ *   times cumulatively, then stop. `count` defaults to 1 (stop on the first match).
+ */
+type ExecutionMode = {
+    type: 'single-turn';
+} | {
+    type: 'stop-after-tool';
+    tools: string[];
+    count?: number;
+};
 interface TestCase {
     /** Test name. Defaults to the last human message content if omitted. */
     name?: string;
@@ -103,13 +150,11 @@ interface TestCase {
     tools?: Record<string, ToolDef>;
     /** Transforms messages before sending to target. Overrides suite-level and global hooks. */
     prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
+    /** Controls target execution. Omit for default behavior (run until the agent stops on its own). */
+    executionMode?: ExecutionMode;
     expect: Expectation[];
 }
-type TargetFn = (inputs: {
-    systemPrompt?: string;
-    messages: Message[];
-    tools: MockToolDef[];
-}) => Promise<{
+type TargetFn = (inputs: EvalTargetInput) => Promise<{
     messages: BaseMessage[];
 }>;
 interface SuiteConfig {
@@ -150,4 +195,4 @@ declare function defineSuite(name: string, config: SuiteConfig): void;
  */
 declare function runEvals(): void;
-export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
+export { type CreateTargetFn, type EvalConfig, type ExecutionMode, type Expectation, type SuiteConfig, type TestCase, type ToolCallExpectation, type ToolDef, type ToolExpectation, type ToolWrapper, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };

package/dist/eval/index.d.ts CHANGED Viewed

@@ -1,10 +1,14 @@
-import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.js';
+import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DSJRvrqA.js';
 import * as zod from 'zod';
 import { z } from 'zod';
 import { BaseMessage } from '@langchain/core/messages';
+/** Optional hook applied by the eval runner to wrap every tool for tracking and stop detection. */
+type ToolWrapper = (tools: ToolDefinition[]) => ToolDefinition[];
 /** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
-type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
+type CreateTargetFn = (model: string, extraTools: ToolDefinition[],
+/** When provided, the factory MUST apply this to the final merged tool array (built-in + extra) before creating the agent. */
+wrapTools?: ToolWrapper) => Agent | Promise<Agent>;
 interface EvalConfig {
     /** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
     modelConfig: LangchainModelConfig;
@@ -35,6 +39,20 @@ interface MockToolDef {
      */
     response: string | ((input: Record<string, unknown>, callCount: number) => string);
 }
+interface EvalTargetInput {
+    systemPrompt?: string;
+    messages: Message[];
+    tools: MockToolDef[];
+    executionMode?: ExecutionMode;
+}
+interface ToolCallExpectation {
+    name: string;
+    /** Returns `true` if the tool call input is valid. At least one call must satisfy it. */
+    validate?: (input: Record<string, unknown>) => boolean;
+    /** Minimum number of times the tool must be called. Defaults to 1. */
+    times?: number;
+}
 type EvaluatorFn = (args: {
     outputs: Record<string, any>;
@@ -48,17 +66,33 @@ interface ResolvedExpectation {
 type Expectation = (ctx: {
     message: string;
 }) => ResolvedExpectation;
+/** A tool name (string) or an object with a name plus optional `validate` and/or `times` constraints. */
+type ToolExpectation = string | ToolCallExpectation;
 /**
- * Expect the agent to call tools in order (superset trajectory match).
+ * Expect the agent to call the listed tools (superset trajectory match).
  * Empty `[]` means the agent should answer directly without calling any tools.
+ *
+ * Each entry can be a plain tool name or an object with:
+ * - `validate` — callback that receives the tool input; at least one call must satisfy it.
+ * - `times` — minimum number of times the tool must be called.
+ * - Both can be combined.
+ *
+ * @example
+ * toolsCalled([
+ *   'list-documents',
+ *   { name: 'search-tables', validate: (input) => typeof input.query === 'string' && input.query.includes('Q4') },
+ *   { name: 'list-documents', times: 2 },
+ * ])
  */
-declare function toolsCalled(tools: string[]): Expectation;
+declare function toolsCalled(tools: ToolExpectation[]): Expectation;
 /**
- * Run an LLM-as-judge evaluator on the trajectory.
- * Requires `toolsCalled` in the same expect array.
+ * Use an LLM to judge the agent's final response against the given criteria.
+ * Works independently — does not require `toolsCalled` or any other expectation.
  * Uses the globally configured evaluator model.
+ *
+ * @param criteria - Human-readable description of what the judge should evaluate.
  */
-declare function llmJudge(): Expectation;
+declare function llmJudge(criteria: string): Expectation;
 /**
  * Assert the agent made zero tool calls.
  * Optionally allow specific tools via `except` — calls to those tools
@@ -94,6 +128,19 @@ interface ToolDef {
     /** Auto-stringified if not a string or function. */
     response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
 }
+/**
+ * Controls how the eval target executes.
+ * - `single-turn`: one model invocation + tool execution, then stop.
+ * - `stop-after-tool`: run until the listed tools have been called `count`
+ *   times cumulatively, then stop. `count` defaults to 1 (stop on the first match).
+ */
+type ExecutionMode = {
+    type: 'single-turn';
+} | {
+    type: 'stop-after-tool';
+    tools: string[];
+    count?: number;
+};
 interface TestCase {
     /** Test name. Defaults to the last human message content if omitted. */
     name?: string;
@@ -103,13 +150,11 @@ interface TestCase {
     tools?: Record<string, ToolDef>;
     /** Transforms messages before sending to target. Overrides suite-level and global hooks. */
     prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
+    /** Controls target execution. Omit for default behavior (run until the agent stops on its own). */
+    executionMode?: ExecutionMode;
     expect: Expectation[];
 }
-type TargetFn = (inputs: {
-    systemPrompt?: string;
-    messages: Message[];
-    tools: MockToolDef[];
-}) => Promise<{
+type TargetFn = (inputs: EvalTargetInput) => Promise<{
     messages: BaseMessage[];
 }>;
 interface SuiteConfig {
@@ -150,4 +195,4 @@ declare function defineSuite(name: string, config: SuiteConfig): void;
  */
 declare function runEvals(): void;
-export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
+export { type CreateTargetFn, type EvalConfig, type ExecutionMode, type Expectation, type SuiteConfig, type TestCase, type ToolCallExpectation, type ToolDef, type ToolExpectation, type ToolWrapper, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };