npm - @dvina/agents - Versions diffs - 0.4.0 → 0.5.0 - Mend

@dvina/agents 0.4.0 → 0.5.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/dist/{chunk-LEEZCLZM.mjs → chunk-NHWEEBN2.mjs} +65 -11
package/dist/chunk-NHWEEBN2.mjs.map +1 -0
package/dist/eval/index.d.mts +31 -23
package/dist/eval/index.d.ts +31 -23
package/dist/eval/index.js +180 -56
package/dist/eval/index.js.map +1 -1
package/dist/eval/index.mjs +139 -68
package/dist/eval/index.mjs.map +1 -1
package/dist/index.d.mts +4 -125
package/dist/index.d.ts +4 -125
package/dist/index.js +74 -42
package/dist/index.js.map +1 -1
package/dist/index.mjs +7 -30
package/dist/index.mjs.map +1 -1
package/dist/model-resolver-U0J9x1a6.d.mts +158 -0
package/dist/model-resolver-U0J9x1a6.d.ts +158 -0
package/package.json +1 -1
package/dist/chunk-LEEZCLZM.mjs.map +0 -1
package/dist/model-resolver-BRAaBV9n.d.mts +0 -15
package/dist/model-resolver-BRAaBV9n.d.ts +0 -15

package/dist/{chunk-LEEZCLZM.mjs → chunk-NHWEEBN2.mjs} RENAMED Viewed

@@ -71,18 +71,21 @@ var LangchainModelResolver = class {
       tags
     });
   }
-  resolveAzure(configName, deploymentName, tags) {
-    const providerConfig = this.config.azure?.[configName];
-    if (!providerConfig) {
-      throw new Error(`Configuration "${configName}" for provider "azure" is missing`);
+  resolveAzure(resourceName, modelName, tags) {
+    const resource = this.config.azure?.[resourceName];
+    if (!resource) {
+      throw new Error(`Resource "${resourceName}" for provider "azure" is missing`);
+    }
+    const modelEntry = resource.models.find((m) => m.model === modelName);
+    if (!modelEntry) {
+      throw new Error(`Model "${modelName}" not found in Azure resource "${resourceName}"`);
     }
     return new AzureChatOpenAI({
-      model: providerConfig.model,
-      // shows (perhaps even uses) 3.5-turbo when not specifid
-      azureOpenAIApiKey: providerConfig.apiKey,
-      azureOpenAIApiInstanceName: this.extractInstanceName(providerConfig.endpoint),
-      azureOpenAIApiDeploymentName: deploymentName,
-      azureOpenAIApiVersion: providerConfig.apiVersion,
+      model: modelEntry.model,
+      azureOpenAIApiKey: resource.apiKey,
+      azureOpenAIApiInstanceName: this.extractInstanceName(resource.endpoint),
+      azureOpenAIApiDeploymentName: modelEntry.deploymentName,
+      azureOpenAIApiVersion: modelEntry.apiVersion,
       tags
     });
   }
@@ -96,10 +99,61 @@ var LangchainModelResolver = class {
   }
 };
+// src/runtime/langchain/utils.ts
+import { AIMessage, HumanMessage, ToolMessage } from "langchain";
+function convertToLangchainMessages(messages) {
+  const result = [];
+  let tcIdx = 0;
+  let pendingToolCallIds = [];
+  for (const msg of messages) {
+    if (msg.role === "human") {
+      result.push(
+        new HumanMessage({
+          content: msg.content.map((c) => {
+            if (c.type === "image") {
+              return { type: "image_url", image_url: { url: c.url } };
+            }
+            return c;
+          })
+        })
+      );
+    } else if (msg.role === "ai") {
+      if (msg.toolCalls && msg.toolCalls.length > 0) {
+        pendingToolCallIds = msg.toolCalls.map(() => `tc_${++tcIdx}`);
+        result.push(
+          new AIMessage({
+            content: msg.content,
+            tool_calls: msg.toolCalls.map((tc, i) => ({
+              id: pendingToolCallIds[i],
+              name: tc.name,
+              args: tc.input ? JSON.parse(tc.input) : {}
+            }))
+          })
+        );
+      } else {
+        result.push(new AIMessage(msg.content));
+      }
+    } else if (msg.role === "tool") {
+      const toolCallId = pendingToolCallIds.shift();
+      if (!toolCallId)
+        throw new Error(`ToolMessage for "${msg.name}" without a preceding AiMessage with toolCalls`);
+      result.push(
+        new ToolMessage({
+          content: msg.output,
+          tool_call_id: toolCallId,
+          name: msg.name
+        })
+      );
+    }
+  }
+  return result;
+}
 export {
   __require,
   __commonJS,
   __toESM,
+  convertToLangchainMessages,
   LangchainModelResolver
 };
-//# sourceMappingURL=chunk-LEEZCLZM.mjs.map
+//# sourceMappingURL=chunk-NHWEEBN2.mjs.map

package/dist/chunk-NHWEEBN2.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/runtime/langchain/model-resolver.ts","../src/runtime/langchain/utils.ts"],"sourcesContent":["import { BaseLanguageModel } from '@langchain/core/language_models/base';\nimport { AzureChatOpenAI, ChatOpenAI } from '@langchain/openai';\n\nexport type LangchainOpenAIConfig = {\n\tapiKey: string;\n};\n\nexport type LangchainAzureResourceConfig = {\n\tapiKey: string;\n\tendpoint: string;\n\tmodels: {\n\t\tmodel: string;\n\t\tapiVersion: string;\n\t\tdeploymentName: string;\n\t}[];\n};\n\nexport type ResourceName = string;\n\nexport type LangchainModelConfig = {\n\topenai?: Record<string, LangchainOpenAIConfig>;\n\tazure?: Record<ResourceName, LangchainAzureResourceConfig>;\n};\n\nexport class LangchainModelResolver {\n\tconstructor(private config: LangchainModelConfig) {}\n\n\tresolve(modelString: string, tags?: string[]): BaseLanguageModel {\n\t\tconst parts = modelString.split(':');\n\n\t\tif (parts.length === 2) {\n\t\t\tconst [provider, modelName] = parts;\n\t\t\treturn this.resolveByProvider(provider, 'default', modelName, tags);\n\t\t}\n\n\t\tif (parts.length === 3) {\n\t\t\tconst [provider, configName, modelName] = parts;\n\t\t\treturn this.resolveByProvider(provider, configName, modelName, tags);\n\t\t}\n\n\t\tthrow new Error(\n\t\t\t'Model string must follow format \"provider:modelName\" (uses \"default\" config) or \"provider:configName:modelName\"',\n\t\t);\n\t}\n\n\tprivate resolveByProvider(\n\t\tprovider: string,\n\t\tconfigName: string,\n\t\tmodelName: string,\n\t\ttags?: string[],\n\t): BaseLanguageModel {\n\t\tswitch (provider) {\n\t\t\tcase 'openai':\n\t\t\t\treturn this.resolveOpenAI(configName, modelName, tags);\n\t\t\tcase 'azure':\n\t\t\t\treturn this.resolveAzure(configName, modelName, tags);\n\t\t\tdefault:\n\t\t\t\tthrow new Error(`Unsupported model provider: ${provider}`);\n\t\t}\n\t}\n\n\tprivate resolveOpenAI(configName: string, modelName: string, tags?: string[]): ChatOpenAI {\n\t\tconst providerConfig = 
this.config.openai?.[configName];\n\t\tif (!providerConfig) {\n\t\t\tthrow new Error(`Configuration \"${configName}\" for provider \"openai\" is missing`);\n\t\t}\n\n\t\treturn new ChatOpenAI({\n\t\t\tapiKey: providerConfig.apiKey,\n\t\t\tmodelName: modelName,\n\t\t\ttags: tags,\n\t\t});\n\t}\n\n\tprivate resolveAzure(resourceName: string, modelName: string, tags?: string[]): AzureChatOpenAI {\n\t\tconst resource = this.config.azure?.[resourceName];\n\t\tif (!resource) {\n\t\t\tthrow new Error(`Resource \"${resourceName}\" for provider \"azure\" is missing`);\n\t\t}\n\n\t\tconst modelEntry = resource.models.find((m) => m.model === modelName);\n\t\tif (!modelEntry) {\n\t\t\tthrow new Error(`Model \"${modelName}\" not found in Azure resource \"${resourceName}\"`);\n\t\t}\n\n\t\treturn new AzureChatOpenAI({\n\t\t\tmodel: modelEntry.model,\n\t\t\tazureOpenAIApiKey: resource.apiKey,\n\t\t\tazureOpenAIApiInstanceName: this.extractInstanceName(resource.endpoint),\n\t\t\tazureOpenAIApiDeploymentName: modelEntry.deploymentName,\n\t\t\tazureOpenAIApiVersion: modelEntry.apiVersion,\n\t\t\ttags: tags,\n\t\t});\n\t}\n\n\tprivate extractInstanceName(endpoint: string): string {\n\t\ttry {\n\t\t\tconst url = new URL(endpoint);\n\t\t\treturn url.hostname.split('.')[0];\n\t\t} catch (e) {\n\t\t\treturn endpoint;\n\t\t}\n\t}\n}\n","import { Message } from '@core/agent.interface';\nimport { AIMessage, BaseMessage, HumanMessage, ToolMessage } from 'langchain';\n\nexport function convertToLangchainMessages(messages: Message[]): BaseMessage[] {\n\tconst result: BaseMessage[] = [];\n\tlet tcIdx = 0;\n\tlet pendingToolCallIds: string[] = [];\n\n\tfor (const msg of messages) {\n\t\tif (msg.role === 'human') {\n\t\t\tresult.push(\n\t\t\t\tnew HumanMessage({\n\t\t\t\t\tcontent: msg.content.map((c) => {\n\t\t\t\t\t\tif (c.type === 'image') {\n\t\t\t\t\t\t\treturn { type: 'image_url', image_url: { url: c.url } };\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn c;\n\t\t\t\t\t}) as 
any,\n\t\t\t\t}),\n\t\t\t);\n\t\t} else if (msg.role === 'ai') {\n\t\t\tif (msg.toolCalls && msg.toolCalls.length > 0) {\n\t\t\t\tpendingToolCallIds = msg.toolCalls.map(() => `tc_${++tcIdx}`);\n\t\t\t\tresult.push(\n\t\t\t\t\tnew AIMessage({\n\t\t\t\t\t\tcontent: msg.content,\n\t\t\t\t\t\ttool_calls: msg.toolCalls.map((tc, i) => ({\n\t\t\t\t\t\t\tid: pendingToolCallIds[i],\n\t\t\t\t\t\t\tname: tc.name,\n\t\t\t\t\t\t\targs: tc.input ? JSON.parse(tc.input) : {},\n\t\t\t\t\t\t})),\n\t\t\t\t\t}),\n\t\t\t\t);\n\t\t\t} else {\n\t\t\t\tresult.push(new AIMessage(msg.content));\n\t\t\t}\n\t\t} else if (msg.role === 'tool') {\n\t\t\tconst toolCallId = pendingToolCallIds.shift();\n\t\t\tif (!toolCallId)\n\t\t\t\tthrow new Error(`ToolMessage for \"${msg.name}\" without a preceding AiMessage with toolCalls`);\n\t\t\tresult.push(\n\t\t\t\tnew ToolMessage({\n\t\t\t\t\tcontent: msg.output,\n\t\t\t\t\ttool_call_id: toolCallId,\n\t\t\t\t\tname: msg.name,\n\t\t\t\t}),\n\t\t\t);\n\t\t}\n\t}\n\n\treturn result;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AACA,SAAS,iBAAiB,kBAAkB;AAuBrC,IAAM,yBAAN,MAA6B;AAAA,EACnC,YAAoB,QAA8B;AAA9B;AAAA,EAA+B;AAAA,EAEnD,QAAQ,aAAqB,MAAoC;AAChE,UAAM,QAAQ,YAAY,MAAM,GAAG;AAEnC,QAAI,MAAM,WAAW,GAAG;AACvB,YAAM,CAAC,UAAU,SAAS,IAAI;AAC9B,aAAO,KAAK,kBAAkB,UAAU,WAAW,WAAW,IAAI;AAAA,IACnE;AAEA,QAAI,MAAM,WAAW,GAAG;AACvB,YAAM,CAAC,UAAU,YAAY,SAAS,IAAI;AAC1C,aAAO,KAAK,kBAAkB,UAAU,YAAY,WAAW,IAAI;AAAA,IACpE;AAEA,UAAM,IAAI;AAAA,MACT;AAAA,IACD;AAAA,EACD;AAAA,EAEQ,kBACP,UACA,YACA,WACA,MACoB;AACpB,YAAQ,UAAU;AAAA,MACjB,KAAK;AACJ,eAAO,KAAK,cAAc,YAAY,WAAW,IAAI;AAAA,MACtD,KAAK;AACJ,eAAO,KAAK,aAAa,YAAY,WAAW,IAAI;AAAA,MACrD;AACC,cAAM,IAAI,MAAM,+BAA+B,QAAQ,EAAE;AAAA,IAC3D;AAAA,EACD;AAAA,EAEQ,cAAc,YAAoB,WAAmB,MAA6B;AACzF,UAAM,iBAAiB,KAAK,OAAO,SAAS,UAAU;AACtD,QAAI,CAAC,gBAAgB;AACpB,YAAM,IAAI,MAAM,kBAAkB,UAAU,oCAAoC;AAAA,IACjF;AAEA,WAAO,IAAI,WAAW;AAAA,MACrB,QAAQ,eAAe;AAAA,MACvB;AAAA,MACA;AAAA,IACD,CAAC;AAAA,EACF;AAAA,EAEQ,aAAa,cAAsB,WAAmB,MAAkC;AAC/F,UAAM,WAAW,KAAK,OAAO,QAAQ,YAAY
;AACjD,QAAI,CAAC,UAAU;AACd,YAAM,IAAI,MAAM,aAAa,YAAY,mCAAmC;AAAA,IAC7E;AAEA,UAAM,aAAa,SAAS,OAAO,KAAK,CAAC,MAAM,EAAE,UAAU,SAAS;AACpE,QAAI,CAAC,YAAY;AAChB,YAAM,IAAI,MAAM,UAAU,SAAS,kCAAkC,YAAY,GAAG;AAAA,IACrF;AAEA,WAAO,IAAI,gBAAgB;AAAA,MAC1B,OAAO,WAAW;AAAA,MAClB,mBAAmB,SAAS;AAAA,MAC5B,4BAA4B,KAAK,oBAAoB,SAAS,QAAQ;AAAA,MACtE,8BAA8B,WAAW;AAAA,MACzC,uBAAuB,WAAW;AAAA,MAClC;AAAA,IACD,CAAC;AAAA,EACF;AAAA,EAEQ,oBAAoB,UAA0B;AACrD,QAAI;AACH,YAAM,MAAM,IAAI,IAAI,QAAQ;AAC5B,aAAO,IAAI,SAAS,MAAM,GAAG,EAAE,CAAC;AAAA,IACjC,SAAS,GAAG;AACX,aAAO;AAAA,IACR;AAAA,EACD;AACD;;;ACtGA,SAAS,WAAwB,cAAc,mBAAmB;AAE3D,SAAS,2BAA2B,UAAoC;AAC9E,QAAM,SAAwB,CAAC;AAC/B,MAAI,QAAQ;AACZ,MAAI,qBAA+B,CAAC;AAEpC,aAAW,OAAO,UAAU;AAC3B,QAAI,IAAI,SAAS,SAAS;AACzB,aAAO;AAAA,QACN,IAAI,aAAa;AAAA,UAChB,SAAS,IAAI,QAAQ,IAAI,CAAC,MAAM;AAC/B,gBAAI,EAAE,SAAS,SAAS;AACvB,qBAAO,EAAE,MAAM,aAAa,WAAW,EAAE,KAAK,EAAE,IAAI,EAAE;AAAA,YACvD;AACA,mBAAO;AAAA,UACR,CAAC;AAAA,QACF,CAAC;AAAA,MACF;AAAA,IACD,WAAW,IAAI,SAAS,MAAM;AAC7B,UAAI,IAAI,aAAa,IAAI,UAAU,SAAS,GAAG;AAC9C,6BAAqB,IAAI,UAAU,IAAI,MAAM,MAAM,EAAE,KAAK,EAAE;AAC5D,eAAO;AAAA,UACN,IAAI,UAAU;AAAA,YACb,SAAS,IAAI;AAAA,YACb,YAAY,IAAI,UAAU,IAAI,CAAC,IAAI,OAAO;AAAA,cACzC,IAAI,mBAAmB,CAAC;AAAA,cACxB,MAAM,GAAG;AAAA,cACT,MAAM,GAAG,QAAQ,KAAK,MAAM,GAAG,KAAK,IAAI,CAAC;AAAA,YAC1C,EAAE;AAAA,UACH,CAAC;AAAA,QACF;AAAA,MACD,OAAO;AACN,eAAO,KAAK,IAAI,UAAU,IAAI,OAAO,CAAC;AAAA,MACvC;AAAA,IACD,WAAW,IAAI,SAAS,QAAQ;AAC/B,YAAM,aAAa,mBAAmB,MAAM;AAC5C,UAAI,CAAC;AACJ,cAAM,IAAI,MAAM,oBAAoB,IAAI,IAAI,gDAAgD;AAC7F,aAAO;AAAA,QACN,IAAI,YAAY;AAAA,UACf,SAAS,IAAI;AAAA,UACb,cAAc;AAAA,UACd,MAAM,IAAI;AAAA,QACX,CAAC;AAAA,MACF;AAAA,IACD;AAAA,EACD;AAEA,SAAO;AACR;","names":[]}

package/dist/eval/index.d.mts CHANGED Viewed

@@ -1,20 +1,28 @@
-import { L as LangchainModelConfig } from '../model-resolver-BRAaBV9n.mjs';
+import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, a as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-U0J9x1a6.mjs';
+import * as zod from 'zod';
+import { z } from 'zod';
 import { BaseMessage } from '@langchain/core/messages';
+/** Factory that creates a fresh Agent per test case. Receives extra suite-level tools as ToolDefinition[]. */
+type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
 interface EvalConfig {
+    /** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
     modelConfig: LangchainModelConfig;
-    model: string;
-    /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). Defaults to `model`. */
-    evaluatorModel?: string;
-    /** System prompt prepended to every eval invocation. Can be overridden per-suite or per-case. */
+    /** Required for model-based target. Also used as fallback for evaluatorModel. */
+    model?: string;
+    /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
+    evaluatorModel: string;
+    /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
     systemPrompt?: string;
+    /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
+    createTarget?: CreateTargetFn;
 }
 declare function configureEvals(config: EvalConfig): void;
 interface MockToolDef {
     name: string;
     description: string;
-    schema: Record<string, unknown>;
+    schema: z.ZodObject<any> | Record<string, unknown>;
     /**
      * Canned response the mock tool returns.
      * Can be a static string, or a function that receives input and returns a response.
@@ -60,23 +68,13 @@ declare function contains(strings: string[]): Expectation;
 /** Assert the response does not contain any of the given strings. */
 declare function notContains(strings: string[]): Expectation;
-declare function human(content: string): {
-    role: "human";
-    content: string;
-};
-declare function ai(content: string, toolCalls?: string[]): {
-    toolCalls?: string[] | undefined;
-    role: "ai";
-    content: string;
-};
-declare function toolResult(content: string): {
-    role: "tool";
-    content: string;
-};
-type Message = ReturnType<typeof human> | ReturnType<typeof ai> | ReturnType<typeof toolResult>;
+declare function human(content: string): HumanMessage;
+declare function ai(content: string, toolCalls?: string[]): AiMessage;
+declare function toolResult(name: string, output: string): ToolMessage;
 interface ToolDef {
     description: string;
-    schema?: Record<string, string>;
+    /** A plain key→description record, or a ZodObject passed through from a ToolSpec. */
+    schema?: Record<string, string> | zod.ZodObject<any>;
     /** Auto-stringified if not a string or function. */
     response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
 }
@@ -99,11 +97,21 @@ type TargetFn = (inputs: {
 interface SuiteConfig {
     /** Custom target function, or model string override. Auto-created from global config if omitted. */
     target?: TargetFn | string;
+    /** Factory that creates a fresh Agent per test case. Overrides global createTarget. */
+    createTarget?: CreateTargetFn;
     /** System prompt for all cases in this suite. Overrides the global prompt; can be overridden per-case. */
     systemPrompt?: string;
-    tools: Record<string, ToolDef>;
+    tools?: Record<string, ToolDef>;
     cases: TestCase[];
 }
+/**
+ * Converts a `ToolSpec[]` (from a real tool provider) into the
+ * `Record<string, ToolDef>` that `defineSuite` expects.
+ *
+ * `responses` maps tool names to canned mock responses. Tools without an
+ * entry in `responses` default to `''`.
+ */
+declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
 declare function defineSuite(name: string, config: SuiteConfig): void;
-export { type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
+export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };

package/dist/eval/index.d.ts CHANGED Viewed

@@ -1,20 +1,28 @@
-import { L as LangchainModelConfig } from '../model-resolver-BRAaBV9n.js';
+import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, a as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-U0J9x1a6.js';
+import * as zod from 'zod';
+import { z } from 'zod';
 import { BaseMessage } from '@langchain/core/messages';
+/** Factory that creates a fresh Agent per test case. Receives extra suite-level tools as ToolDefinition[]. */
+type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
 interface EvalConfig {
+    /** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
     modelConfig: LangchainModelConfig;
-    model: string;
-    /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). Defaults to `model`. */
-    evaluatorModel?: string;
-    /** System prompt prepended to every eval invocation. Can be overridden per-suite or per-case. */
+    /** Required for model-based target. Also used as fallback for evaluatorModel. */
+    model?: string;
+    /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
+    evaluatorModel: string;
+    /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
     systemPrompt?: string;
+    /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
+    createTarget?: CreateTargetFn;
 }
 declare function configureEvals(config: EvalConfig): void;
 interface MockToolDef {
     name: string;
     description: string;
-    schema: Record<string, unknown>;
+    schema: z.ZodObject<any> | Record<string, unknown>;
     /**
      * Canned response the mock tool returns.
      * Can be a static string, or a function that receives input and returns a response.
@@ -60,23 +68,13 @@ declare function contains(strings: string[]): Expectation;
 /** Assert the response does not contain any of the given strings. */
 declare function notContains(strings: string[]): Expectation;
-declare function human(content: string): {
-    role: "human";
-    content: string;
-};
-declare function ai(content: string, toolCalls?: string[]): {
-    toolCalls?: string[] | undefined;
-    role: "ai";
-    content: string;
-};
-declare function toolResult(content: string): {
-    role: "tool";
-    content: string;
-};
-type Message = ReturnType<typeof human> | ReturnType<typeof ai> | ReturnType<typeof toolResult>;
+declare function human(content: string): HumanMessage;
+declare function ai(content: string, toolCalls?: string[]): AiMessage;
+declare function toolResult(name: string, output: string): ToolMessage;
 interface ToolDef {
     description: string;
-    schema?: Record<string, string>;
+    /** A plain key→description record, or a ZodObject passed through from a ToolSpec. */
+    schema?: Record<string, string> | zod.ZodObject<any>;
     /** Auto-stringified if not a string or function. */
     response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
 }
@@ -99,11 +97,21 @@ type TargetFn = (inputs: {
 interface SuiteConfig {
     /** Custom target function, or model string override. Auto-created from global config if omitted. */
     target?: TargetFn | string;
+    /** Factory that creates a fresh Agent per test case. Overrides global createTarget. */
+    createTarget?: CreateTargetFn;
     /** System prompt for all cases in this suite. Overrides the global prompt; can be overridden per-case. */
     systemPrompt?: string;
-    tools: Record<string, ToolDef>;
+    tools?: Record<string, ToolDef>;
     cases: TestCase[];
 }
+/**
+ * Converts a `ToolSpec[]` (from a real tool provider) into the
+ * `Record<string, ToolDef>` that `defineSuite` expects.
+ *
+ * `responses` maps tool names to canned mock responses. Tools without an
+ * entry in `responses` default to `''`.
+ */
+declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
 declare function defineSuite(name: string, config: SuiteConfig): void;
-export { type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
+export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };

package/dist/eval/index.js CHANGED Viewed

@@ -34,6 +34,7 @@ __export(eval_exports, {
   configureEvals: () => configureEvals,
   contains: () => contains,
   defineSuite: () => defineSuite,
+  fromToolSpecs: () => fromToolSpecs,
   human: () => human,
   llmJudge: () => llmJudge,
   noTools: () => noTools,
@@ -105,18 +106,21 @@ var LangchainModelResolver = class {
       tags
     });
   }
-  resolveAzure(configName, deploymentName, tags) {
-    const providerConfig = this.config.azure?.[configName];
-    if (!providerConfig) {
-      throw new Error(`Configuration "${configName}" for provider "azure" is missing`);
+  resolveAzure(resourceName, modelName, tags) {
+    const resource = this.config.azure?.[resourceName];
+    if (!resource) {
+      throw new Error(`Resource "${resourceName}" for provider "azure" is missing`);
+    }
+    const modelEntry = resource.models.find((m) => m.model === modelName);
+    if (!modelEntry) {
+      throw new Error(`Model "${modelName}" not found in Azure resource "${resourceName}"`);
     }
     return new import_openai.AzureChatOpenAI({
-      model: providerConfig.model,
-      // shows (perhaps even uses) 3.5-turbo when not specifid
-      azureOpenAIApiKey: providerConfig.apiKey,
-      azureOpenAIApiInstanceName: this.extractInstanceName(providerConfig.endpoint),
-      azureOpenAIApiDeploymentName: deploymentName,
-      azureOpenAIApiVersion: providerConfig.apiVersion,
+      model: modelEntry.model,
+      azureOpenAIApiKey: resource.apiKey,
+      azureOpenAIApiInstanceName: this.extractInstanceName(resource.endpoint),
+      azureOpenAIApiDeploymentName: modelEntry.deploymentName,
+      azureOpenAIApiVersion: modelEntry.apiVersion,
       tags
     });
   }
@@ -130,51 +134,64 @@ var LangchainModelResolver = class {
   }
 };
-// src/eval/target.ts
-var MAX_AGENT_LOOPS = 10;
-function convertMessages(msgs) {
+// src/runtime/langchain/utils.ts
+var import_langchain = require("langchain");
+function convertToLangchainMessages(messages) {
   const result = [];
   let tcIdx = 0;
-  let pendingToolCalls = [];
-  for (const msg of msgs) {
+  let pendingToolCallIds = [];
+  for (const msg of messages) {
     if (msg.role === "human") {
-      result.push(new import_messages.HumanMessage(msg.content));
+      result.push(
+        new import_langchain.HumanMessage({
+          content: msg.content.map((c) => {
+            if (c.type === "image") {
+              return { type: "image_url", image_url: { url: c.url } };
+            }
+            return c;
+          })
+        })
+      );
     } else if (msg.role === "ai") {
       if (msg.toolCalls && msg.toolCalls.length > 0) {
-        pendingToolCalls = msg.toolCalls.map((name) => ({
-          id: `hist_tc${++tcIdx}`,
-          name
-        }));
+        pendingToolCallIds = msg.toolCalls.map(() => `tc_${++tcIdx}`);
         result.push(
-          new import_messages.AIMessage({
+          new import_langchain.AIMessage({
             content: msg.content,
-            tool_calls: pendingToolCalls.map((tc) => ({
-              id: tc.id,
+            tool_calls: msg.toolCalls.map((tc, i) => ({
+              id: pendingToolCallIds[i],
               name: tc.name,
-              args: {}
+              args: tc.input ? JSON.parse(tc.input) : {}
             }))
           })
         );
       } else {
-        result.push(new import_messages.AIMessage(msg.content));
+        result.push(new import_langchain.AIMessage(msg.content));
       }
     } else if (msg.role === "tool") {
-      const tc = pendingToolCalls.shift();
-      if (!tc) throw new Error("toolResult() without a preceding ai() with toolCalls");
+      const toolCallId = pendingToolCallIds.shift();
+      if (!toolCallId)
+        throw new Error(`ToolMessage for "${msg.name}" without a preceding AiMessage with toolCalls`);
       result.push(
-        new import_messages.ToolMessage({
-          content: msg.content,
-          tool_call_id: tc.id,
-          name: tc.name
+        new import_langchain.ToolMessage({
+          content: msg.output,
+          tool_call_id: toolCallId,
+          name: msg.name
         })
       );
     }
   }
   return result;
 }
+// src/eval/target.ts
+var MAX_AGENT_LOOPS = 10;
 function createEvalTarget(modelConfig, modelString) {
   return async (inputs) => {
     const config = modelConfig && modelString ? { modelConfig, model: modelString } : getEvalConfig();
+    if (!config.model) {
+      throw new Error("model is required for model-based target. Add it to your configureEvals() call.");
+    }
     const resolver = new LangchainModelResolver(config.modelConfig);
     const model = resolver.resolve(config.model);
     const toolCallCounts = {};
@@ -191,7 +208,7 @@ function createEvalTarget(modelConfig, modelString) {
         {
           name: mockTool.name,
           description: mockTool.description,
-          schema: import_zod.z.object(
+          schema: mockTool.schema instanceof import_zod.z.ZodObject ? mockTool.schema : import_zod.z.object(
             Object.fromEntries(
               Object.entries(mockTool.schema).map(([key, val]) => {
                 if (typeof val === "string") return [key, import_zod.z.string().describe(val)];
@@ -208,8 +225,7 @@ function createEvalTarget(modelConfig, modelString) {
     if (inputs.systemPrompt) {
       messages.push(new import_messages.SystemMessage(inputs.systemPrompt));
     }
-    const inputMessages = inputs.messages ?? (inputs.userMessages ?? []).map((content) => ({ role: "human", content }));
-    messages.push(...convertMessages(inputMessages));
+    messages.push(...convertToLangchainMessages(inputs.messages));
     let loopCount = 0;
     while (loopCount < MAX_AGENT_LOOPS) {
       loopCount++;
@@ -244,16 +260,107 @@ function createEvalTarget(modelConfig, modelString) {
     return { messages };
   };
 }
+function agentResultToMessages(inputMessages, result) {
+  const messages = convertToLangchainMessages(inputMessages);
+  let pendingToolCalls = [];
+  for (const block of result.content) {
+    if (block.type === "tool_call") {
+      const tc = block;
+      pendingToolCalls.push({
+        id: tc.toolCallId,
+        name: tc.name,
+        args: tc.input ? JSON.parse(tc.input) : {},
+        output: tc.output
+      });
+    } else if (block.type === "text") {
+      if (pendingToolCalls.length > 0) {
+        messages.push(
+          new import_messages.AIMessage({
+            content: "",
+            tool_calls: pendingToolCalls.map((tc) => ({ id: tc.id, name: tc.name, args: tc.args }))
+          })
+        );
+        for (const tc of pendingToolCalls) {
+          messages.push(new import_messages.ToolMessage({ content: tc.output, tool_call_id: tc.id, name: tc.name }));
+        }
+        pendingToolCalls = [];
+      }
+      messages.push(new import_messages.AIMessage(block.output));
+    }
+  }
+  if (pendingToolCalls.length > 0) {
+    messages.push(
+      new import_messages.AIMessage({
+        content: "",
+        tool_calls: pendingToolCalls.map((tc) => ({ id: tc.id, name: tc.name, args: tc.args }))
+      })
+    );
+    for (const tc of pendingToolCalls) {
+      messages.push(new import_messages.ToolMessage({ content: tc.output, tool_call_id: tc.id, name: tc.name }));
+    }
+  }
+  return messages;
+}
+function toolDefsToDefinitions(defs) {
+  const callCounts = {};
+  return Object.entries(defs).map(([name, def]) => {
+    callCounts[name] = 0;
+    return {
+      name,
+      toolKit: "eval-mock",
+      description: def.description,
+      inputSchema: def.schema instanceof import_zod.z.ZodObject ? def.schema : import_zod.z.object(
+        Object.fromEntries(
+          Object.entries(def.schema ?? {}).map(([key, val]) => {
+            if (typeof val === "string") return [key, import_zod.z.string().describe(val)];
+            return [key, import_zod.z.any()];
+          })
+        )
+      ),
+      exec: async (input) => {
+        callCounts[name]++;
+        if (typeof def.response === "function") {
+          return def.response(
+            input,
+            callCounts[name]
+          );
+        }
+        return typeof def.response === "string" ? def.response : JSON.stringify(def.response);
+      }
+    };
+  });
+}
+async function runAgentTarget(createTarget, evalMessages, extraToolDefs) {
+  const extraTools = Object.keys(extraToolDefs).length > 0 ? toolDefsToDefinitions(extraToolDefs) : [];
+  const agent = await createTarget(extraTools);
+  const result = await agent.run({
+    threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
+    messages: evalMessages
+  });
+  return { messages: agentResultToMessages(evalMessages, result) };
+}
 // src/eval/suite.ts
 function human(content) {
-  return { role: "human", content };
+  return { role: "human", content: [{ type: "text", text: content }] };
 }
 function ai(content, toolCalls) {
-  return { role: "ai", content, ...toolCalls ? { toolCalls } : {} };
+  return { role: "ai", content, ...toolCalls ? { toolCalls: toolCalls.map((name) => ({ name })) } : {} };
 }
-function toolResult(content) {
-  return { role: "tool", content };
+function toolResult(name, output) {
+  return { role: "tool", name, output };
+}
+function fromToolSpecs(specs, responses = {}) {
+  return Object.fromEntries(
+    specs.map((spec) => [
+      spec.name,
+      {
+        description: spec.description,
+        schema: spec.inputSchema,
+        response: responses[spec.name] ?? ""
+      }
+    ])
+  );
 }
 function toMockTools(defs) {
   return Object.entries(defs).map(([name, def]) => ({
@@ -266,51 +373,67 @@ function toMockTools(defs) {
 function toSerializableTools(tools) {
   return tools.map((t) => ({
     ...t,
+    schema: t.schema instanceof Object && "shape" in t.schema ? "<ZodObject>" : t.schema,
     response: typeof t.response === "function" ? "<function>" : t.response
   }));
 }
 function lastHumanContent(messages) {
   for (let i = messages.length - 1; i >= 0; i--) {
-    if (messages[i].role === "human") return messages[i].content;
+    const msg = messages[i];
+    if (msg.role === "human") {
+      const textBlock = msg.content.find((c) => c.type === "text");
+      return textBlock ? textBlock.text : "";
+    }
   }
-  return messages[0]?.content ?? "";
+  return "";
 }
-function resolveTarget(config) {
+function resolveModelTarget(config) {
   if (typeof config.target === "function") return config.target;
   const evalConfig = getEvalConfig();
+  if (!evalConfig.model && typeof config.target !== "string") {
+    throw new Error("model is required for model-based target. Add it to your configureEvals() call.");
+  }
   const model = typeof config.target === "string" ? config.target : evalConfig.model;
   return createEvalTarget(evalConfig.modelConfig, model);
 }
+function resolveCreateTarget(config) {
+  return config.createTarget ?? getEvalConfig().createTarget;
+}
 function defineSuite(name, config) {
-  const target = resolveTarget(config);
-  const suiteTools = config.tools;
-  const globalPrompt = getEvalConfig().systemPrompt;
+  const suiteTools = config.tools ?? {};
+  const createTarget = config.target ? void 0 : resolveCreateTarget(config);
   ls.describe(name, () => {
     for (const tc of config.cases) {
       const testName = tc.name ?? lastHumanContent(tc.messages);
-      const tools = toMockTools(tc.tools ?? suiteTools);
+      const caseToolDefs = tc.tools ?? suiteTools;
+      const tools = toMockTools(caseToolDefs);
       const ctx = { message: lastHumanContent(tc.messages) };
       const resolved = tc.expect.map((exp) => exp(ctx));
       const evaluators = resolved.map((r) => r.evaluator);
       const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
-      const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
-      const targetInputs = {
-        messages: tc.messages,
-        tools,
-        ...systemPrompt ? { systemPrompt } : {}
-      };
       ls.test(
         testName,
         {
           inputs: {
             messages: tc.messages,
-            tools: toSerializableTools(tools),
-            ...systemPrompt ? { systemPrompt } : {}
+            tools: toSerializableTools(tools)
           },
           referenceOutputs
         },
         async ({ referenceOutputs: refOut }) => {
-          const output = await target(targetInputs);
+          let output;
+          if (createTarget) {
+            output = await runAgentTarget(createTarget, tc.messages, caseToolDefs);
+          } else {
+            const target = resolveModelTarget(config);
+            const globalPrompt = getEvalConfig().systemPrompt;
+            const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
+            output = await target({
+              messages: tc.messages,
+              tools,
+              ...systemPrompt ? { systemPrompt } : {}
+            });
+          }
           ls.logOutputs(output);
           for (const evaluator of evaluators) {
             await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
@@ -462,7 +585,7 @@ function toolsCalled(tools) {
 function llmJudge() {
   return () => {
     const config = getEvalConfig();
-    const model = config.evaluatorModel ?? config.model;
+    const model = config.evaluatorModel;
     return {
       evaluator: ls2.wrapEvaluator(
         withTrajectoryGuard(
@@ -483,7 +606,7 @@ function noTools() {
 function respondsInLanguage(code) {
   return () => {
     const config = getEvalConfig();
-    const model = config.evaluatorModel ?? config.model;
+    const model = config.evaluatorModel;
     return {
       evaluator: ls2.wrapEvaluator(createLanguageEvaluator(config.modelConfig, model)),
       referenceOutputs: { expectedLanguage: code }
@@ -508,6 +631,7 @@ function notContains(strings) {
   configureEvals,
   contains,
   defineSuite,
+  fromToolSpecs,
   human,
   llmJudge,
   noTools,