npm - @toolpack-sdk/agents - Versions diffs - 2.1.1 → 2.3.0 - Mend

@toolpack-sdk/agents 2.1.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

package/README.md +72 -4
package/dist/{base-agent-DPdK4Pnl.d.cts → base-agent-65162dq7.d.cts} +1 -1
package/dist/{base-agent-nU8pr4nu.d.ts → base-agent-DzspMyaG.d.ts} +1 -1
package/dist/capabilities/index.cjs +2 -2
package/dist/capabilities/index.d.cts +3 -3
package/dist/capabilities/index.d.ts +3 -3
package/dist/capabilities/index.js +3 -3
package/dist/channels/index.cjs +2 -2
package/dist/channels/index.d.cts +2 -2
package/dist/channels/index.d.ts +2 -2
package/dist/channels/index.js +2 -2
package/dist/eval-report-BUD6NRiP.d.cts +343 -0
package/dist/eval-report-C6dSvR3Y.d.ts +343 -0
package/dist/{index-Du6S0eG7.d.cts → index-BxUlu-qG.d.ts} +76 -2
package/dist/{index-o8Lbzv5N.d.ts → index-DrigwC1A.d.cts} +76 -2
package/dist/index.cjs +40 -26
package/dist/index.d.cts +9 -8
package/dist/index.d.ts +9 -8
package/dist/index.js +40 -26
package/dist/{intent-classifier-agent-DxyfJWcm.d.cts → intent-classifier-agent-D0rWtviD.d.cts} +2 -2
package/dist/{intent-classifier-agent-0JZDlhpk.d.ts → intent-classifier-agent-mmNoAozf.d.ts} +2 -2
package/dist/interceptors/index.cjs +1 -1
package/dist/interceptors/index.d.cts +92 -5
package/dist/interceptors/index.d.ts +92 -5
package/dist/interceptors/index.js +1 -1
package/dist/testing/index.cjs +16 -2
package/dist/testing/index.d.cts +3 -2
package/dist/testing/index.d.ts +3 -2
package/dist/testing/index.js +16 -2
package/dist/{types-TB6yypig.d.cts → types-C3eW-auY.d.cts} +6 -8
package/dist/{types-TB6yypig.d.ts → types-C3eW-auY.d.ts} +6 -8
package/package.json +18 -9

package/dist/eval-report-C6dSvR3Y.d.ts ADDED Viewed

@@ -0,0 +1,343 @@
+import { B as BaseAgent } from './base-agent-DzspMyaG.js';
+/**
+ * Eval primitives — shared types across EvalDataset, EvalRunner, EvalScorer, and EvalReport.
+ */
+/**
+ * A single eval case: an input fed to the agent and the expected output used
+ * for scoring.
+ *
+ * `id`, `input.message`, and `expectedOutput` are required; every other field
+ * is optional.
+ */
+interface EvalCase {
+    /** Unique identifier for this case. */
+    id: string;
+    /** The input passed to `agent.invokeAgent()`. */
+    input: {
+        /** The only required input field. */
+        message: string;
+        // NOTE(review): the three optional fields below are presumably forwarded
+        // to the agent as-is; their semantics are defined by the agent input
+        // type, which is not visible in this file — confirm.
+        intent?: string;
+        conversationId?: string;
+        context?: Record<string, unknown>;
+    };
+    /**
+     * The expected output used by scorers.
+     * Exact-match and contains scorers compare `actualOutput` against this.
+     * LLM-judge scorers use it as the reference answer.
+     */
+    expectedOutput: string;
+    /** Optional free-form metadata (e.g. tags, difficulty, source). */
+    metadata?: Record<string, unknown>;
+}
+/**
+ * The actual output produced by running a single eval case against an agent.
+ *
+ * NOTE(review): `actualOutput` is a required string even when `error` is set;
+ * this declaration does not say what it holds in that situation (presumably an
+ * empty string) — confirm against the runner implementation.
+ */
+interface EvalCaseResult {
+    /** The eval case that was run. */
+    evalCase: EvalCase;
+    /** The output produced by the agent. */
+    actualOutput: string;
+    /** Wall-clock duration in milliseconds. */
+    durationMs: number;
+    /** Error message if the agent threw, otherwise undefined. */
+    error?: string;
+}
+/**
+ * The result of running an entire dataset through an agent.
+ *
+ * This is the unscored form; scorers take an `EvalRun` and return an
+ * `EvalScoredRun`.
+ */
+interface EvalRun {
+    /** Identifier for this run (e.g. "v1.2", "pr-456"). */
+    runId: string;
+    /**
+     * ISO timestamp of when the run started.
+     * Typed as a plain `string`, so the format is a convention rather than
+     * something the type system enforces.
+     */
+    startedAt: string;
+    /** ISO timestamp of when the run completed (also a plain `string`). */
+    completedAt: string;
+    /** Total wall-clock duration in milliseconds. */
+    totalDurationMs: number;
+    /** Per-case results, in dataset order. */
+    results: EvalCaseResult[];
+}
+/**
+ * The verdict for a single scored case.
+ *
+ * Strictly two-state: there is no separate value for skipped or errored cases.
+ * NOTE(review): how a case whose `EvalCaseResult.error` is set gets scored is
+ * up to each scorer and is not visible here — confirm.
+ */
+type EvalVerdict = 'pass' | 'fail';
+/**
+ * A scored result — wraps an EvalCaseResult with a pass/fail verdict and
+ * an optional explanation.
+ */
+interface EvalScoredResult {
+    /** The underlying (unscored) case result, including the original case. */
+    caseResult: EvalCaseResult;
+    /** Pass or fail. */
+    verdict: EvalVerdict;
+    /**
+     * Optional human-readable explanation of the verdict.
+     * Populated by LLMJudgeScorer; optional for other scorers.
+     */
+    explanation?: string;
+}
+/**
+ * A fully scored run — an EvalRun annotated with per-case verdicts and
+ * aggregate pass/fail counts.
+ *
+ * NOTE(review): presumably `passCount + failCount === scoredResults.length`
+ * and `passRate === passCount / scoredResults.length`; the value of `passRate`
+ * for an empty run is not specified by this declaration — confirm.
+ */
+interface EvalScoredRun {
+    /** The original run. */
+    run: EvalRun;
+    /** Scored results, in run order. */
+    scoredResults: EvalScoredResult[];
+    /** Number of passing cases. */
+    passCount: number;
+    /** Number of failing cases. */
+    failCount: number;
+    /** Pass rate as a fraction between 0 and 1. */
+    passRate: number;
+}
+/**
+ * A regression entry — a case that passed in the baseline but fails in the
+ * candidate.
+ *
+ * Structurally identical to `EvalImprovement`; the two differ only in the
+ * direction of the change they describe.
+ */
+interface EvalRegression {
+    /** Identifier of the affected case (presumably `EvalCase.id` — confirm). */
+    caseId: string;
+    /** Output recorded for this case in the baseline run (presumably its `actualOutput`). */
+    baselineOutput: string;
+    /** Output recorded for this case in the candidate run (presumably its `actualOutput`). */
+    candidateOutput: string;
+}
+/**
+ * An improvement entry — a case that failed in the baseline but passes in the
+ * candidate.
+ *
+ * Structurally identical to `EvalRegression`; the two differ only in the
+ * direction of the change they describe.
+ */
+interface EvalImprovement {
+    /** Identifier of the affected case (presumably `EvalCase.id` — confirm). */
+    caseId: string;
+    /** Output recorded for this case in the baseline run (presumably its `actualOutput`). */
+    baselineOutput: string;
+    /** Output recorded for this case in the candidate run (presumably its `actualOutput`). */
+    candidateOutput: string;
+}
+/**
+ * Comparison report between a baseline scored run and a candidate scored run.
+ *
+ * Produced by `compareEvalRuns()` and rendered by `formatEvalReport()`.
+ */
+interface EvalReport {
+    /** Run identifier of the baseline (presumably `EvalRun.runId` — confirm). */
+    baselineRunId: string;
+    /** Run identifier of the candidate (presumably `EvalRun.runId` — confirm). */
+    candidateRunId: string;
+    /** Pass rate of the baseline run. */
+    baselinePassRate: number;
+    /** Pass rate of the candidate run. */
+    candidatePassRate: number;
+    /** Δ pass rate (candidate − baseline). Positive = improvement. */
+    delta: number;
+    /** Cases that passed in the baseline but fail in the candidate. */
+    regressions: EvalRegression[];
+    /** Cases that failed in the baseline but pass in the candidate. */
+    improvements: EvalImprovement[];
+    /** Cases that passed in both runs (strings, presumably case IDs — confirm). */
+    stablePasses: string[];
+    /** Cases that failed in both runs (strings, presumably case IDs — confirm). */
+    stableFails: string[];
+}
+/**
+ * A collection of eval cases that can be loaded from / saved to JSON.
+ *
+ * Note that `save()` and `load()` have synchronous signatures (they return
+ * `void` / `EvalDataset`, not Promises).
+ *
+ * @example
+ * ```ts
+ * const dataset = new EvalDataset([
+ *   {
+ *     id: 'q1',
+ *     input: { message: 'What is 2 + 2?' },
+ *     expectedOutput: '4',
+ *   },
+ * ]);
+ *
+ * dataset.save('./evals/math.json');
+ *
+ * const loaded = EvalDataset.load('./evals/math.json');
+ * ```
+ */
+declare class EvalDataset {
+    // Internal storage; its type is not exposed by this declaration file.
+    private _cases;
+    /**
+     * @param cases Optional initial cases; may be omitted to start without any.
+     */
+    constructor(cases?: EvalCase[]);
+    /**
+     * All cases in the dataset.
+     * NOTE(review): whether this returns a copy or the internal array is not
+     * visible here — avoid mutating the result until confirmed.
+     */
+    get cases(): EvalCase[];
+    /** Number of cases. */
+    get size(): number;
+    /**
+     * Get a case by ID.
+     * Returns `undefined` if not found.
+     */
+    get(id: string): EvalCase | undefined;
+    /**
+     * Add one or more cases.
+     * Throws if a case with the same ID already exists.
+     *
+     * @returns This dataset, so calls can be chained.
+     */
+    add(...cases: EvalCase[]): this;
+    /**
+     * Remove a case by ID.
+     * Returns `true` if removed, `false` if not found.
+     */
+    remove(id: string): boolean;
+    /**
+     * Filter cases by a predicate. Returns a new EvalDataset.
+     */
+    filter(predicate: (c: EvalCase) => boolean): EvalDataset;
+    /**
+     * Serialize to a plain array (suitable for `JSON.stringify`).
+     */
+    toJSON(): EvalCase[];
+    /**
+     * Save cases to a JSON file.
+     *
+     * @param filePath Absolute or relative path to the output file.
+     */
+    save(filePath: string): void;
+    /**
+     * Load cases from a JSON file.
+     * The file must contain a JSON array of `EvalCase` objects.
+     *
+     * @param filePath Absolute or relative path to the JSON file.
+     * @returns A new dataset holding the loaded cases.
+     */
+    static load(filePath: string): EvalDataset;
+    /**
+     * Create an `EvalDataset` from a plain array (e.g. from a database query).
+     */
+    static from(cases: EvalCase[]): EvalDataset;
+}
+/**
+ * Options accepted by `EvalRunner.run()`. Both fields are optional.
+ */
+interface EvalRunnerOptions {
+    /**
+     * Identifier for this run — use something meaningful like a version or PR number.
+     * Defaults to a timestamp string.
+     */
+    runId?: string;
+    /**
+     * Concurrency limit — how many cases to run in parallel.
+     * Defaults to 1 (sequential) to avoid overwhelming the provider.
+     */
+    concurrency?: number;
+}
+/**
+ * Runs an agent against every case in an `EvalDataset` and collects the
+ * results into an `EvalRun`.
+ *
+ * @example
+ * ```ts
+ * const runner = new EvalRunner(agent);
+ * const run = await runner.run(dataset, { runId: 'v1.2' });
+ *
+ * console.log(`${run.results.length} cases run in ${run.totalDurationMs}ms`);
+ * ```
+ */
+declare class EvalRunner {
+    // The agent under evaluation, as passed to the constructor.
+    private agent;
+    /**
+     * @param agent The agent to evaluate.
+     */
+    constructor(agent: BaseAgent);
+    /**
+     * Run all cases in the dataset and return an `EvalRun`.
+     *
+     * @param dataset The cases to run.
+     * @param options Optional run identifier and concurrency limit.
+     * @returns A promise for the completed run.
+     *
+     * NOTE(review): `EvalCaseResult.error` suggests that a throwing agent is
+     * recorded per case rather than rejecting the whole run — confirm.
+     */
+    run(dataset: EvalDataset, options?: EvalRunnerOptions): Promise<EvalRun>;
+}
+/**
+ * A scorer evaluates each `EvalCaseResult` in a run and produces a
+ * pass/fail verdict with an optional explanation.
+ *
+ * Implement this interface to create custom scoring logic.
+ *
+ * `score` receives the whole run and must return a complete `EvalScoredRun`,
+ * so an implementation also has to supply the aggregate fields (`passCount`,
+ * `failCount`, `passRate`), not just the per-case verdicts.
+ */
+interface EvalScorer {
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Passes a case when `actualOutput` exactly equals `expectedOutput`.
+ * Optionally case-insensitive and/or trimmed.
+ *
+ * @example
+ * ```ts
+ * const scorer = new ExactMatchScorer({ trim: true, caseInsensitive: true });
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class ExactMatchScorer implements EvalScorer {
+    private trim;
+    private caseInsensitive;
+    /**
+     * @param options Optional comparison settings; both flags may be omitted.
+     *   NOTE(review): the defaults used when a flag is omitted are not visible
+     *   in this declaration — confirm against the implementation.
+     */
+    constructor(options?: {
+        trim?: boolean;
+        caseInsensitive?: boolean;
+    });
+    /** Score every case in the run by exact comparison. */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Passes a case when `actualOutput` contains `expectedOutput` as a substring.
+ * Optionally case-insensitive.
+ *
+ * Unlike `ExactMatchScorer`, there is no `trim` option.
+ *
+ * @example
+ * ```ts
+ * const scorer = new ContainsScorer({ caseInsensitive: true });
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class ContainsScorer implements EvalScorer {
+    private caseInsensitive;
+    /**
+     * @param options Optional comparison settings.
+     *   NOTE(review): the default for `caseInsensitive` is not visible in this
+     *   declaration — confirm against the implementation.
+     */
+    constructor(options?: {
+        caseInsensitive?: boolean;
+    });
+    /** Score every case in the run by substring containment. */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Options accepted by the `LLMJudgeScorer` constructor.
+ */
+interface LLMJudgeScorerOptions {
+    /**
+     * Custom judge prompt template.
+     * Use `{{question}}`, `{{expected}}`, and `{{actual}}` as placeholders.
+     * Must instruct the LLM to respond with only "pass" or "fail" on the first line,
+     * optionally followed by an explanation.
+     */
+    promptTemplate?: string;
+}
+/**
+ * Uses an LLM agent as a judge to score each case.
+ * The judge is prompted with the question, expected answer, and actual answer.
+ *
+ * @example
+ * ```ts
+ * const judgeAgent = new MyAgent({ toolpack });
+ * const scorer = new LLMJudgeScorer(judgeAgent);
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class LLMJudgeScorer implements EvalScorer {
+    private judgeAgent;
+    private promptTemplate;
+    /**
+     * @param judgeAgent The agent that acts as the judge.
+     * @param options Optional settings; see `LLMJudgeScorerOptions` for the
+     *   custom prompt template.
+     */
+    constructor(judgeAgent: BaseAgent, options?: LLMJudgeScorerOptions);
+    /** Score every case in the run by asking the judge agent. */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Wraps a user-supplied scoring function.
+ *
+ * @example
+ * ```ts
+ * const scorer = new CustomScorer(async (result) => {
+ *   const pass = result.actualOutput.includes('Paris');
+ *   return { verdict: pass ? 'pass' : 'fail' };
+ * });
+ * ```
+ */
+declare class CustomScorer implements EvalScorer {
+    private fn;
+    /**
+     * @param fn Scoring callback. It receives a single `EvalCaseResult` and
+     *   must return a Promise of a verdict, optionally with an explanation
+     *   (a plain synchronous return value does not satisfy this signature).
+     *   NOTE(review): presumably invoked once per result in the run — confirm.
+     */
+    constructor(fn: (result: EvalCaseResult) => Promise<{
+        verdict: EvalVerdict;
+        explanation?: string;
+    }>);
+    /** Score every case in the run using the wrapped function. */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Compares two scored runs and produces a regression/improvement report.
+ *
+ * @param baseline The scored run to compare against.
+ * @param candidate The scored run being evaluated.
+ * @returns An `EvalReport` with both pass rates, their delta, and the
+ *   regressions, improvements, stable passes, and stable fails.
+ *
+ * NOTE(review): cases are presumably matched between the two runs by case ID;
+ * how a case present in only one run is handled is not visible here — confirm.
+ *
+ * @example
+ * ```ts
+ * const report = compareEvalRuns(baselineScoredRun, candidateScoredRun);
+ *
+ * if (report.regressions.length > 0) {
+ *   console.error('Regressions detected:', report.regressions);
+ *   process.exit(1);
+ * }
+ *
+ * console.log(`Pass rate: ${report.baselinePassRate} → ${report.candidatePassRate} (Δ${report.delta > 0 ? '+' : ''}${report.delta.toFixed(2)})`);
+ * ```
+ */
+declare function compareEvalRuns(baseline: EvalScoredRun, candidate: EvalScoredRun): EvalReport;
+/**
+ * Format an `EvalReport` as a human-readable summary string.
+ *
+ * @param report The report to format (e.g. the result of `compareEvalRuns()`).
+ * @returns The summary text; the caller decides where to print or write it.
+ *
+ * @example
+ * ```ts
+ * console.log(formatEvalReport(report));
+ * ```
+ */
+declare function formatEvalReport(report: EvalReport): string;
+export { ContainsScorer as C, type EvalCase as E, LLMJudgeScorer as L, CustomScorer as a, type EvalCaseResult as b, EvalDataset as c, type EvalImprovement as d, type EvalRegression as e, type EvalReport as f, type EvalRun as g, EvalRunner as h, type EvalRunnerOptions as i, type EvalScoredResult as j, type EvalScoredRun as k, type EvalScorer as l, type EvalVerdict as m, ExactMatchScorer as n, type LLMJudgeScorerOptions as o, compareEvalRuns as p, formatEvalReport as q };

package/dist/{index-Du6S0eG7.d.cts → index-BxUlu-qG.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as AgentInput, e as AgentOutput } from './types-TB6yypig.cjs';
+import { A as AgentInput, e as AgentOutput } from './types-C3eW-auY.js';
 import { Participant } from 'toolpack-sdk';
 /**
@@ -924,4 +924,78 @@ declare class SMSChannel extends BaseChannel {
     stop(): Promise<void>;
 }
-export { BaseChannel as B, type CreateJobOptions as C, DiscordChannel as D, EmailChannel as E, type JobStatus as J, SchedulerStore as S, TelegramChannel as T, WebhookChannel as W, type CreateJobResult as a, type DiscordChannelConfig as b, type EmailChannelConfig as c, SMSChannel as d, type SMSChannelConfig as e, ScheduledChannel as f, type ScheduledChannelConfig as g, type ScheduledJob as h, SlackChannel as i, type SlackChannelConfig as j, type TelegramChannelConfig as k, type WebhookChannelConfig as l };
+/**
+ * Configuration accepted by the `McpChannel` constructor.
+ */
+interface McpChannelConfig {
+    /**
+     * Maximum milliseconds to wait for the agent to respond.
+     * Default: 120_000 (2 minutes).
+     */
+    timeout?: number;
+}
+/**
+ * Channel that connects a Toolpack agent to an MCP server as a tool.
+ *
+ * Unlike other channels (Slack, Webhook) this channel does not own a server or
+ * socket. Instead it exposes a `trigger()` method that the MCP tools/call handler
+ * calls directly. The agent runs and sends its output back through `send()`, which
+ * resolves the Promise that `trigger()` is waiting on.
+ *
+ * Usage:
+ * ```typescript
+ * const ch = new McpChannel();
+ * const agent = new PrReviewerAgent({ channels: [ch] });
+ * await agent.start();
+ *
+ * await sdk.startMcpServer({
+ *   transport: 'stdio',
+ *   agents: [ch.asAgentDefinition(agent)],
+ * });
+ * ```
+ *
+ * ⚠ An McpChannel supports only one in-flight call at a time. If two tools/call
+ * requests arrive for the same channel simultaneously, the second call's
+ * pendingResolve overwrites the first and the first call's result is lost.
+ * Create one McpChannel per agent instance and do not share channels.
+ */
+declare class McpChannel extends BaseChannel {
+    // NOTE(review): the meaning of this flag is defined by BaseChannel, which is
+    // not visible in this hunk; it is `false` here even though the channel is
+    // driven through trigger() — confirm this is intended.
+    readonly isTriggerChannel = false;
+    // Timeout setting (presumably taken from `McpChannelConfig.timeout` — confirm).
+    private readonly _timeout;
+    // Single pending-resolver slot; this is why only one call can be in flight.
+    private _pendingResolve?;
+    /**
+     * @param config Optional configuration (response timeout).
+     */
+    constructor(config?: McpChannelConfig);
+    /**
+     * No-op — McpChannel is driven by trigger(), not a background listener.
+     */
+    listen(): void;
+    /**
+     * Resolves the pending trigger() Promise with the agent's output.
+     */
+    send(output: AgentOutput): Promise<void>;
+    /**
+     * Convert raw MCP arguments into AgentInput.
+     * If args contains a string 'message' field it is used as the message;
+     * otherwise the entire args object is JSON-stringified as the message.
+     */
+    normalize(incoming: unknown): AgentInput;
+    /**
+     * Called by the MCP tools/call handler.
+     * Triggers the agent and waits for it to respond via send().
+     * Rejects if the agent does not respond within the configured timeout.
+     *
+     * @param args Raw tool-call arguments.
+     * @returns A promise for the agent's response as a string.
+     */
+    trigger(args: Record<string, unknown>): Promise<string>;
+    /**
+     * Produce an McpAgentDefinition suitable for startMcpServer({ agents: [...] }).
+     *
+     * @param agent  Object with name and description (typically a BaseAgent instance).
+     * @param inputSchema  Optional JSON Schema for the tool's input parameters.
+     * @returns An object carrying `name`, `description`, the optional
+     *   `inputSchema`, and an `invoke` function that takes the tool arguments
+     *   and returns a promise for the response string.
+     */
+    asAgentDefinition(agent: {
+        name: string;
+        description: string;
+    }, inputSchema?: Record<string, unknown>): {
+        invoke: (args: Record<string, unknown>) => Promise<string>;
+        inputSchema?: Record<string, unknown> | undefined;
+        name: string;
+        description: string;
+    };
+}
+export { BaseChannel as B, type CreateJobOptions as C, DiscordChannel as D, EmailChannel as E, type JobStatus as J, McpChannel as M, SchedulerStore as S, TelegramChannel as T, WebhookChannel as W, type CreateJobResult as a, type DiscordChannelConfig as b, type EmailChannelConfig as c, type McpChannelConfig as d, SMSChannel as e, type SMSChannelConfig as f, ScheduledChannel as g, type ScheduledChannelConfig as h, type ScheduledJob as i, SlackChannel as j, type SlackChannelConfig as k, type TelegramChannelConfig as l, type WebhookChannelConfig as m };

package/dist/{index-o8Lbzv5N.d.ts → index-DrigwC1A.d.cts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as AgentInput, e as AgentOutput } from './types-TB6yypig.js';
+import { A as AgentInput, e as AgentOutput } from './types-C3eW-auY.cjs';
 import { Participant } from 'toolpack-sdk';
 /**
@@ -924,4 +924,78 @@ declare class SMSChannel extends BaseChannel {
     stop(): Promise<void>;
 }
-export { BaseChannel as B, type CreateJobOptions as C, DiscordChannel as D, EmailChannel as E, type JobStatus as J, SchedulerStore as S, TelegramChannel as T, WebhookChannel as W, type CreateJobResult as a, type DiscordChannelConfig as b, type EmailChannelConfig as c, SMSChannel as d, type SMSChannelConfig as e, ScheduledChannel as f, type ScheduledChannelConfig as g, type ScheduledJob as h, SlackChannel as i, type SlackChannelConfig as j, type TelegramChannelConfig as k, type WebhookChannelConfig as l };
+/**
+ * Configuration accepted by the `McpChannel` constructor.
+ */
+interface McpChannelConfig {
+    /**
+     * Maximum milliseconds to wait for the agent to respond.
+     * Default: 120_000 (2 minutes).
+     */
+    timeout?: number;
+}
+/**
+ * Channel that connects a Toolpack agent to an MCP server as a tool.
+ *
+ * Unlike other channels (Slack, Webhook) this channel does not own a server or
+ * socket. Instead it exposes a `trigger()` method that the MCP tools/call handler
+ * calls directly. The agent runs and sends its output back through `send()`, which
+ * resolves the Promise that `trigger()` is waiting on.
+ *
+ * Usage:
+ * ```typescript
+ * const ch = new McpChannel();
+ * const agent = new PrReviewerAgent({ channels: [ch] });
+ * await agent.start();
+ *
+ * await sdk.startMcpServer({
+ *   transport: 'stdio',
+ *   agents: [ch.asAgentDefinition(agent)],
+ * });
+ * ```
+ *
+ * ⚠ An McpChannel supports only one in-flight call at a time. If two tools/call
+ * requests arrive for the same channel simultaneously, the second call's
+ * pendingResolve overwrites the first and the first call's result is lost.
+ * Create one McpChannel per agent instance and do not share channels.
+ */
+declare class McpChannel extends BaseChannel {
+    // NOTE(review): the meaning of this flag is defined by BaseChannel, which is
+    // not visible in this hunk; it is `false` here even though the channel is
+    // driven through trigger() — confirm this is intended.
+    readonly isTriggerChannel = false;
+    // Timeout setting (presumably taken from `McpChannelConfig.timeout` — confirm).
+    private readonly _timeout;
+    // Single pending-resolver slot; this is why only one call can be in flight.
+    private _pendingResolve?;
+    /**
+     * @param config Optional configuration (response timeout).
+     */
+    constructor(config?: McpChannelConfig);
+    /**
+     * No-op — McpChannel is driven by trigger(), not a background listener.
+     */
+    listen(): void;
+    /**
+     * Resolves the pending trigger() Promise with the agent's output.
+     */
+    send(output: AgentOutput): Promise<void>;
+    /**
+     * Convert raw MCP arguments into AgentInput.
+     * If args contains a string 'message' field it is used as the message;
+     * otherwise the entire args object is JSON-stringified as the message.
+     */
+    normalize(incoming: unknown): AgentInput;
+    /**
+     * Called by the MCP tools/call handler.
+     * Triggers the agent and waits for it to respond via send().
+     * Rejects if the agent does not respond within the configured timeout.
+     *
+     * @param args Raw tool-call arguments.
+     * @returns A promise for the agent's response as a string.
+     */
+    trigger(args: Record<string, unknown>): Promise<string>;
+    /**
+     * Produce an McpAgentDefinition suitable for startMcpServer({ agents: [...] }).
+     *
+     * @param agent  Object with name and description (typically a BaseAgent instance).
+     * @param inputSchema  Optional JSON Schema for the tool's input parameters.
+     * @returns An object carrying `name`, `description`, the optional
+     *   `inputSchema`, and an `invoke` function that takes the tool arguments
+     *   and returns a promise for the response string.
+     */
+    asAgentDefinition(agent: {
+        name: string;
+        description: string;
+    }, inputSchema?: Record<string, unknown>): {
+        invoke: (args: Record<string, unknown>) => Promise<string>;
+        inputSchema?: Record<string, unknown> | undefined;
+        name: string;
+        description: string;
+    };
+}
+export { BaseChannel as B, type CreateJobOptions as C, DiscordChannel as D, EmailChannel as E, type JobStatus as J, McpChannel as M, SchedulerStore as S, TelegramChannel as T, WebhookChannel as W, type CreateJobResult as a, type DiscordChannelConfig as b, type EmailChannelConfig as c, type McpChannelConfig as d, SMSChannel as e, type SMSChannelConfig as f, ScheduledChannel as g, type ScheduledChannelConfig as h, type ScheduledJob as i, SlackChannel as j, type SlackChannelConfig as k, type TelegramChannelConfig as l, type WebhookChannelConfig as m };