npm - @toolpack-sdk/agents - Versions diffs - 2.1.0 → 2.2.0 - Mend

@toolpack-sdk/agents 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/README.md +72 -4
package/dist/channels/index.cjs +2 -2
package/dist/channels/index.d.cts +1 -1
package/dist/channels/index.d.ts +1 -1
package/dist/channels/index.js +2 -2
package/dist/eval-report-BCGixIYd.d.cts +343 -0
package/dist/eval-report-CpdAMa2b.d.ts +343 -0
package/dist/{index-Du6S0eG7.d.cts → index-DVhRtkNq.d.cts} +75 -1
package/dist/{index-o8Lbzv5N.d.ts → index-dhKoHWTy.d.ts} +75 -1
package/dist/index.cjs +40 -26
package/dist/index.d.cts +4 -3
package/dist/index.d.ts +4 -3
package/dist/index.js +40 -26
package/dist/interceptors/index.cjs +1 -1
package/dist/interceptors/index.d.cts +88 -1
package/dist/interceptors/index.d.ts +88 -1
package/dist/interceptors/index.js +1 -1
package/dist/testing/index.cjs +16 -2
package/dist/testing/index.d.cts +1 -0
package/dist/testing/index.d.ts +1 -0
package/dist/testing/index.js +16 -2
package/package.json +18 -9

package/dist/eval-report-BCGixIYd.d.cts ADDED Viewed

@@ -0,0 +1,343 @@
+import { B as BaseAgent } from './base-agent-DPdK4Pnl.cjs';
+/**
+ * Eval primitives — shared types across EvalDataset, EvalRunner, EvalScorer, and EvalReport.
+ */
+/**
+ * A single eval case: an input fed to the agent and the expected output used
+ * for scoring.
+ *
+ * Cases are collected in an `EvalDataset`, which saves and loads them as JSON.
+ */
+interface EvalCase {
+    /** Unique identifier for this case. */
+    id: string;
+    /** The input passed to `agent.invokeAgent()`. */
+    input: {
+        /** The message sent to the agent, e.g. `'What is 2 + 2?'`. Required. */
+        message: string;
+        /**
+         * Optional intent label.
+         * NOTE(review): how the agent interprets it is not visible in this declaration.
+         */
+        intent?: string;
+        /**
+         * Optional conversation identifier.
+         * NOTE(review): presumably attaches the case to a conversation — confirm.
+         */
+        conversationId?: string;
+        /** Optional free-form context; keys and values are not constrained by this type. */
+        context?: Record<string, unknown>;
+    };
+    /**
+     * The expected output used by scorers.
+     * Exact-match and contains scorers compare `actualOutput` against this.
+     * LLM-judge scorers use it as the reference answer.
+     */
+    expectedOutput: string;
+    /** Optional free-form metadata (e.g. tags, difficulty, source). */
+    metadata?: Record<string, unknown>;
+}
+/**
+ * The actual output produced by running a single eval case against an agent.
+ * Collected in `EvalRun.results`.
+ */
+interface EvalCaseResult {
+    /** The eval case that was run. */
+    evalCase: EvalCase;
+    /**
+     * The output produced by the agent.
+     * NOTE(review): the value stored here when `error` is set is not visible
+     * from this declaration — confirm before scoring errored cases.
+     */
+    actualOutput: string;
+    /** Wall-clock duration in milliseconds. */
+    durationMs: number;
+    /** Error message if the agent threw, otherwise undefined. */
+    error?: string;
+}
+/**
+ * The result of running an entire dataset through an agent.
+ * This is the value `EvalRunner.run()` resolves to and the input to
+ * `EvalScorer.score()`.
+ */
+interface EvalRun {
+    /**
+     * Identifier for this run (e.g. "v1.2", "pr-456").
+     * Can be supplied through `EvalRunnerOptions.runId`.
+     */
+    runId: string;
+    /** ISO timestamp of when the run started. */
+    startedAt: string;
+    /** ISO timestamp of when the run completed. */
+    completedAt: string;
+    /** Total wall-clock duration in milliseconds. */
+    totalDurationMs: number;
+    /** Per-case results, in dataset order. */
+    results: EvalCaseResult[];
+}
+/** The verdict for a single scored case — strictly `'pass'` or `'fail'`; the type has no third state. */
+type EvalVerdict = 'pass' | 'fail';
+/**
+ * A scored result — wraps an EvalCaseResult with a pass/fail verdict and
+ * an optional explanation.
+ *
+ * One entry of `EvalScoredRun.scoredResults`.
+ */
+interface EvalScoredResult {
+    /** The underlying case result. */
+    caseResult: EvalCaseResult;
+    /** Pass or fail. */
+    verdict: EvalVerdict;
+    /**
+     * Optional human-readable explanation of the verdict.
+     * Populated by LLMJudgeScorer; optional for other scorers.
+     */
+    explanation?: string;
+}
+/**
+ * A fully scored run — an EvalRun annotated with per-case verdicts and
+ * aggregate pass/fail counts.
+ *
+ * Returned by `EvalScorer.score()` and consumed by `compareEvalRuns()`.
+ */
+interface EvalScoredRun {
+    /** The original run. */
+    run: EvalRun;
+    /** Scored results, in run order. */
+    scoredResults: EvalScoredResult[];
+    /** Number of passing cases. */
+    passCount: number;
+    /** Number of failing cases. */
+    failCount: number;
+    /**
+     * Pass rate as a fraction between 0 and 1.
+     * NOTE(review): presumably `passCount / (passCount + failCount)`; the value
+     * for a run with zero cases is not visible from this declaration — confirm.
+     */
+    passRate: number;
+}
+/**
+ * A regression entry — a case that passed in the baseline but fails in the
+ * candidate.
+ *
+ * Listed in `EvalReport.regressions`.
+ */
+interface EvalRegression {
+    /** Identifier of the affected case. NOTE(review): presumably `EvalCase.id` — confirm. */
+    caseId: string;
+    /** Output from the baseline run. NOTE(review): presumably its `actualOutput` — confirm. */
+    baselineOutput: string;
+    /** Output from the candidate run. NOTE(review): presumably its `actualOutput` — confirm. */
+    candidateOutput: string;
+}
+/**
+ * An improvement entry — a case that failed in the baseline but passes in the
+ * candidate.
+ *
+ * Listed in `EvalReport.improvements`.
+ */
+interface EvalImprovement {
+    /** Identifier of the affected case. NOTE(review): presumably `EvalCase.id` — confirm. */
+    caseId: string;
+    /** Output from the baseline run. NOTE(review): presumably its `actualOutput` — confirm. */
+    baselineOutput: string;
+    /** Output from the candidate run. NOTE(review): presumably its `actualOutput` — confirm. */
+    candidateOutput: string;
+}
+/**
+ * Comparison report between a baseline scored run and a candidate scored run.
+ *
+ * Produced by `compareEvalRuns()` and rendered by `formatEvalReport()`.
+ */
+interface EvalReport {
+    /** Run identifier of the baseline. NOTE(review): presumably `baseline.run.runId` — confirm. */
+    baselineRunId: string;
+    /** Run identifier of the candidate. NOTE(review): presumably `candidate.run.runId` — confirm. */
+    candidateRunId: string;
+    /** Pass rate of the baseline run. NOTE(review): presumably on the same 0–1 scale as `EvalScoredRun.passRate`. */
+    baselinePassRate: number;
+    /** Pass rate of the candidate run. NOTE(review): presumably on the same 0–1 scale as `EvalScoredRun.passRate`. */
+    candidatePassRate: number;
+    /** Δ pass rate (candidate − baseline). Positive = improvement. */
+    delta: number;
+    /** Cases that passed in the baseline but fail in the candidate. */
+    regressions: EvalRegression[];
+    /** Cases that failed in the baseline but pass in the candidate. */
+    improvements: EvalImprovement[];
+    /** Cases that passed in both runs. NOTE(review): presumably case IDs — confirm. */
+    stablePasses: string[];
+    /** Cases that failed in both runs. NOTE(review): presumably case IDs — confirm. */
+    stableFails: string[];
+}
+/**
+ * A collection of eval cases that can be loaded from / saved to JSON.
+ *
+ * @example
+ * ```ts
+ * const dataset = new EvalDataset([
+ *   {
+ *     id: 'q1',
+ *     input: { message: 'What is 2 + 2?' },
+ *     expectedOutput: '4',
+ *   },
+ * ]);
+ *
+ * dataset.save('./evals/math.json');
+ *
+ * const loaded = EvalDataset.load('./evals/math.json');
+ * ```
+ */
+declare class EvalDataset {
+    /** Internal case storage; its type is erased in this generated declaration. */
+    private _cases;
+    /**
+     * @param cases Optional initial cases.
+     *   NOTE(review): presumably an empty dataset when omitted — confirm.
+     */
+    constructor(cases?: EvalCase[]);
+    /**
+     * All cases in the dataset.
+     * NOTE(review): whether this is a copy or the internal array is not visible
+     * here — avoid mutating the returned array.
+     */
+    get cases(): EvalCase[];
+    /** Number of cases. */
+    get size(): number;
+    /**
+     * Get a case by ID.
+     * Returns `undefined` if not found.
+     */
+    get(id: string): EvalCase | undefined;
+    /**
+     * Add one or more cases.
+     * Throws if a case with the same ID already exists.
+     *
+     * @returns `this`, so calls can be chained.
+     */
+    add(...cases: EvalCase[]): this;
+    /**
+     * Remove a case by ID.
+     * Returns `true` if removed, `false` if not found.
+     */
+    remove(id: string): boolean;
+    /**
+     * Filter cases by a predicate. Returns a new EvalDataset.
+     *
+     * @param predicate Called with a case; return `true` to keep it.
+     */
+    filter(predicate: (c: EvalCase) => boolean): EvalDataset;
+    /**
+     * Serialize to a plain array (suitable for `JSON.stringify`).
+     */
+    toJSON(): EvalCase[];
+    /**
+     * Save cases to a JSON file.
+     * Declared as returning `void` rather than a Promise.
+     *
+     * @param filePath Absolute or relative path to the output file.
+     */
+    save(filePath: string): void;
+    /**
+     * Load cases from a JSON file.
+     * The file must contain a JSON array of `EvalCase` objects.
+     * NOTE(review): whether the parsed content is validated against `EvalCase`
+     * is not visible from this declaration — confirm.
+     *
+     * @param filePath Absolute or relative path to the JSON file.
+     */
+    static load(filePath: string): EvalDataset;
+    /**
+     * Create an `EvalDataset` from a plain array (e.g. from a database query).
+     *
+     * @param cases The cases to wrap.
+     */
+    static from(cases: EvalCase[]): EvalDataset;
+}
+/**
+ * Options accepted by `EvalRunner.run()`. Every field is optional.
+ */
+interface EvalRunnerOptions {
+    /**
+     * Identifier for this run — use something meaningful like a version or PR number.
+     * Defaults to a timestamp string.
+     */
+    runId?: string;
+    /**
+     * Concurrency limit — how many cases to run in parallel.
+     * Defaults to 1 (sequential) to avoid overwhelming the provider.
+     */
+    concurrency?: number;
+}
+/**
+ * Runs an agent against every case in an `EvalDataset` and collects the
+ * results into an `EvalRun`.
+ *
+ * @example
+ * ```ts
+ * const runner = new EvalRunner(agent);
+ * const run = await runner.run(dataset, { runId: 'v1.2' });
+ *
+ * console.log(`${run.results.length} cases run in ${run.totalDurationMs}ms`);
+ * ```
+ */
+declare class EvalRunner {
+    /** The agent under evaluation (presumably the constructor argument). */
+    private agent;
+    /**
+     * @param agent The agent whose outputs will be evaluated.
+     */
+    constructor(agent: BaseAgent);
+    /**
+     * Run all cases in the dataset and return an `EvalRun`.
+     *
+     * @param dataset The cases to run.
+     * @param options Optional run ID and concurrency limit.
+     * @returns The collected run, with per-case results in dataset order.
+     */
+    run(dataset: EvalDataset, options?: EvalRunnerOptions): Promise<EvalRun>;
+}
+/**
+ * A scorer evaluates each `EvalCaseResult` in a run and produces a
+ * pass/fail verdict with an optional explanation.
+ *
+ * Implement this interface to create custom scoring logic.
+ */
+interface EvalScorer {
+    /**
+     * Score a run.
+     *
+     * @param run The unscored run to evaluate.
+     * @returns The run annotated with per-case verdicts and aggregate counts.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Passes a case when `actualOutput` exactly equals `expectedOutput`.
+ * Optionally case-insensitive and/or trimmed.
+ *
+ * @example
+ * ```ts
+ * const scorer = new ExactMatchScorer({ trim: true, caseInsensitive: true });
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class ExactMatchScorer implements EvalScorer {
+    /** Trimming flag (presumably taken from `options.trim`). */
+    private trim;
+    /** Case-insensitivity flag (presumably taken from `options.caseInsensitive`). */
+    private caseInsensitive;
+    /**
+     * @param options Comparison options; both flags are optional.
+     *   NOTE(review): the defaults used when a flag is omitted are not visible
+     *   in this declaration — confirm.
+     */
+    constructor(options?: {
+        trim?: boolean;
+        caseInsensitive?: boolean;
+    });
+    /**
+     * Score every result in `run` by exact comparison.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Passes a case when `actualOutput` contains `expectedOutput` as a substring.
+ * Optionally case-insensitive.
+ *
+ * @example
+ * ```ts
+ * const scorer = new ContainsScorer({ caseInsensitive: true });
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class ContainsScorer implements EvalScorer {
+    /** Case-insensitivity flag (presumably taken from `options.caseInsensitive`). */
+    private caseInsensitive;
+    /**
+     * @param options Matching options.
+     *   NOTE(review): the default for `caseInsensitive` is not visible in this
+     *   declaration — confirm.
+     */
+    constructor(options?: {
+        caseInsensitive?: boolean;
+    });
+    /**
+     * Score every result in `run` by substring containment.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Options accepted by the `LLMJudgeScorer` constructor.
+ */
+interface LLMJudgeScorerOptions {
+    /**
+     * Custom judge prompt template.
+     * Use `{{question}}`, `{{expected}}`, and `{{actual}}` as placeholders.
+     * Must instruct the LLM to respond with only "pass" or "fail" on the first line,
+     * optionally followed by an explanation.
+     */
+    promptTemplate?: string;
+}
+/**
+ * Uses an LLM agent as a judge to score each case.
+ * The judge is prompted with the question, expected answer, and actual answer.
+ *
+ * @example
+ * ```ts
+ * const judgeAgent = new MyAgent({ toolpack });
+ * const scorer = new LLMJudgeScorer(judgeAgent);
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class LLMJudgeScorer implements EvalScorer {
+    /** The agent used as judge (presumably the constructor argument). */
+    private judgeAgent;
+    /** The prompt template in effect (presumably `options.promptTemplate` or a built-in default). */
+    private promptTemplate;
+    /**
+     * @param judgeAgent The agent that will be asked to judge each case.
+     * @param options Optional settings, currently only a custom `promptTemplate`.
+     */
+    constructor(judgeAgent: BaseAgent, options?: LLMJudgeScorerOptions);
+    /**
+     * Score every result in `run` by asking the judge agent.
+     * NOTE(review): how a judge reply that is neither "pass" nor "fail" is
+     * handled is not visible from this declaration — confirm.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Wraps a user-supplied scoring function.
+ *
+ * @example
+ * ```ts
+ * const scorer = new CustomScorer(async (result) => {
+ *   const pass = result.actualOutput.includes('Paris');
+ *   return { verdict: pass ? 'pass' : 'fail' };
+ * });
+ * ```
+ */
+declare class CustomScorer implements EvalScorer {
+    /** The wrapped scoring function (presumably the constructor argument). */
+    private fn;
+    /**
+     * @param fn Scoring function. Receives an `EvalCaseResult` and resolves to a
+     *   verdict plus an optional explanation. The declared type requires a
+     *   Promise, so a synchronous function must be wrapped in `async`.
+     */
+    constructor(fn: (result: EvalCaseResult) => Promise<{
+        verdict: EvalVerdict;
+        explanation?: string;
+    }>);
+    /**
+     * Score every result in `run` using the wrapped function.
+     * NOTE(review): what happens when `fn` rejects is not visible from this
+     * declaration — confirm.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Compares two scored runs and produces a regression/improvement report.
+ *
+ * NOTE(review): how cases present in only one of the two runs are treated is
+ * not visible from this declaration — confirm.
+ *
+ * @param baseline The scored run to compare against.
+ * @param candidate The scored run being evaluated.
+ * @returns An `EvalReport` with pass rates, their delta, regressions,
+ *   improvements, and the cases that were stable across both runs.
+ *
+ * @example
+ * ```ts
+ * const report = compareEvalRuns(baselineScoredRun, candidateScoredRun);
+ *
+ * if (report.regressions.length > 0) {
+ *   console.error('Regressions detected:', report.regressions);
+ *   process.exit(1);
+ * }
+ *
+ * console.log(`Pass rate: ${report.baselinePassRate} → ${report.candidatePassRate} (Δ${report.delta > 0 ? '+' : ''}${report.delta.toFixed(2)})`);
+ * ```
+ */
+declare function compareEvalRuns(baseline: EvalScoredRun, candidate: EvalScoredRun): EvalReport;
+/**
+ * Format an `EvalReport` as a human-readable summary string.
+ *
+ * @param report The report to format, e.g. the result of `compareEvalRuns()`.
+ * @returns The summary text; the caller decides where to print it.
+ *
+ * @example
+ * ```ts
+ * console.log(formatEvalReport(report));
+ * ```
+ */
+declare function formatEvalReport(report: EvalReport): string;
+export { ContainsScorer as C, type EvalCase as E, LLMJudgeScorer as L, CustomScorer as a, type EvalCaseResult as b, EvalDataset as c, type EvalImprovement as d, type EvalRegression as e, type EvalReport as f, type EvalRun as g, EvalRunner as h, type EvalRunnerOptions as i, type EvalScoredResult as j, type EvalScoredRun as k, type EvalScorer as l, type EvalVerdict as m, ExactMatchScorer as n, type LLMJudgeScorerOptions as o, compareEvalRuns as p, formatEvalReport as q };

package/dist/eval-report-CpdAMa2b.d.ts ADDED Viewed

@@ -0,0 +1,343 @@
+import { B as BaseAgent } from './base-agent-nU8pr4nu.js';
+/**
+ * Eval primitives — shared types across EvalDataset, EvalRunner, EvalScorer, and EvalReport.
+ */
+/**
+ * A single eval case: an input fed to the agent and the expected output used
+ * for scoring.
+ *
+ * Cases are collected in an `EvalDataset`, which saves and loads them as JSON.
+ */
+interface EvalCase {
+    /** Unique identifier for this case. */
+    id: string;
+    /** The input passed to `agent.invokeAgent()`. */
+    input: {
+        /** The message sent to the agent, e.g. `'What is 2 + 2?'`. Required. */
+        message: string;
+        /**
+         * Optional intent label.
+         * NOTE(review): how the agent interprets it is not visible in this declaration.
+         */
+        intent?: string;
+        /**
+         * Optional conversation identifier.
+         * NOTE(review): presumably attaches the case to a conversation — confirm.
+         */
+        conversationId?: string;
+        /** Optional free-form context; keys and values are not constrained by this type. */
+        context?: Record<string, unknown>;
+    };
+    /**
+     * The expected output used by scorers.
+     * Exact-match and contains scorers compare `actualOutput` against this.
+     * LLM-judge scorers use it as the reference answer.
+     */
+    expectedOutput: string;
+    /** Optional free-form metadata (e.g. tags, difficulty, source). */
+    metadata?: Record<string, unknown>;
+}
+/**
+ * The actual output produced by running a single eval case against an agent.
+ * Collected in `EvalRun.results`.
+ */
+interface EvalCaseResult {
+    /** The eval case that was run. */
+    evalCase: EvalCase;
+    /**
+     * The output produced by the agent.
+     * NOTE(review): the value stored here when `error` is set is not visible
+     * from this declaration — confirm before scoring errored cases.
+     */
+    actualOutput: string;
+    /** Wall-clock duration in milliseconds. */
+    durationMs: number;
+    /** Error message if the agent threw, otherwise undefined. */
+    error?: string;
+}
+/**
+ * The result of running an entire dataset through an agent.
+ * This is the value `EvalRunner.run()` resolves to and the input to
+ * `EvalScorer.score()`.
+ */
+interface EvalRun {
+    /**
+     * Identifier for this run (e.g. "v1.2", "pr-456").
+     * Can be supplied through `EvalRunnerOptions.runId`.
+     */
+    runId: string;
+    /** ISO timestamp of when the run started. */
+    startedAt: string;
+    /** ISO timestamp of when the run completed. */
+    completedAt: string;
+    /** Total wall-clock duration in milliseconds. */
+    totalDurationMs: number;
+    /** Per-case results, in dataset order. */
+    results: EvalCaseResult[];
+}
+/** The verdict for a single scored case — strictly `'pass'` or `'fail'`; the type has no third state. */
+type EvalVerdict = 'pass' | 'fail';
+/**
+ * A scored result — wraps an EvalCaseResult with a pass/fail verdict and
+ * an optional explanation.
+ *
+ * One entry of `EvalScoredRun.scoredResults`.
+ */
+interface EvalScoredResult {
+    /** The underlying case result. */
+    caseResult: EvalCaseResult;
+    /** Pass or fail. */
+    verdict: EvalVerdict;
+    /**
+     * Optional human-readable explanation of the verdict.
+     * Populated by LLMJudgeScorer; optional for other scorers.
+     */
+    explanation?: string;
+}
+/**
+ * A fully scored run — an EvalRun annotated with per-case verdicts and
+ * aggregate pass/fail counts.
+ *
+ * Returned by `EvalScorer.score()` and consumed by `compareEvalRuns()`.
+ */
+interface EvalScoredRun {
+    /** The original run. */
+    run: EvalRun;
+    /** Scored results, in run order. */
+    scoredResults: EvalScoredResult[];
+    /** Number of passing cases. */
+    passCount: number;
+    /** Number of failing cases. */
+    failCount: number;
+    /**
+     * Pass rate as a fraction between 0 and 1.
+     * NOTE(review): presumably `passCount / (passCount + failCount)`; the value
+     * for a run with zero cases is not visible from this declaration — confirm.
+     */
+    passRate: number;
+}
+/**
+ * A regression entry — a case that passed in the baseline but fails in the
+ * candidate.
+ *
+ * Listed in `EvalReport.regressions`.
+ */
+interface EvalRegression {
+    /** Identifier of the affected case. NOTE(review): presumably `EvalCase.id` — confirm. */
+    caseId: string;
+    /** Output from the baseline run. NOTE(review): presumably its `actualOutput` — confirm. */
+    baselineOutput: string;
+    /** Output from the candidate run. NOTE(review): presumably its `actualOutput` — confirm. */
+    candidateOutput: string;
+}
+/**
+ * An improvement entry — a case that failed in the baseline but passes in the
+ * candidate.
+ *
+ * Listed in `EvalReport.improvements`.
+ */
+interface EvalImprovement {
+    /** Identifier of the affected case. NOTE(review): presumably `EvalCase.id` — confirm. */
+    caseId: string;
+    /** Output from the baseline run. NOTE(review): presumably its `actualOutput` — confirm. */
+    baselineOutput: string;
+    /** Output from the candidate run. NOTE(review): presumably its `actualOutput` — confirm. */
+    candidateOutput: string;
+}
+/**
+ * Comparison report between a baseline scored run and a candidate scored run.
+ *
+ * Produced by `compareEvalRuns()` and rendered by `formatEvalReport()`.
+ */
+interface EvalReport {
+    /** Run identifier of the baseline. NOTE(review): presumably `baseline.run.runId` — confirm. */
+    baselineRunId: string;
+    /** Run identifier of the candidate. NOTE(review): presumably `candidate.run.runId` — confirm. */
+    candidateRunId: string;
+    /** Pass rate of the baseline run. NOTE(review): presumably on the same 0–1 scale as `EvalScoredRun.passRate`. */
+    baselinePassRate: number;
+    /** Pass rate of the candidate run. NOTE(review): presumably on the same 0–1 scale as `EvalScoredRun.passRate`. */
+    candidatePassRate: number;
+    /** Δ pass rate (candidate − baseline). Positive = improvement. */
+    delta: number;
+    /** Cases that passed in the baseline but fail in the candidate. */
+    regressions: EvalRegression[];
+    /** Cases that failed in the baseline but pass in the candidate. */
+    improvements: EvalImprovement[];
+    /** Cases that passed in both runs. NOTE(review): presumably case IDs — confirm. */
+    stablePasses: string[];
+    /** Cases that failed in both runs. NOTE(review): presumably case IDs — confirm. */
+    stableFails: string[];
+}
+/**
+ * A collection of eval cases that can be loaded from / saved to JSON.
+ *
+ * @example
+ * ```ts
+ * const dataset = new EvalDataset([
+ *   {
+ *     id: 'q1',
+ *     input: { message: 'What is 2 + 2?' },
+ *     expectedOutput: '4',
+ *   },
+ * ]);
+ *
+ * dataset.save('./evals/math.json');
+ *
+ * const loaded = EvalDataset.load('./evals/math.json');
+ * ```
+ */
+declare class EvalDataset {
+    /** Internal case storage; its type is erased in this generated declaration. */
+    private _cases;
+    /**
+     * @param cases Optional initial cases.
+     *   NOTE(review): presumably an empty dataset when omitted — confirm.
+     */
+    constructor(cases?: EvalCase[]);
+    /**
+     * All cases in the dataset.
+     * NOTE(review): whether this is a copy or the internal array is not visible
+     * here — avoid mutating the returned array.
+     */
+    get cases(): EvalCase[];
+    /** Number of cases. */
+    get size(): number;
+    /**
+     * Get a case by ID.
+     * Returns `undefined` if not found.
+     */
+    get(id: string): EvalCase | undefined;
+    /**
+     * Add one or more cases.
+     * Throws if a case with the same ID already exists.
+     *
+     * @returns `this`, so calls can be chained.
+     */
+    add(...cases: EvalCase[]): this;
+    /**
+     * Remove a case by ID.
+     * Returns `true` if removed, `false` if not found.
+     */
+    remove(id: string): boolean;
+    /**
+     * Filter cases by a predicate. Returns a new EvalDataset.
+     *
+     * @param predicate Called with a case; return `true` to keep it.
+     */
+    filter(predicate: (c: EvalCase) => boolean): EvalDataset;
+    /**
+     * Serialize to a plain array (suitable for `JSON.stringify`).
+     */
+    toJSON(): EvalCase[];
+    /**
+     * Save cases to a JSON file.
+     * Declared as returning `void` rather than a Promise.
+     *
+     * @param filePath Absolute or relative path to the output file.
+     */
+    save(filePath: string): void;
+    /**
+     * Load cases from a JSON file.
+     * The file must contain a JSON array of `EvalCase` objects.
+     * NOTE(review): whether the parsed content is validated against `EvalCase`
+     * is not visible from this declaration — confirm.
+     *
+     * @param filePath Absolute or relative path to the JSON file.
+     */
+    static load(filePath: string): EvalDataset;
+    /**
+     * Create an `EvalDataset` from a plain array (e.g. from a database query).
+     *
+     * @param cases The cases to wrap.
+     */
+    static from(cases: EvalCase[]): EvalDataset;
+}
+/**
+ * Options accepted by `EvalRunner.run()`. Every field is optional.
+ */
+interface EvalRunnerOptions {
+    /**
+     * Identifier for this run — use something meaningful like a version or PR number.
+     * Defaults to a timestamp string.
+     */
+    runId?: string;
+    /**
+     * Concurrency limit — how many cases to run in parallel.
+     * Defaults to 1 (sequential) to avoid overwhelming the provider.
+     */
+    concurrency?: number;
+}
+/**
+ * Runs an agent against every case in an `EvalDataset` and collects the
+ * results into an `EvalRun`.
+ *
+ * @example
+ * ```ts
+ * const runner = new EvalRunner(agent);
+ * const run = await runner.run(dataset, { runId: 'v1.2' });
+ *
+ * console.log(`${run.results.length} cases run in ${run.totalDurationMs}ms`);
+ * ```
+ */
+declare class EvalRunner {
+    /** The agent under evaluation (presumably the constructor argument). */
+    private agent;
+    /**
+     * @param agent The agent whose outputs will be evaluated.
+     */
+    constructor(agent: BaseAgent);
+    /**
+     * Run all cases in the dataset and return an `EvalRun`.
+     *
+     * @param dataset The cases to run.
+     * @param options Optional run ID and concurrency limit.
+     * @returns The collected run, with per-case results in dataset order.
+     */
+    run(dataset: EvalDataset, options?: EvalRunnerOptions): Promise<EvalRun>;
+}
+/**
+ * A scorer evaluates each `EvalCaseResult` in a run and produces a
+ * pass/fail verdict with an optional explanation.
+ *
+ * Implement this interface to create custom scoring logic.
+ */
+interface EvalScorer {
+    /**
+     * Score a run.
+     *
+     * @param run The unscored run to evaluate.
+     * @returns The run annotated with per-case verdicts and aggregate counts.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Passes a case when `actualOutput` exactly equals `expectedOutput`.
+ * Optionally case-insensitive and/or trimmed.
+ *
+ * @example
+ * ```ts
+ * const scorer = new ExactMatchScorer({ trim: true, caseInsensitive: true });
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class ExactMatchScorer implements EvalScorer {
+    /** Trimming flag (presumably taken from `options.trim`). */
+    private trim;
+    /** Case-insensitivity flag (presumably taken from `options.caseInsensitive`). */
+    private caseInsensitive;
+    /**
+     * @param options Comparison options; both flags are optional.
+     *   NOTE(review): the defaults used when a flag is omitted are not visible
+     *   in this declaration — confirm.
+     */
+    constructor(options?: {
+        trim?: boolean;
+        caseInsensitive?: boolean;
+    });
+    /**
+     * Score every result in `run` by exact comparison.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Passes a case when `actualOutput` contains `expectedOutput` as a substring.
+ * Optionally case-insensitive.
+ *
+ * @example
+ * ```ts
+ * const scorer = new ContainsScorer({ caseInsensitive: true });
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class ContainsScorer implements EvalScorer {
+    /** Case-insensitivity flag (presumably taken from `options.caseInsensitive`). */
+    private caseInsensitive;
+    /**
+     * @param options Matching options.
+     *   NOTE(review): the default for `caseInsensitive` is not visible in this
+     *   declaration — confirm.
+     */
+    constructor(options?: {
+        caseInsensitive?: boolean;
+    });
+    /**
+     * Score every result in `run` by substring containment.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Options accepted by the `LLMJudgeScorer` constructor.
+ */
+interface LLMJudgeScorerOptions {
+    /**
+     * Custom judge prompt template.
+     * Use `{{question}}`, `{{expected}}`, and `{{actual}}` as placeholders.
+     * Must instruct the LLM to respond with only "pass" or "fail" on the first line,
+     * optionally followed by an explanation.
+     */
+    promptTemplate?: string;
+}
+/**
+ * Uses an LLM agent as a judge to score each case.
+ * The judge is prompted with the question, expected answer, and actual answer.
+ *
+ * @example
+ * ```ts
+ * const judgeAgent = new MyAgent({ toolpack });
+ * const scorer = new LLMJudgeScorer(judgeAgent);
+ * const scored = await scorer.score(run);
+ * ```
+ */
+declare class LLMJudgeScorer implements EvalScorer {
+    /** The agent used as judge (presumably the constructor argument). */
+    private judgeAgent;
+    /** The prompt template in effect (presumably `options.promptTemplate` or a built-in default). */
+    private promptTemplate;
+    /**
+     * @param judgeAgent The agent that will be asked to judge each case.
+     * @param options Optional settings, currently only a custom `promptTemplate`.
+     */
+    constructor(judgeAgent: BaseAgent, options?: LLMJudgeScorerOptions);
+    /**
+     * Score every result in `run` by asking the judge agent.
+     * NOTE(review): how a judge reply that is neither "pass" nor "fail" is
+     * handled is not visible from this declaration — confirm.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Wraps a user-supplied scoring function.
+ *
+ * @example
+ * ```ts
+ * const scorer = new CustomScorer(async (result) => {
+ *   const pass = result.actualOutput.includes('Paris');
+ *   return { verdict: pass ? 'pass' : 'fail' };
+ * });
+ * ```
+ */
+declare class CustomScorer implements EvalScorer {
+    /** The wrapped scoring function (presumably the constructor argument). */
+    private fn;
+    /**
+     * @param fn Scoring function. Receives an `EvalCaseResult` and resolves to a
+     *   verdict plus an optional explanation. The declared type requires a
+     *   Promise, so a synchronous function must be wrapped in `async`.
+     */
+    constructor(fn: (result: EvalCaseResult) => Promise<{
+        verdict: EvalVerdict;
+        explanation?: string;
+    }>);
+    /**
+     * Score every result in `run` using the wrapped function.
+     * NOTE(review): what happens when `fn` rejects is not visible from this
+     * declaration — confirm.
+     *
+     * @param run The unscored run.
+     * @returns The scored run.
+     */
+    score(run: EvalRun): Promise<EvalScoredRun>;
+}
+/**
+ * Compares two scored runs and produces a regression/improvement report.
+ *
+ * NOTE(review): how cases present in only one of the two runs are treated is
+ * not visible from this declaration — confirm.
+ *
+ * @param baseline The scored run to compare against.
+ * @param candidate The scored run being evaluated.
+ * @returns An `EvalReport` with pass rates, their delta, regressions,
+ *   improvements, and the cases that were stable across both runs.
+ *
+ * @example
+ * ```ts
+ * const report = compareEvalRuns(baselineScoredRun, candidateScoredRun);
+ *
+ * if (report.regressions.length > 0) {
+ *   console.error('Regressions detected:', report.regressions);
+ *   process.exit(1);
+ * }
+ *
+ * console.log(`Pass rate: ${report.baselinePassRate} → ${report.candidatePassRate} (Δ${report.delta > 0 ? '+' : ''}${report.delta.toFixed(2)})`);
+ * ```
+ */
+declare function compareEvalRuns(baseline: EvalScoredRun, candidate: EvalScoredRun): EvalReport;
+/**
+ * Format an `EvalReport` as a human-readable summary string.
+ *
+ * @param report The report to format, e.g. the result of `compareEvalRuns()`.
+ * @returns The summary text; the caller decides where to print it.
+ *
+ * @example
+ * ```ts
+ * console.log(formatEvalReport(report));
+ * ```
+ */
+declare function formatEvalReport(report: EvalReport): string;
+export { ContainsScorer as C, type EvalCase as E, LLMJudgeScorer as L, CustomScorer as a, type EvalCaseResult as b, EvalDataset as c, type EvalImprovement as d, type EvalRegression as e, type EvalReport as f, type EvalRun as g, EvalRunner as h, type EvalRunnerOptions as i, type EvalScoredResult as j, type EvalScoredRun as k, type EvalScorer as l, type EvalVerdict as m, ExactMatchScorer as n, type LLMJudgeScorerOptions as o, compareEvalRuns as p, formatEvalReport as q };