npm - langsmith - Versions diffs - 0.1.23 → 0.1.24 - Mend

langsmith 0.1.23 → 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/dist/client.cjs +46 -5
package/dist/client.d.ts +17 -3
package/dist/client.js +46 -5
package/dist/evaluation/_runner.cjs +25 -7
package/dist/evaluation/_runner.d.ts +4 -4
package/dist/evaluation/_runner.js +25 -7
package/dist/evaluation/evaluate_comparative.cjs +202 -0
package/dist/evaluation/evaluate_comparative.d.ts +51 -0
package/dist/evaluation/evaluate_comparative.js +195 -0
package/dist/evaluation/evaluator.cjs +10 -13
package/dist/evaluation/evaluator.js +11 -14
package/dist/evaluation/index.cjs +3 -1
package/dist/evaluation/index.d.ts +1 -0
package/dist/evaluation/index.js +1 -0
package/dist/index.cjs +1 -1
package/dist/index.d.ts +1 -1
package/dist/index.js +1 -1
package/dist/run_trees.cjs +1 -0
package/dist/run_trees.js +1 -0
package/dist/schemas.d.ts +17 -0
package/dist/traceable.cjs +209 -23
package/dist/traceable.js +206 -20
package/dist/utils/shuffle.cjs +15 -0
package/dist/utils/shuffle.d.ts +1 -0
package/dist/utils/shuffle.js +11 -0
package/package.json +1 -1

package/dist/client.cjs CHANGED Viewed

@@ -304,8 +304,8 @@ class Client {
             return this.webUrl;
         }
         else if (isLocalhost(this.apiUrl)) {
-            this.webUrl = "http://localhost";
-            return "http://localhost";
+            this.webUrl = "http://localhost:3000";
+            return this.webUrl;
         }
         else if (this.apiUrl.includes("/api") &&
             !this.apiUrl.split(".", 1)[0].endsWith("api")) {
@@ -314,11 +314,11 @@ class Client {
         }
         else if (this.apiUrl.split(".", 1)[0].includes("dev")) {
             this.webUrl = "https://dev.smith.langchain.com";
-            return "https://dev.smith.langchain.com";
+            return this.webUrl;
         }
         else {
             this.webUrl = "https://smith.langchain.com";
-            return "https://smith.langchain.com";
+            return this.webUrl;
         }
     }
     get headers() {
@@ -1162,6 +1162,14 @@ class Client {
         }
         return result;
     }
+    async getProjectUrl({ projectId, projectName, }) {
+        if (projectId === undefined && projectName === undefined) {
+            throw new Error("Must provide either projectName or projectId");
+        }
+        const project = await this.readProject({ projectId, projectName });
+        const tenantId = await this._getTenantId();
+        return `${this.getHostUrl()}/o/${tenantId}/projects/p/${project.id}`;
+    }
     async _getTenantId() {
         if (this._tenantId !== null) {
             return this._tenantId;
@@ -1609,7 +1617,7 @@ class Client {
             sourceRunId: feedbackResult?.sourceRunId,
         });
     }
-    async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, }) {
+    async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }) {
         if (!runId && !projectId) {
             throw new Error("One of runId or projectId must be provided");
         }
@@ -1638,6 +1646,7 @@ class Client {
             correction,
             comment,
             feedback_source: feedback_source,
+            comparative_experiment_id: comparativeExperimentId,
             feedbackConfig,
             session_id: projectId,
         };
@@ -1759,6 +1768,38 @@ class Client {
         const result = await response.json();
         return result;
     }
+    async createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }) {
+        if (experimentIds.length === 0) {
+            throw new Error("At least one experiment is required");
+        }
+        if (!referenceDatasetId) {
+            referenceDatasetId = (await this.readProject({
+                projectId: experimentIds[0],
+            })).reference_dataset_id;
+        }
+        if (!referenceDatasetId == null) {
+            throw new Error("A reference dataset is required");
+        }
+        const body = {
+            id,
+            name,
+            experiment_ids: experimentIds,
+            reference_dataset_id: referenceDatasetId,
+            description,
+            created_at: (createdAt ?? new Date())?.toISOString(),
+            extra: {},
+        };
+        if (metadata)
+            body.extra["metadata"] = metadata;
+        const response = await this.caller.call(fetch, `${this.apiUrl}/datasets/comparative`, {
+            method: "POST",
+            headers: { ...this.headers, "Content-Type": "application/json" },
+            body: JSON.stringify(body),
+            signal: AbortSignal.timeout(this.timeout_ms),
+            ...this.fetchOptions,
+        });
+        return await response.json();
+    }
     /**
      * Retrieves a list of presigned feedback tokens for a given run ID.
      * @param runId The ID of the run.

package/dist/client.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { AsyncCallerParams } from "./utils/async_caller.js";
-import { DataType, Dataset, DatasetDiffInfo, DatasetShareSchema, Example, ExampleUpdate, Feedback, FeedbackConfig, FeedbackIngestToken, KVMap, LangChainBaseMessage, Run, RunCreate, RunUpdate, ScoreType, TimeDelta, TracerSession, TracerSessionResult, ValueType } from "./schemas.js";
+import { ComparativeExperiment, DataType, Dataset, DatasetDiffInfo, DatasetShareSchema, Example, ExampleUpdate, Feedback, FeedbackConfig, FeedbackIngestToken, KVMap, LangChainBaseMessage, Run, RunCreate, RunUpdate, ScoreType, TimeDelta, TracerSession, TracerSessionResult, ValueType } from "./schemas.js";
 import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluation/evaluator.js";
 interface ClientConfig {
     apiUrl?: string;
@@ -182,7 +182,7 @@ export declare class Client {
         hideInputs?: boolean;
         hideOutputs?: boolean;
     };
-    private getHostUrl;
+    getHostUrl(): string;
     private get headers();
     private processInputs;
     private processOutputs;
@@ -334,6 +334,10 @@ export declare class Client {
         projectName?: string;
         includeStats?: boolean;
     }): Promise<TracerSessionResult>;
+    getProjectUrl({ projectId, projectName, }: {
+        projectId?: string;
+        projectName?: string;
+    }): Promise<string>;
     private _getTenantId;
     listProjects({ projectIds, name, nameContains, referenceDatasetId, referenceDatasetName, referenceFree, }?: {
         projectIds?: string[];
@@ -409,7 +413,7 @@ export declare class Client {
         loadChildRuns: boolean;
         referenceExample?: Example;
     }): Promise<Feedback>;
-    createFeedback(runId: string | null, key: string, { score, value, correction, comment, sourceInfo, feedbackSourceType, sourceRunId, feedbackId, feedbackConfig, projectId, }: {
+    createFeedback(runId: string | null, key: string, { score, value, correction, comment, sourceInfo, feedbackSourceType, sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }: {
         score?: ScoreType;
         value?: ValueType;
         correction?: object;
@@ -421,6 +425,7 @@ export declare class Client {
         feedbackId?: string;
         eager?: boolean;
         projectId?: string;
+        comparativeExperimentId?: string;
     }): Promise<Feedback>;
     updateFeedback(feedbackId: string, { score, value, correction, comment, }: {
         score?: number | boolean | null;
@@ -454,6 +459,15 @@ export declare class Client {
         expiration?: string | TimeDelta;
         feedbackConfig?: FeedbackConfig;
     }): Promise<FeedbackIngestToken>;
+    createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }: {
+        name: string;
+        experimentIds: Array<string>;
+        referenceDatasetId?: string;
+        createdAt?: Date;
+        description?: string;
+        metadata?: Record<string, unknown>;
+        id?: string;
+    }): Promise<ComparativeExperiment>;
     /**
      * Retrieves a list of presigned feedback tokens for a given run ID.
      * @param runId The ID of the run.

package/dist/client.js CHANGED Viewed

@@ -277,8 +277,8 @@ export class Client {
             return this.webUrl;
         }
         else if (isLocalhost(this.apiUrl)) {
-            this.webUrl = "http://localhost";
-            return "http://localhost";
+            this.webUrl = "http://localhost:3000";
+            return this.webUrl;
         }
         else if (this.apiUrl.includes("/api") &&
             !this.apiUrl.split(".", 1)[0].endsWith("api")) {
@@ -287,11 +287,11 @@ export class Client {
         }
         else if (this.apiUrl.split(".", 1)[0].includes("dev")) {
             this.webUrl = "https://dev.smith.langchain.com";
-            return "https://dev.smith.langchain.com";
+            return this.webUrl;
         }
         else {
             this.webUrl = "https://smith.langchain.com";
-            return "https://smith.langchain.com";
+            return this.webUrl;
         }
     }
     get headers() {
@@ -1135,6 +1135,14 @@ export class Client {
         }
         return result;
     }
+    async getProjectUrl({ projectId, projectName, }) {
+        if (projectId === undefined && projectName === undefined) {
+            throw new Error("Must provide either projectName or projectId");
+        }
+        const project = await this.readProject({ projectId, projectName });
+        const tenantId = await this._getTenantId();
+        return `${this.getHostUrl()}/o/${tenantId}/projects/p/${project.id}`;
+    }
     async _getTenantId() {
         if (this._tenantId !== null) {
             return this._tenantId;
@@ -1582,7 +1590,7 @@ export class Client {
             sourceRunId: feedbackResult?.sourceRunId,
         });
     }
-    async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, }) {
+    async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }) {
         if (!runId && !projectId) {
             throw new Error("One of runId or projectId must be provided");
         }
@@ -1611,6 +1619,7 @@ export class Client {
             correction,
             comment,
             feedback_source: feedback_source,
+            comparative_experiment_id: comparativeExperimentId,
             feedbackConfig,
             session_id: projectId,
         };
@@ -1732,6 +1741,38 @@ export class Client {
         const result = await response.json();
         return result;
     }
+    async createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }) {
+        if (experimentIds.length === 0) {
+            throw new Error("At least one experiment is required");
+        }
+        if (!referenceDatasetId) {
+            referenceDatasetId = (await this.readProject({
+                projectId: experimentIds[0],
+            })).reference_dataset_id;
+        }
+        if (!referenceDatasetId == null) {
+            throw new Error("A reference dataset is required");
+        }
+        const body = {
+            id,
+            name,
+            experiment_ids: experimentIds,
+            reference_dataset_id: referenceDatasetId,
+            description,
+            created_at: (createdAt ?? new Date())?.toISOString(),
+            extra: {},
+        };
+        if (metadata)
+            body.extra["metadata"] = metadata;
+        const response = await this.caller.call(fetch, `${this.apiUrl}/datasets/comparative`, {
+            method: "POST",
+            headers: { ...this.headers, "Content-Type": "application/json" },
+            body: JSON.stringify(body),
+            signal: AbortSignal.timeout(this.timeout_ms),
+            ...this.fetchOptions,
+        });
+        return await response.json();
+    }
     /**
      * Retrieves a list of presigned feedback tokens for a given run ID.
      * @param runId The ID of the run.

package/dist/evaluation/_runner.cjs CHANGED Viewed

@@ -470,10 +470,30 @@ class _ExperimentManager {
     async _getDatasetVersion() {
         const examples = await this.getExamples();
         const modifiedAt = examples.map((ex) => ex.modified_at);
-        const maxModifiedAt = modifiedAt.length > 0
-            ? new Date(Math.max(...modifiedAt.map((date) => new Date(date).getTime())))
-            : undefined;
-        return maxModifiedAt?.toISOString();
+        // Python might return microseconds, which we need
+        // to account for when comparing dates.
+        const modifiedAtTime = modifiedAt.map((date) => {
+            function getMiliseconds(isoString) {
+                const time = isoString.split("T").at(1);
+                if (!time)
+                    return "";
+                const regex = /[0-9]{2}:[0-9]{2}:[0-9]{2}.([0-9]+)/;
+                const strMiliseconds = time.match(regex)?.[1];
+                return strMiliseconds ?? "";
+            }
+            const jsDate = new Date(date);
+            let source = getMiliseconds(date);
+            let parsed = getMiliseconds(jsDate.toISOString());
+            const length = Math.max(source.length, parsed.length);
+            source = source.padEnd(length, "0");
+            parsed = parsed.padEnd(length, "0");
+            const microseconds = (Number.parseInt(source, 10) - Number.parseInt(parsed, 10)) / 1000;
+            const time = jsDate.getTime() + microseconds;
+            return { date, time };
+        });
+        if (modifiedAtTime.length === 0)
+            return undefined;
+        return modifiedAtTime.reduce((max, current) => (current.time > max.time ? current : max), modifiedAtTime[0]).date;
     }
     async _end() {
         const experiment = this._experiment;
@@ -572,9 +592,7 @@ async function _evaluate(target, fields) {
         runs: newRuns ?? undefined,
     }).start();
     if (_isCallable(target)) {
-        manager = await manager.withPredictions(convertInvokeToTopLevel(target), {
-            maxConcurrency: fields.maxConcurrency,
-        });
+        manager = await manager.withPredictions(convertInvokeToTopLevel(target), { maxConcurrency: fields.maxConcurrency });
     }
     if (fields.evaluators) {
         manager = await manager.withEvaluators(fields.evaluators, {

package/dist/evaluation/_runner.d.ts CHANGED Viewed

@@ -1,12 +1,12 @@
 import { Client } from "../index.js";
 import { Example, KVMap, Run, TracerSession } from "../schemas.js";
 import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluator.js";
-type TargetT = ((input: KVMap, config?: KVMap) => Promise<KVMap>) | ((input: KVMap, config?: KVMap) => KVMap) | {
-    invoke: (input: KVMap, config?: KVMap) => KVMap;
+type TargetT<TInput = any, TOutput = KVMap> = ((input: TInput, config?: KVMap) => Promise<TOutput>) | ((input: TInput, config?: KVMap) => TOutput) | {
+    invoke: (input: TInput, config?: KVMap) => TOutput;
 } | {
-    invoke: (input: KVMap, config?: KVMap) => Promise<KVMap>;
+    invoke: (input: TInput, config?: KVMap) => Promise<TOutput>;
 };
-type TargetNoInvoke = ((input: KVMap, config?: KVMap) => Promise<KVMap>) | ((input: KVMap, config?: KVMap) => KVMap);
+type TargetNoInvoke<TInput = any, TOutput = KVMap> = ((input: TInput, config?: KVMap) => Promise<TOutput>) | ((input: TInput, config?: KVMap) => TOutput);
 type DataT = string | AsyncIterable<Example> | Example[];
 type SummaryEvaluatorT = ((runs: Array<Run>, examples: Array<Example>) => Promise<EvaluationResult | EvaluationResults>) | ((runs: Array<Run>, examples: Array<Example>) => EvaluationResult | EvaluationResults);
 type EvaluatorT = RunEvaluator | ((run: Run, example?: Example) => EvaluationResult) | ((run: Run, example?: Example) => Promise<EvaluationResult>);

package/dist/evaluation/_runner.js CHANGED Viewed

@@ -466,10 +466,30 @@ class _ExperimentManager {
     async _getDatasetVersion() {
         const examples = await this.getExamples();
         const modifiedAt = examples.map((ex) => ex.modified_at);
-        const maxModifiedAt = modifiedAt.length > 0
-            ? new Date(Math.max(...modifiedAt.map((date) => new Date(date).getTime())))
-            : undefined;
-        return maxModifiedAt?.toISOString();
+        // Python might return microseconds, which we need
+        // to account for when comparing dates.
+        const modifiedAtTime = modifiedAt.map((date) => {
+            function getMiliseconds(isoString) {
+                const time = isoString.split("T").at(1);
+                if (!time)
+                    return "";
+                const regex = /[0-9]{2}:[0-9]{2}:[0-9]{2}.([0-9]+)/;
+                const strMiliseconds = time.match(regex)?.[1];
+                return strMiliseconds ?? "";
+            }
+            const jsDate = new Date(date);
+            let source = getMiliseconds(date);
+            let parsed = getMiliseconds(jsDate.toISOString());
+            const length = Math.max(source.length, parsed.length);
+            source = source.padEnd(length, "0");
+            parsed = parsed.padEnd(length, "0");
+            const microseconds = (Number.parseInt(source, 10) - Number.parseInt(parsed, 10)) / 1000;
+            const time = jsDate.getTime() + microseconds;
+            return { date, time };
+        });
+        if (modifiedAtTime.length === 0)
+            return undefined;
+        return modifiedAtTime.reduce((max, current) => (current.time > max.time ? current : max), modifiedAtTime[0]).date;
     }
     async _end() {
         const experiment = this._experiment;
@@ -568,9 +588,7 @@ async function _evaluate(target, fields) {
         runs: newRuns ?? undefined,
     }).start();
     if (_isCallable(target)) {
-        manager = await manager.withPredictions(convertInvokeToTopLevel(target), {
-            maxConcurrency: fields.maxConcurrency,
-        });
+        manager = await manager.withPredictions(convertInvokeToTopLevel(target), { maxConcurrency: fields.maxConcurrency });
     }
     if (fields.evaluators) {
         manager = await manager.withEvaluators(fields.evaluators, {

package/dist/evaluation/evaluate_comparative.cjs ADDED Viewed

@@ -0,0 +1,202 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.evaluateComparative = void 0;
+const uuid_1 = require("uuid");
+const index_js_1 = require("../index.cjs");
+const shuffle_js_1 = require("../utils/shuffle.cjs");
+const async_caller_js_1 = require("../utils/async_caller.cjs");
+const p_retry_1 = __importDefault(require("p-retry"));
+const traceable_js_1 = require("../traceable.cjs");
+function isExperimentResultsList(value) {
+    return value.some((x) => typeof x !== "string");
+}
+async function loadExperiment(client, experiment) {
+    const value = typeof experiment === "string" ? experiment : experiment.experimentName;
+    return client.readProject((0, uuid_1.validate)(value) ? { projectId: value } : { projectName: value });
+}
+async function loadTraces(client, experiment, options) {
+    const executionOrder = options.loadNested ? undefined : 1;
+    const runs = await client.listRuns((0, uuid_1.validate)(experiment)
+        ? { projectId: experiment, executionOrder }
+        : { projectName: experiment, executionOrder });
+    const treeMap = {};
+    const runIdMap = {};
+    const results = [];
+    for await (const run of runs) {
+        if (run.parent_run_id != null) {
+            treeMap[run.parent_run_id] ??= [];
+            treeMap[run.parent_run_id].push(run);
+        }
+        else {
+            results.push(run);
+        }
+        runIdMap[run.id] = run;
+    }
+    for (const [parentRunId, childRuns] of Object.entries(treeMap)) {
+        const parentRun = runIdMap[parentRunId];
+        parentRun.child_runs = childRuns.sort((a, b) => {
+            if (a.dotted_order == null || b.dotted_order == null)
+                return 0;
+            return a.dotted_order.localeCompare(b.dotted_order);
+        });
+    }
+    return results;
+}
+async function evaluateComparative(experiments, options) {
+    if (experiments.length < 2) {
+        throw new Error("Comparative evaluation requires at least 2 experiments.");
+    }
+    if (!options.evaluators.length) {
+        throw new Error("At least one evaluator is required for comparative evaluation.");
+    }
+    if (options.maxConcurrency && options.maxConcurrency < 0) {
+        throw new Error("maxConcurrency must be a positive number.");
+    }
+    const client = options.client ?? new index_js_1.Client();
+    const resolvedExperiments = await Promise.all(experiments);
+    const projects = await (() => {
+        if (!isExperimentResultsList(resolvedExperiments)) {
+            return Promise.all(resolvedExperiments.map((experiment) => loadExperiment(client, experiment)));
+        }
+        // if we know the number of runs beforehand, check if the
+        // number of runs in the project matches the expected number of runs
+        return Promise.all(resolvedExperiments.map((experiment) => (0, p_retry_1.default)(async () => {
+            const project = await loadExperiment(client, experiment);
+            if (project.run_count !== experiment?.results.length) {
+                throw new Error("Experiment is missing runs. Retrying.");
+            }
+            return project;
+        }, { factor: 2, minTimeout: 1000, retries: 10 })));
+    })();
+    if (new Set(projects.map((p) => p.reference_dataset_id)).size > 1) {
+        throw new Error("All experiments must have the same reference dataset.");
+    }
+    const referenceDatasetId = projects.at(0)?.reference_dataset_id;
+    if (!referenceDatasetId) {
+        throw new Error("Reference dataset is required for comparative evaluation.");
+    }
+    if (new Set(projects.map((p) => p.extra?.metadata?.dataset_version)).size > 1) {
+        console.warn("Detected multiple dataset versions used by experiments, which may lead to inaccurate results.");
+    }
+    const datasetVersion = projects.at(0)?.extra?.metadata?.dataset_version;
+    const id = (0, uuid_1.v4)();
+    const experimentName = (() => {
+        if (!options.experimentPrefix) {
+            const names = projects
+                .map((p) => p.name)
+                .filter(Boolean)
+                .join(" vs. ");
+            return `${names}-${(0, uuid_1.v4)().slice(0, 4)}`;
+        }
+        return `${options.experimentPrefix}-${(0, uuid_1.v4)().slice(0, 4)}`;
+    })();
+    // TODO: add URL to the comparative experiment
+    console.log(`Starting pairwise evaluation of: ${experimentName}`);
+    const comparativeExperiment = await client.createComparativeExperiment({
+        id,
+        name: experimentName,
+        experimentIds: projects.map((p) => p.id),
+        description: options.description,
+        metadata: options.metadata,
+        referenceDatasetId: projects.at(0)?.reference_dataset_id,
+    });
+    const viewUrl = await (async () => {
+        const projectId = projects.at(0)?.id ?? projects.at(1)?.id;
+        const datasetId = comparativeExperiment?.reference_dataset_id;
+        if (projectId && datasetId) {
+            const hostUrl = (await client.getProjectUrl({ projectId }))
+                .split("/projects/p/")
+                .at(0);
+            const result = new URL(`${hostUrl}/datasets/${datasetId}/compare`);
+            result.searchParams.set("selectedSessions", projects.map((p) => p.id).join(","));
+            result.searchParams.set("comparativeExperiment", comparativeExperiment.id);
+            return result.toString();
+        }
+        return null;
+    })();
+    if (viewUrl != null) {
+        console.log(`View results at: ${viewUrl}`);
+    }
+    const experimentRuns = await Promise.all(projects.map((p) => loadTraces(client, p.id, { loadNested: !!options.loadNested })));
+    let exampleIdsIntersect;
+    for (const runs of experimentRuns) {
+        const exampleIdsSet = new Set(runs
+            .map((r) => r.reference_example_id)
+            .filter((x) => x != null));
+        if (!exampleIdsIntersect) {
+            exampleIdsIntersect = exampleIdsSet;
+        }
+        else {
+            exampleIdsIntersect = new Set([...exampleIdsIntersect].filter((x) => exampleIdsSet.has(x)));
+        }
+    }
+    const exampleIds = [...(exampleIdsIntersect ?? [])];
+    if (!exampleIds.length) {
+        throw new Error("No examples found in common between experiments.");
+    }
+    const exampleMap = {};
+    for (let start = 0; start < exampleIds.length; start += 99) {
+        const exampleIdsChunk = exampleIds.slice(start, start + 99);
+        for await (const example of client.listExamples({
+            datasetId: referenceDatasetId,
+            exampleIds: exampleIdsChunk,
+            asOf: datasetVersion,
+        })) {
+            exampleMap[example.id] = example;
+        }
+    }
+    const runMapByExampleId = {};
+    for (const runs of experimentRuns) {
+        for (const run of runs) {
+            if (run.reference_example_id == null ||
+                !exampleIds.includes(run.reference_example_id)) {
+                continue;
+            }
+            runMapByExampleId[run.reference_example_id] ??= [];
+            runMapByExampleId[run.reference_example_id].push(run);
+        }
+    }
+    const caller = new async_caller_js_1.AsyncCaller({ maxConcurrency: options.maxConcurrency });
+    async function evaluateAndSubmitFeedback(runs, example, evaluator) {
+        const expectedRunIds = new Set(runs.map((r) => r.id));
+        const result = await evaluator(options.randomizeOrder ? (0, shuffle_js_1.shuffle)(runs) : runs, example);
+        for (const [runId, score] of Object.entries(result.scores)) {
+            // validate if the run id
+            if (!expectedRunIds.has(runId)) {
+                throw new Error(`Returning an invalid run id ${runId} from evaluator.`);
+            }
+            await client.createFeedback(runId, result.key, {
+                score,
+                sourceRunId: result.source_run_id,
+                comparativeExperimentId: comparativeExperiment.id,
+            });
+        }
+        return result;
+    }
+    const tracedEvaluators = options.evaluators.map((evaluator) => (0, traceable_js_1.traceable)(async (runs, example) => {
+        const evaluatorRun = (0, traceable_js_1.getCurrentRunTree)();
+        const result = await evaluator(runs, example);
+        // sanitise the payload before sending to LangSmith
+        evaluatorRun.inputs = { runs: runs, example: example };
+        evaluatorRun.outputs = result;
+        return {
+            ...result,
+            source_run_id: result.source_run_id ?? evaluatorRun.id,
+        };
+    }, {
+        project_name: "evaluators",
+        name: evaluator.name || "evaluator",
+    }));
+    const promises = Object.entries(runMapByExampleId).flatMap(([exampleId, runs]) => {
+        const example = exampleMap[exampleId];
+        if (!example)
+            throw new Error(`Example ${exampleId} not found.`);
+        return tracedEvaluators.map((evaluator) => caller.call(evaluateAndSubmitFeedback, runs, exampleMap[exampleId], evaluator));
+    });
+    const results = await Promise.all(promises);
+    return { experimentName, results };
+}
+exports.evaluateComparative = evaluateComparative;

package/dist/evaluation/evaluate_comparative.d.ts ADDED Viewed

@@ -0,0 +1,51 @@
+import { Client } from "../index.js";
+import { ComparisonEvaluationResult as ComparisonEvaluationResultRow, Example, Run } from "../schemas.js";
+import { evaluate } from "./index.js";
+type ExperimentResults = Awaited<ReturnType<typeof evaluate>>;
+export interface EvaluateComparativeOptions {
+    /**
+     * A list of evaluators to use for comparative evaluation.
+     */
+    evaluators: Array<(runs: Run[], example: Example) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>>;
+    /**
+     * Randomize the order of outputs for each evaluation
+     * @default false
+     */
+    randomizeOrder?: boolean;
+    /**
+     * The LangSmith client to use.
+     * @default undefined
+     */
+    client?: Client;
+    /**
+     * Metadata to attach to the experiment.
+     * @default undefined
+     */
+    metadata?: Record<string, unknown>;
+    /**
+     * A prefix to use for your experiment name.
+     * @default undefined
+     */
+    experimentPrefix?: string;
+    /**
+     * A free-form description of the experiment.
+     * @default undefined
+     */
+    description?: string;
+    /**
+     * Whether to load all child runs for the experiment.
+     * @default false
+     */
+    loadNested?: boolean;
+    /**
+     * The maximum number of concurrent evaluators to run.
+     * @default undefined
+     */
+    maxConcurrency?: number;
+}
+export interface ComparisonEvaluationResults {
+    experimentName: string;
+    results: ComparisonEvaluationResultRow[];
+}
+export declare function evaluateComparative(experiments: Array<string> | Array<Promise<ExperimentResults> | ExperimentResults>, options: EvaluateComparativeOptions): Promise<ComparisonEvaluationResults>;
+export {};