langsmith 0.1.23 → 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/client.cjs CHANGED
@@ -304,8 +304,8 @@ class Client {
304
304
  return this.webUrl;
305
305
  }
306
306
  else if (isLocalhost(this.apiUrl)) {
307
- this.webUrl = "http://localhost";
308
- return "http://localhost";
307
+ this.webUrl = "http://localhost:3000";
308
+ return this.webUrl;
309
309
  }
310
310
  else if (this.apiUrl.includes("/api") &&
311
311
  !this.apiUrl.split(".", 1)[0].endsWith("api")) {
@@ -314,11 +314,11 @@ class Client {
314
314
  }
315
315
  else if (this.apiUrl.split(".", 1)[0].includes("dev")) {
316
316
  this.webUrl = "https://dev.smith.langchain.com";
317
- return "https://dev.smith.langchain.com";
317
+ return this.webUrl;
318
318
  }
319
319
  else {
320
320
  this.webUrl = "https://smith.langchain.com";
321
- return "https://smith.langchain.com";
321
+ return this.webUrl;
322
322
  }
323
323
  }
324
324
  get headers() {
@@ -1162,6 +1162,14 @@ class Client {
1162
1162
  }
1163
1163
  return result;
1164
1164
  }
1165
+ async getProjectUrl({ projectId, projectName, }) {
1166
+ if (projectId === undefined && projectName === undefined) {
1167
+ throw new Error("Must provide either projectName or projectId");
1168
+ }
1169
+ const project = await this.readProject({ projectId, projectName });
1170
+ const tenantId = await this._getTenantId();
1171
+ return `${this.getHostUrl()}/o/${tenantId}/projects/p/${project.id}`;
1172
+ }
1165
1173
  async _getTenantId() {
1166
1174
  if (this._tenantId !== null) {
1167
1175
  return this._tenantId;
@@ -1609,7 +1617,7 @@ class Client {
1609
1617
  sourceRunId: feedbackResult?.sourceRunId,
1610
1618
  });
1611
1619
  }
1612
- async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, }) {
1620
+ async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }) {
1613
1621
  if (!runId && !projectId) {
1614
1622
  throw new Error("One of runId or projectId must be provided");
1615
1623
  }
@@ -1638,6 +1646,7 @@ class Client {
1638
1646
  correction,
1639
1647
  comment,
1640
1648
  feedback_source: feedback_source,
1649
+ comparative_experiment_id: comparativeExperimentId,
1641
1650
  feedbackConfig,
1642
1651
  session_id: projectId,
1643
1652
  };
@@ -1759,6 +1768,38 @@ class Client {
1759
1768
  const result = await response.json();
1760
1769
  return result;
1761
1770
  }
1771
+ async createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }) {
1772
+ if (experimentIds.length === 0) {
1773
+ throw new Error("At least one experiment is required");
1774
+ }
1775
+ if (!referenceDatasetId) {
1776
+ referenceDatasetId = (await this.readProject({
1777
+ projectId: experimentIds[0],
1778
+ })).reference_dataset_id;
1779
+ }
1780
+ if (referenceDatasetId == null) {
1781
+ throw new Error("A reference dataset is required");
1782
+ }
1783
+ const body = {
1784
+ id,
1785
+ name,
1786
+ experiment_ids: experimentIds,
1787
+ reference_dataset_id: referenceDatasetId,
1788
+ description,
1789
+ created_at: (createdAt ?? new Date())?.toISOString(),
1790
+ extra: {},
1791
+ };
1792
+ if (metadata)
1793
+ body.extra["metadata"] = metadata;
1794
+ const response = await this.caller.call(fetch, `${this.apiUrl}/datasets/comparative`, {
1795
+ method: "POST",
1796
+ headers: { ...this.headers, "Content-Type": "application/json" },
1797
+ body: JSON.stringify(body),
1798
+ signal: AbortSignal.timeout(this.timeout_ms),
1799
+ ...this.fetchOptions,
1800
+ });
1801
+ return await response.json();
1802
+ }
1762
1803
  /**
1763
1804
  * Retrieves a list of presigned feedback tokens for a given run ID.
1764
1805
  * @param runId The ID of the run.
package/dist/client.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { AsyncCallerParams } from "./utils/async_caller.js";
2
- import { DataType, Dataset, DatasetDiffInfo, DatasetShareSchema, Example, ExampleUpdate, Feedback, FeedbackConfig, FeedbackIngestToken, KVMap, LangChainBaseMessage, Run, RunCreate, RunUpdate, ScoreType, TimeDelta, TracerSession, TracerSessionResult, ValueType } from "./schemas.js";
2
+ import { ComparativeExperiment, DataType, Dataset, DatasetDiffInfo, DatasetShareSchema, Example, ExampleUpdate, Feedback, FeedbackConfig, FeedbackIngestToken, KVMap, LangChainBaseMessage, Run, RunCreate, RunUpdate, ScoreType, TimeDelta, TracerSession, TracerSessionResult, ValueType } from "./schemas.js";
3
3
  import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluation/evaluator.js";
4
4
  interface ClientConfig {
5
5
  apiUrl?: string;
@@ -182,7 +182,7 @@ export declare class Client {
182
182
  hideInputs?: boolean;
183
183
  hideOutputs?: boolean;
184
184
  };
185
- private getHostUrl;
185
+ getHostUrl(): string;
186
186
  private get headers();
187
187
  private processInputs;
188
188
  private processOutputs;
@@ -334,6 +334,10 @@ export declare class Client {
334
334
  projectName?: string;
335
335
  includeStats?: boolean;
336
336
  }): Promise<TracerSessionResult>;
337
+ getProjectUrl({ projectId, projectName, }: {
338
+ projectId?: string;
339
+ projectName?: string;
340
+ }): Promise<string>;
337
341
  private _getTenantId;
338
342
  listProjects({ projectIds, name, nameContains, referenceDatasetId, referenceDatasetName, referenceFree, }?: {
339
343
  projectIds?: string[];
@@ -409,7 +413,7 @@ export declare class Client {
409
413
  loadChildRuns: boolean;
410
414
  referenceExample?: Example;
411
415
  }): Promise<Feedback>;
412
- createFeedback(runId: string | null, key: string, { score, value, correction, comment, sourceInfo, feedbackSourceType, sourceRunId, feedbackId, feedbackConfig, projectId, }: {
416
+ createFeedback(runId: string | null, key: string, { score, value, correction, comment, sourceInfo, feedbackSourceType, sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }: {
413
417
  score?: ScoreType;
414
418
  value?: ValueType;
415
419
  correction?: object;
@@ -421,6 +425,7 @@ export declare class Client {
421
425
  feedbackId?: string;
422
426
  eager?: boolean;
423
427
  projectId?: string;
428
+ comparativeExperimentId?: string;
424
429
  }): Promise<Feedback>;
425
430
  updateFeedback(feedbackId: string, { score, value, correction, comment, }: {
426
431
  score?: number | boolean | null;
@@ -454,6 +459,15 @@ export declare class Client {
454
459
  expiration?: string | TimeDelta;
455
460
  feedbackConfig?: FeedbackConfig;
456
461
  }): Promise<FeedbackIngestToken>;
462
+ createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }: {
463
+ name: string;
464
+ experimentIds: Array<string>;
465
+ referenceDatasetId?: string;
466
+ createdAt?: Date;
467
+ description?: string;
468
+ metadata?: Record<string, unknown>;
469
+ id?: string;
470
+ }): Promise<ComparativeExperiment>;
457
471
  /**
458
472
  * Retrieves a list of presigned feedback tokens for a given run ID.
459
473
  * @param runId The ID of the run.
package/dist/client.js CHANGED
@@ -277,8 +277,8 @@ export class Client {
277
277
  return this.webUrl;
278
278
  }
279
279
  else if (isLocalhost(this.apiUrl)) {
280
- this.webUrl = "http://localhost";
281
- return "http://localhost";
280
+ this.webUrl = "http://localhost:3000";
281
+ return this.webUrl;
282
282
  }
283
283
  else if (this.apiUrl.includes("/api") &&
284
284
  !this.apiUrl.split(".", 1)[0].endsWith("api")) {
@@ -287,11 +287,11 @@ export class Client {
287
287
  }
288
288
  else if (this.apiUrl.split(".", 1)[0].includes("dev")) {
289
289
  this.webUrl = "https://dev.smith.langchain.com";
290
- return "https://dev.smith.langchain.com";
290
+ return this.webUrl;
291
291
  }
292
292
  else {
293
293
  this.webUrl = "https://smith.langchain.com";
294
- return "https://smith.langchain.com";
294
+ return this.webUrl;
295
295
  }
296
296
  }
297
297
  get headers() {
@@ -1135,6 +1135,14 @@ export class Client {
1135
1135
  }
1136
1136
  return result;
1137
1137
  }
1138
+ async getProjectUrl({ projectId, projectName, }) {
1139
+ if (projectId === undefined && projectName === undefined) {
1140
+ throw new Error("Must provide either projectName or projectId");
1141
+ }
1142
+ const project = await this.readProject({ projectId, projectName });
1143
+ const tenantId = await this._getTenantId();
1144
+ return `${this.getHostUrl()}/o/${tenantId}/projects/p/${project.id}`;
1145
+ }
1138
1146
  async _getTenantId() {
1139
1147
  if (this._tenantId !== null) {
1140
1148
  return this._tenantId;
@@ -1582,7 +1590,7 @@ export class Client {
1582
1590
  sourceRunId: feedbackResult?.sourceRunId,
1583
1591
  });
1584
1592
  }
1585
- async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, }) {
1593
+ async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }) {
1586
1594
  if (!runId && !projectId) {
1587
1595
  throw new Error("One of runId or projectId must be provided");
1588
1596
  }
@@ -1611,6 +1619,7 @@ export class Client {
1611
1619
  correction,
1612
1620
  comment,
1613
1621
  feedback_source: feedback_source,
1622
+ comparative_experiment_id: comparativeExperimentId,
1614
1623
  feedbackConfig,
1615
1624
  session_id: projectId,
1616
1625
  };
@@ -1732,6 +1741,38 @@ export class Client {
1732
1741
  const result = await response.json();
1733
1742
  return result;
1734
1743
  }
1744
+ async createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }) {
1745
+ if (experimentIds.length === 0) {
1746
+ throw new Error("At least one experiment is required");
1747
+ }
1748
+ if (!referenceDatasetId) {
1749
+ referenceDatasetId = (await this.readProject({
1750
+ projectId: experimentIds[0],
1751
+ })).reference_dataset_id;
1752
+ }
1753
+ if (referenceDatasetId == null) {
1754
+ throw new Error("A reference dataset is required");
1755
+ }
1756
+ const body = {
1757
+ id,
1758
+ name,
1759
+ experiment_ids: experimentIds,
1760
+ reference_dataset_id: referenceDatasetId,
1761
+ description,
1762
+ created_at: (createdAt ?? new Date())?.toISOString(),
1763
+ extra: {},
1764
+ };
1765
+ if (metadata)
1766
+ body.extra["metadata"] = metadata;
1767
+ const response = await this.caller.call(fetch, `${this.apiUrl}/datasets/comparative`, {
1768
+ method: "POST",
1769
+ headers: { ...this.headers, "Content-Type": "application/json" },
1770
+ body: JSON.stringify(body),
1771
+ signal: AbortSignal.timeout(this.timeout_ms),
1772
+ ...this.fetchOptions,
1773
+ });
1774
+ return await response.json();
1775
+ }
1735
1776
  /**
1736
1777
  * Retrieves a list of presigned feedback tokens for a given run ID.
1737
1778
  * @param runId The ID of the run.
@@ -470,10 +470,30 @@ class _ExperimentManager {
470
470
  async _getDatasetVersion() {
471
471
  const examples = await this.getExamples();
472
472
  const modifiedAt = examples.map((ex) => ex.modified_at);
473
- const maxModifiedAt = modifiedAt.length > 0
474
- ? new Date(Math.max(...modifiedAt.map((date) => new Date(date).getTime())))
475
- : undefined;
476
- return maxModifiedAt?.toISOString();
473
+ // Python might return microseconds, which we need
474
+ // to account for when comparing dates.
475
+ const modifiedAtTime = modifiedAt.map((date) => {
476
+ function getMiliseconds(isoString) {
477
+ const time = isoString.split("T").at(1);
478
+ if (!time)
479
+ return "";
480
+ const regex = /[0-9]{2}:[0-9]{2}:[0-9]{2}.([0-9]+)/;
481
+ const strMiliseconds = time.match(regex)?.[1];
482
+ return strMiliseconds ?? "";
483
+ }
484
+ const jsDate = new Date(date);
485
+ let source = getMiliseconds(date);
486
+ let parsed = getMiliseconds(jsDate.toISOString());
487
+ const length = Math.max(source.length, parsed.length);
488
+ source = source.padEnd(length, "0");
489
+ parsed = parsed.padEnd(length, "0");
490
+ const microseconds = (Number.parseInt(source, 10) - Number.parseInt(parsed, 10)) / 1000;
491
+ const time = jsDate.getTime() + microseconds;
492
+ return { date, time };
493
+ });
494
+ if (modifiedAtTime.length === 0)
495
+ return undefined;
496
+ return modifiedAtTime.reduce((max, current) => (current.time > max.time ? current : max), modifiedAtTime[0]).date;
477
497
  }
478
498
  async _end() {
479
499
  const experiment = this._experiment;
@@ -572,9 +592,7 @@ async function _evaluate(target, fields) {
572
592
  runs: newRuns ?? undefined,
573
593
  }).start();
574
594
  if (_isCallable(target)) {
575
- manager = await manager.withPredictions(convertInvokeToTopLevel(target), {
576
- maxConcurrency: fields.maxConcurrency,
577
- });
595
+ manager = await manager.withPredictions(convertInvokeToTopLevel(target), { maxConcurrency: fields.maxConcurrency });
578
596
  }
579
597
  if (fields.evaluators) {
580
598
  manager = await manager.withEvaluators(fields.evaluators, {
@@ -1,12 +1,12 @@
1
1
  import { Client } from "../index.js";
2
2
  import { Example, KVMap, Run, TracerSession } from "../schemas.js";
3
3
  import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluator.js";
4
- type TargetT = ((input: KVMap, config?: KVMap) => Promise<KVMap>) | ((input: KVMap, config?: KVMap) => KVMap) | {
5
- invoke: (input: KVMap, config?: KVMap) => KVMap;
4
+ type TargetT<TInput = any, TOutput = KVMap> = ((input: TInput, config?: KVMap) => Promise<TOutput>) | ((input: TInput, config?: KVMap) => TOutput) | {
5
+ invoke: (input: TInput, config?: KVMap) => TOutput;
6
6
  } | {
7
- invoke: (input: KVMap, config?: KVMap) => Promise<KVMap>;
7
+ invoke: (input: TInput, config?: KVMap) => Promise<TOutput>;
8
8
  };
9
- type TargetNoInvoke = ((input: KVMap, config?: KVMap) => Promise<KVMap>) | ((input: KVMap, config?: KVMap) => KVMap);
9
+ type TargetNoInvoke<TInput = any, TOutput = KVMap> = ((input: TInput, config?: KVMap) => Promise<TOutput>) | ((input: TInput, config?: KVMap) => TOutput);
10
10
  type DataT = string | AsyncIterable<Example> | Example[];
11
11
  type SummaryEvaluatorT = ((runs: Array<Run>, examples: Array<Example>) => Promise<EvaluationResult | EvaluationResults>) | ((runs: Array<Run>, examples: Array<Example>) => EvaluationResult | EvaluationResults);
12
12
  type EvaluatorT = RunEvaluator | ((run: Run, example?: Example) => EvaluationResult) | ((run: Run, example?: Example) => Promise<EvaluationResult>);
@@ -466,10 +466,30 @@ class _ExperimentManager {
466
466
  async _getDatasetVersion() {
467
467
  const examples = await this.getExamples();
468
468
  const modifiedAt = examples.map((ex) => ex.modified_at);
469
- const maxModifiedAt = modifiedAt.length > 0
470
- ? new Date(Math.max(...modifiedAt.map((date) => new Date(date).getTime())))
471
- : undefined;
472
- return maxModifiedAt?.toISOString();
469
+ // Python might return microseconds, which we need
470
+ // to account for when comparing dates.
471
+ const modifiedAtTime = modifiedAt.map((date) => {
472
+ function getMiliseconds(isoString) {
473
+ const time = isoString.split("T").at(1);
474
+ if (!time)
475
+ return "";
476
+ const regex = /[0-9]{2}:[0-9]{2}:[0-9]{2}.([0-9]+)/;
477
+ const strMiliseconds = time.match(regex)?.[1];
478
+ return strMiliseconds ?? "";
479
+ }
480
+ const jsDate = new Date(date);
481
+ let source = getMiliseconds(date);
482
+ let parsed = getMiliseconds(jsDate.toISOString());
483
+ const length = Math.max(source.length, parsed.length);
484
+ source = source.padEnd(length, "0");
485
+ parsed = parsed.padEnd(length, "0");
486
+ const microseconds = (Number.parseInt(source, 10) - Number.parseInt(parsed, 10)) / 1000;
487
+ const time = jsDate.getTime() + microseconds;
488
+ return { date, time };
489
+ });
490
+ if (modifiedAtTime.length === 0)
491
+ return undefined;
492
+ return modifiedAtTime.reduce((max, current) => (current.time > max.time ? current : max), modifiedAtTime[0]).date;
473
493
  }
474
494
  async _end() {
475
495
  const experiment = this._experiment;
@@ -568,9 +588,7 @@ async function _evaluate(target, fields) {
568
588
  runs: newRuns ?? undefined,
569
589
  }).start();
570
590
  if (_isCallable(target)) {
571
- manager = await manager.withPredictions(convertInvokeToTopLevel(target), {
572
- maxConcurrency: fields.maxConcurrency,
573
- });
591
+ manager = await manager.withPredictions(convertInvokeToTopLevel(target), { maxConcurrency: fields.maxConcurrency });
574
592
  }
575
593
  if (fields.evaluators) {
576
594
  manager = await manager.withEvaluators(fields.evaluators, {
@@ -0,0 +1,202 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.evaluateComparative = void 0;
7
+ const uuid_1 = require("uuid");
8
+ const index_js_1 = require("../index.cjs");
9
+ const shuffle_js_1 = require("../utils/shuffle.cjs");
10
+ const async_caller_js_1 = require("../utils/async_caller.cjs");
11
+ const p_retry_1 = __importDefault(require("p-retry"));
12
+ const traceable_js_1 = require("../traceable.cjs");
13
+ function isExperimentResultsList(value) {
14
+ return value.some((x) => typeof x !== "string");
15
+ }
16
+ async function loadExperiment(client, experiment) {
17
+ const value = typeof experiment === "string" ? experiment : experiment.experimentName;
18
+ return client.readProject((0, uuid_1.validate)(value) ? { projectId: value } : { projectName: value });
19
+ }
20
+ async function loadTraces(client, experiment, options) {
21
+ const executionOrder = options.loadNested ? undefined : 1;
22
+ const runs = await client.listRuns((0, uuid_1.validate)(experiment)
23
+ ? { projectId: experiment, executionOrder }
24
+ : { projectName: experiment, executionOrder });
25
+ const treeMap = {};
26
+ const runIdMap = {};
27
+ const results = [];
28
+ for await (const run of runs) {
29
+ if (run.parent_run_id != null) {
30
+ treeMap[run.parent_run_id] ??= [];
31
+ treeMap[run.parent_run_id].push(run);
32
+ }
33
+ else {
34
+ results.push(run);
35
+ }
36
+ runIdMap[run.id] = run;
37
+ }
38
+ for (const [parentRunId, childRuns] of Object.entries(treeMap)) {
39
+ const parentRun = runIdMap[parentRunId];
40
+ parentRun.child_runs = childRuns.sort((a, b) => {
41
+ if (a.dotted_order == null || b.dotted_order == null)
42
+ return 0;
43
+ return a.dotted_order.localeCompare(b.dotted_order);
44
+ });
45
+ }
46
+ return results;
47
+ }
48
+ async function evaluateComparative(experiments, options) {
49
+ if (experiments.length < 2) {
50
+ throw new Error("Comparative evaluation requires at least 2 experiments.");
51
+ }
52
+ if (!options.evaluators.length) {
53
+ throw new Error("At least one evaluator is required for comparative evaluation.");
54
+ }
55
+ if (options.maxConcurrency && options.maxConcurrency < 0) {
56
+ throw new Error("maxConcurrency must be a positive number.");
57
+ }
58
+ const client = options.client ?? new index_js_1.Client();
59
+ const resolvedExperiments = await Promise.all(experiments);
60
+ const projects = await (() => {
61
+ if (!isExperimentResultsList(resolvedExperiments)) {
62
+ return Promise.all(resolvedExperiments.map((experiment) => loadExperiment(client, experiment)));
63
+ }
64
+ // if we know the number of runs beforehand, check if the
65
+ // number of runs in the project matches the expected number of runs
66
+ return Promise.all(resolvedExperiments.map((experiment) => (0, p_retry_1.default)(async () => {
67
+ const project = await loadExperiment(client, experiment);
68
+ if (project.run_count !== experiment?.results.length) {
69
+ throw new Error("Experiment is missing runs. Retrying.");
70
+ }
71
+ return project;
72
+ }, { factor: 2, minTimeout: 1000, retries: 10 })));
73
+ })();
74
+ if (new Set(projects.map((p) => p.reference_dataset_id)).size > 1) {
75
+ throw new Error("All experiments must have the same reference dataset.");
76
+ }
77
+ const referenceDatasetId = projects.at(0)?.reference_dataset_id;
78
+ if (!referenceDatasetId) {
79
+ throw new Error("Reference dataset is required for comparative evaluation.");
80
+ }
81
+ if (new Set(projects.map((p) => p.extra?.metadata?.dataset_version)).size > 1) {
82
+ console.warn("Detected multiple dataset versions used by experiments, which may lead to inaccurate results.");
83
+ }
84
+ const datasetVersion = projects.at(0)?.extra?.metadata?.dataset_version;
85
+ const id = (0, uuid_1.v4)();
86
+ const experimentName = (() => {
87
+ if (!options.experimentPrefix) {
88
+ const names = projects
89
+ .map((p) => p.name)
90
+ .filter(Boolean)
91
+ .join(" vs. ");
92
+ return `${names}-${(0, uuid_1.v4)().slice(0, 4)}`;
93
+ }
94
+ return `${options.experimentPrefix}-${(0, uuid_1.v4)().slice(0, 4)}`;
95
+ })();
96
+ // TODO: add URL to the comparative experiment
97
+ console.log(`Starting pairwise evaluation of: ${experimentName}`);
98
+ const comparativeExperiment = await client.createComparativeExperiment({
99
+ id,
100
+ name: experimentName,
101
+ experimentIds: projects.map((p) => p.id),
102
+ description: options.description,
103
+ metadata: options.metadata,
104
+ referenceDatasetId: projects.at(0)?.reference_dataset_id,
105
+ });
106
+ const viewUrl = await (async () => {
107
+ const projectId = projects.at(0)?.id ?? projects.at(1)?.id;
108
+ const datasetId = comparativeExperiment?.reference_dataset_id;
109
+ if (projectId && datasetId) {
110
+ const hostUrl = (await client.getProjectUrl({ projectId }))
111
+ .split("/projects/p/")
112
+ .at(0);
113
+ const result = new URL(`${hostUrl}/datasets/${datasetId}/compare`);
114
+ result.searchParams.set("selectedSessions", projects.map((p) => p.id).join(","));
115
+ result.searchParams.set("comparativeExperiment", comparativeExperiment.id);
116
+ return result.toString();
117
+ }
118
+ return null;
119
+ })();
120
+ if (viewUrl != null) {
121
+ console.log(`View results at: ${viewUrl}`);
122
+ }
123
+ const experimentRuns = await Promise.all(projects.map((p) => loadTraces(client, p.id, { loadNested: !!options.loadNested })));
124
+ let exampleIdsIntersect;
125
+ for (const runs of experimentRuns) {
126
+ const exampleIdsSet = new Set(runs
127
+ .map((r) => r.reference_example_id)
128
+ .filter((x) => x != null));
129
+ if (!exampleIdsIntersect) {
130
+ exampleIdsIntersect = exampleIdsSet;
131
+ }
132
+ else {
133
+ exampleIdsIntersect = new Set([...exampleIdsIntersect].filter((x) => exampleIdsSet.has(x)));
134
+ }
135
+ }
136
+ const exampleIds = [...(exampleIdsIntersect ?? [])];
137
+ if (!exampleIds.length) {
138
+ throw new Error("No examples found in common between experiments.");
139
+ }
140
+ const exampleMap = {};
141
+ for (let start = 0; start < exampleIds.length; start += 99) {
142
+ const exampleIdsChunk = exampleIds.slice(start, start + 99);
143
+ for await (const example of client.listExamples({
144
+ datasetId: referenceDatasetId,
145
+ exampleIds: exampleIdsChunk,
146
+ asOf: datasetVersion,
147
+ })) {
148
+ exampleMap[example.id] = example;
149
+ }
150
+ }
151
+ const runMapByExampleId = {};
152
+ for (const runs of experimentRuns) {
153
+ for (const run of runs) {
154
+ if (run.reference_example_id == null ||
155
+ !exampleIds.includes(run.reference_example_id)) {
156
+ continue;
157
+ }
158
+ runMapByExampleId[run.reference_example_id] ??= [];
159
+ runMapByExampleId[run.reference_example_id].push(run);
160
+ }
161
+ }
162
+ const caller = new async_caller_js_1.AsyncCaller({ maxConcurrency: options.maxConcurrency });
163
+ async function evaluateAndSubmitFeedback(runs, example, evaluator) {
164
+ const expectedRunIds = new Set(runs.map((r) => r.id));
165
+ const result = await evaluator(options.randomizeOrder ? (0, shuffle_js_1.shuffle)(runs) : runs, example);
166
+ for (const [runId, score] of Object.entries(result.scores)) {
167
+ // validate that the run id returned by the evaluator belongs to this comparison set
168
+ if (!expectedRunIds.has(runId)) {
169
+ throw new Error(`Returning an invalid run id ${runId} from evaluator.`);
170
+ }
171
+ await client.createFeedback(runId, result.key, {
172
+ score,
173
+ sourceRunId: result.source_run_id,
174
+ comparativeExperimentId: comparativeExperiment.id,
175
+ });
176
+ }
177
+ return result;
178
+ }
179
+ const tracedEvaluators = options.evaluators.map((evaluator) => (0, traceable_js_1.traceable)(async (runs, example) => {
180
+ const evaluatorRun = (0, traceable_js_1.getCurrentRunTree)();
181
+ const result = await evaluator(runs, example);
182
+ // sanitise the payload before sending to LangSmith
183
+ evaluatorRun.inputs = { runs: runs, example: example };
184
+ evaluatorRun.outputs = result;
185
+ return {
186
+ ...result,
187
+ source_run_id: result.source_run_id ?? evaluatorRun.id,
188
+ };
189
+ }, {
190
+ project_name: "evaluators",
191
+ name: evaluator.name || "evaluator",
192
+ }));
193
+ const promises = Object.entries(runMapByExampleId).flatMap(([exampleId, runs]) => {
194
+ const example = exampleMap[exampleId];
195
+ if (!example)
196
+ throw new Error(`Example ${exampleId} not found.`);
197
+ return tracedEvaluators.map((evaluator) => caller.call(evaluateAndSubmitFeedback, runs, exampleMap[exampleId], evaluator));
198
+ });
199
+ const results = await Promise.all(promises);
200
+ return { experimentName, results };
201
+ }
202
+ exports.evaluateComparative = evaluateComparative;
@@ -0,0 +1,51 @@
1
+ import { Client } from "../index.js";
2
+ import { ComparisonEvaluationResult as ComparisonEvaluationResultRow, Example, Run } from "../schemas.js";
3
+ import { evaluate } from "./index.js";
4
+ type ExperimentResults = Awaited<ReturnType<typeof evaluate>>;
5
+ export interface EvaluateComparativeOptions {
6
+ /**
7
+ * A list of evaluators to use for comparative evaluation.
8
+ */
9
+ evaluators: Array<(runs: Run[], example: Example) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>>;
10
+ /**
11
+ * Randomize the order of outputs for each evaluation
12
+ * @default false
13
+ */
14
+ randomizeOrder?: boolean;
15
+ /**
16
+ * The LangSmith client to use.
17
+ * @default undefined
18
+ */
19
+ client?: Client;
20
+ /**
21
+ * Metadata to attach to the experiment.
22
+ * @default undefined
23
+ */
24
+ metadata?: Record<string, unknown>;
25
+ /**
26
+ * A prefix to use for your experiment name.
27
+ * @default undefined
28
+ */
29
+ experimentPrefix?: string;
30
+ /**
31
+ * A free-form description of the experiment.
32
+ * @default undefined
33
+ */
34
+ description?: string;
35
+ /**
36
+ * Whether to load all child runs for the experiment.
37
+ * @default false
38
+ */
39
+ loadNested?: boolean;
40
+ /**
41
+ * The maximum number of concurrent evaluators to run.
42
+ * @default undefined
43
+ */
44
+ maxConcurrency?: number;
45
+ }
46
+ export interface ComparisonEvaluationResults {
47
+ experimentName: string;
48
+ results: ComparisonEvaluationResultRow[];
49
+ }
50
+ export declare function evaluateComparative(experiments: Array<string> | Array<Promise<ExperimentResults> | ExperimentResults>, options: EvaluateComparativeOptions): Promise<ComparisonEvaluationResults>;
51
+ export {};