langsmith 0.1.23 → 0.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client.cjs +46 -5
- package/dist/client.d.ts +17 -3
- package/dist/client.js +46 -5
- package/dist/evaluation/_runner.cjs +25 -7
- package/dist/evaluation/_runner.d.ts +4 -4
- package/dist/evaluation/_runner.js +25 -7
- package/dist/evaluation/evaluate_comparative.cjs +202 -0
- package/dist/evaluation/evaluate_comparative.d.ts +51 -0
- package/dist/evaluation/evaluate_comparative.js +195 -0
- package/dist/evaluation/evaluator.cjs +10 -13
- package/dist/evaluation/evaluator.js +11 -14
- package/dist/evaluation/index.cjs +3 -1
- package/dist/evaluation/index.d.ts +1 -0
- package/dist/evaluation/index.js +1 -0
- package/dist/index.cjs +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/run_trees.cjs +1 -0
- package/dist/run_trees.js +1 -0
- package/dist/schemas.d.ts +17 -0
- package/dist/traceable.cjs +209 -23
- package/dist/traceable.js +206 -20
- package/dist/utils/shuffle.cjs +15 -0
- package/dist/utils/shuffle.d.ts +1 -0
- package/dist/utils/shuffle.js +11 -0
- package/package.json +1 -1
package/dist/client.cjs
CHANGED
|
@@ -304,8 +304,8 @@ class Client {
|
|
|
304
304
|
return this.webUrl;
|
|
305
305
|
}
|
|
306
306
|
else if (isLocalhost(this.apiUrl)) {
|
|
307
|
-
this.webUrl = "http://localhost";
|
|
308
|
-
return
|
|
307
|
+
this.webUrl = "http://localhost:3000";
|
|
308
|
+
return this.webUrl;
|
|
309
309
|
}
|
|
310
310
|
else if (this.apiUrl.includes("/api") &&
|
|
311
311
|
!this.apiUrl.split(".", 1)[0].endsWith("api")) {
|
|
@@ -314,11 +314,11 @@ class Client {
|
|
|
314
314
|
}
|
|
315
315
|
else if (this.apiUrl.split(".", 1)[0].includes("dev")) {
|
|
316
316
|
this.webUrl = "https://dev.smith.langchain.com";
|
|
317
|
-
return
|
|
317
|
+
return this.webUrl;
|
|
318
318
|
}
|
|
319
319
|
else {
|
|
320
320
|
this.webUrl = "https://smith.langchain.com";
|
|
321
|
-
return
|
|
321
|
+
return this.webUrl;
|
|
322
322
|
}
|
|
323
323
|
}
|
|
324
324
|
get headers() {
|
|
@@ -1162,6 +1162,14 @@ class Client {
|
|
|
1162
1162
|
}
|
|
1163
1163
|
return result;
|
|
1164
1164
|
}
|
|
1165
|
+
async getProjectUrl({ projectId, projectName, }) {
|
|
1166
|
+
if (projectId === undefined && projectName === undefined) {
|
|
1167
|
+
throw new Error("Must provide either projectName or projectId");
|
|
1168
|
+
}
|
|
1169
|
+
const project = await this.readProject({ projectId, projectName });
|
|
1170
|
+
const tenantId = await this._getTenantId();
|
|
1171
|
+
return `${this.getHostUrl()}/o/${tenantId}/projects/p/${project.id}`;
|
|
1172
|
+
}
|
|
1165
1173
|
async _getTenantId() {
|
|
1166
1174
|
if (this._tenantId !== null) {
|
|
1167
1175
|
return this._tenantId;
|
|
@@ -1609,7 +1617,7 @@ class Client {
|
|
|
1609
1617
|
sourceRunId: feedbackResult?.sourceRunId,
|
|
1610
1618
|
});
|
|
1611
1619
|
}
|
|
1612
|
-
async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, }) {
|
|
1620
|
+
async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }) {
|
|
1613
1621
|
if (!runId && !projectId) {
|
|
1614
1622
|
throw new Error("One of runId or projectId must be provided");
|
|
1615
1623
|
}
|
|
@@ -1638,6 +1646,7 @@ class Client {
|
|
|
1638
1646
|
correction,
|
|
1639
1647
|
comment,
|
|
1640
1648
|
feedback_source: feedback_source,
|
|
1649
|
+
comparative_experiment_id: comparativeExperimentId,
|
|
1641
1650
|
feedbackConfig,
|
|
1642
1651
|
session_id: projectId,
|
|
1643
1652
|
};
|
|
@@ -1759,6 +1768,38 @@ class Client {
|
|
|
1759
1768
|
const result = await response.json();
|
|
1760
1769
|
return result;
|
|
1761
1770
|
}
|
|
1771
|
+
async createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }) {
|
|
1772
|
+
if (experimentIds.length === 0) {
|
|
1773
|
+
throw new Error("At least one experiment is required");
|
|
1774
|
+
}
|
|
1775
|
+
if (!referenceDatasetId) {
|
|
1776
|
+
referenceDatasetId = (await this.readProject({
|
|
1777
|
+
projectId: experimentIds[0],
|
|
1778
|
+
})).reference_dataset_id;
|
|
1779
|
+
}
|
|
1780
|
+
if (!referenceDatasetId == null) {
|
|
1781
|
+
throw new Error("A reference dataset is required");
|
|
1782
|
+
}
|
|
1783
|
+
const body = {
|
|
1784
|
+
id,
|
|
1785
|
+
name,
|
|
1786
|
+
experiment_ids: experimentIds,
|
|
1787
|
+
reference_dataset_id: referenceDatasetId,
|
|
1788
|
+
description,
|
|
1789
|
+
created_at: (createdAt ?? new Date())?.toISOString(),
|
|
1790
|
+
extra: {},
|
|
1791
|
+
};
|
|
1792
|
+
if (metadata)
|
|
1793
|
+
body.extra["metadata"] = metadata;
|
|
1794
|
+
const response = await this.caller.call(fetch, `${this.apiUrl}/datasets/comparative`, {
|
|
1795
|
+
method: "POST",
|
|
1796
|
+
headers: { ...this.headers, "Content-Type": "application/json" },
|
|
1797
|
+
body: JSON.stringify(body),
|
|
1798
|
+
signal: AbortSignal.timeout(this.timeout_ms),
|
|
1799
|
+
...this.fetchOptions,
|
|
1800
|
+
});
|
|
1801
|
+
return await response.json();
|
|
1802
|
+
}
|
|
1762
1803
|
/**
|
|
1763
1804
|
* Retrieves a list of presigned feedback tokens for a given run ID.
|
|
1764
1805
|
* @param runId The ID of the run.
|
package/dist/client.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { AsyncCallerParams } from "./utils/async_caller.js";
|
|
2
|
-
import { DataType, Dataset, DatasetDiffInfo, DatasetShareSchema, Example, ExampleUpdate, Feedback, FeedbackConfig, FeedbackIngestToken, KVMap, LangChainBaseMessage, Run, RunCreate, RunUpdate, ScoreType, TimeDelta, TracerSession, TracerSessionResult, ValueType } from "./schemas.js";
|
|
2
|
+
import { ComparativeExperiment, DataType, Dataset, DatasetDiffInfo, DatasetShareSchema, Example, ExampleUpdate, Feedback, FeedbackConfig, FeedbackIngestToken, KVMap, LangChainBaseMessage, Run, RunCreate, RunUpdate, ScoreType, TimeDelta, TracerSession, TracerSessionResult, ValueType } from "./schemas.js";
|
|
3
3
|
import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluation/evaluator.js";
|
|
4
4
|
interface ClientConfig {
|
|
5
5
|
apiUrl?: string;
|
|
@@ -182,7 +182,7 @@ export declare class Client {
|
|
|
182
182
|
hideInputs?: boolean;
|
|
183
183
|
hideOutputs?: boolean;
|
|
184
184
|
};
|
|
185
|
-
|
|
185
|
+
getHostUrl(): string;
|
|
186
186
|
private get headers();
|
|
187
187
|
private processInputs;
|
|
188
188
|
private processOutputs;
|
|
@@ -334,6 +334,10 @@ export declare class Client {
|
|
|
334
334
|
projectName?: string;
|
|
335
335
|
includeStats?: boolean;
|
|
336
336
|
}): Promise<TracerSessionResult>;
|
|
337
|
+
getProjectUrl({ projectId, projectName, }: {
|
|
338
|
+
projectId?: string;
|
|
339
|
+
projectName?: string;
|
|
340
|
+
}): Promise<string>;
|
|
337
341
|
private _getTenantId;
|
|
338
342
|
listProjects({ projectIds, name, nameContains, referenceDatasetId, referenceDatasetName, referenceFree, }?: {
|
|
339
343
|
projectIds?: string[];
|
|
@@ -409,7 +413,7 @@ export declare class Client {
|
|
|
409
413
|
loadChildRuns: boolean;
|
|
410
414
|
referenceExample?: Example;
|
|
411
415
|
}): Promise<Feedback>;
|
|
412
|
-
createFeedback(runId: string | null, key: string, { score, value, correction, comment, sourceInfo, feedbackSourceType, sourceRunId, feedbackId, feedbackConfig, projectId, }: {
|
|
416
|
+
createFeedback(runId: string | null, key: string, { score, value, correction, comment, sourceInfo, feedbackSourceType, sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }: {
|
|
413
417
|
score?: ScoreType;
|
|
414
418
|
value?: ValueType;
|
|
415
419
|
correction?: object;
|
|
@@ -421,6 +425,7 @@ export declare class Client {
|
|
|
421
425
|
feedbackId?: string;
|
|
422
426
|
eager?: boolean;
|
|
423
427
|
projectId?: string;
|
|
428
|
+
comparativeExperimentId?: string;
|
|
424
429
|
}): Promise<Feedback>;
|
|
425
430
|
updateFeedback(feedbackId: string, { score, value, correction, comment, }: {
|
|
426
431
|
score?: number | boolean | null;
|
|
@@ -454,6 +459,15 @@ export declare class Client {
|
|
|
454
459
|
expiration?: string | TimeDelta;
|
|
455
460
|
feedbackConfig?: FeedbackConfig;
|
|
456
461
|
}): Promise<FeedbackIngestToken>;
|
|
462
|
+
createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }: {
|
|
463
|
+
name: string;
|
|
464
|
+
experimentIds: Array<string>;
|
|
465
|
+
referenceDatasetId?: string;
|
|
466
|
+
createdAt?: Date;
|
|
467
|
+
description?: string;
|
|
468
|
+
metadata?: Record<string, unknown>;
|
|
469
|
+
id?: string;
|
|
470
|
+
}): Promise<ComparativeExperiment>;
|
|
457
471
|
/**
|
|
458
472
|
* Retrieves a list of presigned feedback tokens for a given run ID.
|
|
459
473
|
* @param runId The ID of the run.
|
package/dist/client.js
CHANGED
|
@@ -277,8 +277,8 @@ export class Client {
|
|
|
277
277
|
return this.webUrl;
|
|
278
278
|
}
|
|
279
279
|
else if (isLocalhost(this.apiUrl)) {
|
|
280
|
-
this.webUrl = "http://localhost";
|
|
281
|
-
return
|
|
280
|
+
this.webUrl = "http://localhost:3000";
|
|
281
|
+
return this.webUrl;
|
|
282
282
|
}
|
|
283
283
|
else if (this.apiUrl.includes("/api") &&
|
|
284
284
|
!this.apiUrl.split(".", 1)[0].endsWith("api")) {
|
|
@@ -287,11 +287,11 @@ export class Client {
|
|
|
287
287
|
}
|
|
288
288
|
else if (this.apiUrl.split(".", 1)[0].includes("dev")) {
|
|
289
289
|
this.webUrl = "https://dev.smith.langchain.com";
|
|
290
|
-
return
|
|
290
|
+
return this.webUrl;
|
|
291
291
|
}
|
|
292
292
|
else {
|
|
293
293
|
this.webUrl = "https://smith.langchain.com";
|
|
294
|
-
return
|
|
294
|
+
return this.webUrl;
|
|
295
295
|
}
|
|
296
296
|
}
|
|
297
297
|
get headers() {
|
|
@@ -1135,6 +1135,14 @@ export class Client {
|
|
|
1135
1135
|
}
|
|
1136
1136
|
return result;
|
|
1137
1137
|
}
|
|
1138
|
+
async getProjectUrl({ projectId, projectName, }) {
|
|
1139
|
+
if (projectId === undefined && projectName === undefined) {
|
|
1140
|
+
throw new Error("Must provide either projectName or projectId");
|
|
1141
|
+
}
|
|
1142
|
+
const project = await this.readProject({ projectId, projectName });
|
|
1143
|
+
const tenantId = await this._getTenantId();
|
|
1144
|
+
return `${this.getHostUrl()}/o/${tenantId}/projects/p/${project.id}`;
|
|
1145
|
+
}
|
|
1138
1146
|
async _getTenantId() {
|
|
1139
1147
|
if (this._tenantId !== null) {
|
|
1140
1148
|
return this._tenantId;
|
|
@@ -1582,7 +1590,7 @@ export class Client {
|
|
|
1582
1590
|
sourceRunId: feedbackResult?.sourceRunId,
|
|
1583
1591
|
});
|
|
1584
1592
|
}
|
|
1585
|
-
async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, }) {
|
|
1593
|
+
async createFeedback(runId, key, { score, value, correction, comment, sourceInfo, feedbackSourceType = "api", sourceRunId, feedbackId, feedbackConfig, projectId, comparativeExperimentId, }) {
|
|
1586
1594
|
if (!runId && !projectId) {
|
|
1587
1595
|
throw new Error("One of runId or projectId must be provided");
|
|
1588
1596
|
}
|
|
@@ -1611,6 +1619,7 @@ export class Client {
|
|
|
1611
1619
|
correction,
|
|
1612
1620
|
comment,
|
|
1613
1621
|
feedback_source: feedback_source,
|
|
1622
|
+
comparative_experiment_id: comparativeExperimentId,
|
|
1614
1623
|
feedbackConfig,
|
|
1615
1624
|
session_id: projectId,
|
|
1616
1625
|
};
|
|
@@ -1732,6 +1741,38 @@ export class Client {
|
|
|
1732
1741
|
const result = await response.json();
|
|
1733
1742
|
return result;
|
|
1734
1743
|
}
|
|
1744
|
+
async createComparativeExperiment({ name, experimentIds, referenceDatasetId, createdAt, description, metadata, id, }) {
|
|
1745
|
+
if (experimentIds.length === 0) {
|
|
1746
|
+
throw new Error("At least one experiment is required");
|
|
1747
|
+
}
|
|
1748
|
+
if (!referenceDatasetId) {
|
|
1749
|
+
referenceDatasetId = (await this.readProject({
|
|
1750
|
+
projectId: experimentIds[0],
|
|
1751
|
+
})).reference_dataset_id;
|
|
1752
|
+
}
|
|
1753
|
+
if (!referenceDatasetId == null) {
|
|
1754
|
+
throw new Error("A reference dataset is required");
|
|
1755
|
+
}
|
|
1756
|
+
const body = {
|
|
1757
|
+
id,
|
|
1758
|
+
name,
|
|
1759
|
+
experiment_ids: experimentIds,
|
|
1760
|
+
reference_dataset_id: referenceDatasetId,
|
|
1761
|
+
description,
|
|
1762
|
+
created_at: (createdAt ?? new Date())?.toISOString(),
|
|
1763
|
+
extra: {},
|
|
1764
|
+
};
|
|
1765
|
+
if (metadata)
|
|
1766
|
+
body.extra["metadata"] = metadata;
|
|
1767
|
+
const response = await this.caller.call(fetch, `${this.apiUrl}/datasets/comparative`, {
|
|
1768
|
+
method: "POST",
|
|
1769
|
+
headers: { ...this.headers, "Content-Type": "application/json" },
|
|
1770
|
+
body: JSON.stringify(body),
|
|
1771
|
+
signal: AbortSignal.timeout(this.timeout_ms),
|
|
1772
|
+
...this.fetchOptions,
|
|
1773
|
+
});
|
|
1774
|
+
return await response.json();
|
|
1775
|
+
}
|
|
1735
1776
|
/**
|
|
1736
1777
|
* Retrieves a list of presigned feedback tokens for a given run ID.
|
|
1737
1778
|
* @param runId The ID of the run.
|
|
@@ -470,10 +470,30 @@ class _ExperimentManager {
|
|
|
470
470
|
async _getDatasetVersion() {
|
|
471
471
|
const examples = await this.getExamples();
|
|
472
472
|
const modifiedAt = examples.map((ex) => ex.modified_at);
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
473
|
+
// Python might return microseconds, which we need
|
|
474
|
+
// to account for when comparing dates.
|
|
475
|
+
const modifiedAtTime = modifiedAt.map((date) => {
|
|
476
|
+
function getMiliseconds(isoString) {
|
|
477
|
+
const time = isoString.split("T").at(1);
|
|
478
|
+
if (!time)
|
|
479
|
+
return "";
|
|
480
|
+
const regex = /[0-9]{2}:[0-9]{2}:[0-9]{2}.([0-9]+)/;
|
|
481
|
+
const strMiliseconds = time.match(regex)?.[1];
|
|
482
|
+
return strMiliseconds ?? "";
|
|
483
|
+
}
|
|
484
|
+
const jsDate = new Date(date);
|
|
485
|
+
let source = getMiliseconds(date);
|
|
486
|
+
let parsed = getMiliseconds(jsDate.toISOString());
|
|
487
|
+
const length = Math.max(source.length, parsed.length);
|
|
488
|
+
source = source.padEnd(length, "0");
|
|
489
|
+
parsed = parsed.padEnd(length, "0");
|
|
490
|
+
const microseconds = (Number.parseInt(source, 10) - Number.parseInt(parsed, 10)) / 1000;
|
|
491
|
+
const time = jsDate.getTime() + microseconds;
|
|
492
|
+
return { date, time };
|
|
493
|
+
});
|
|
494
|
+
if (modifiedAtTime.length === 0)
|
|
495
|
+
return undefined;
|
|
496
|
+
return modifiedAtTime.reduce((max, current) => (current.time > max.time ? current : max), modifiedAtTime[0]).date;
|
|
477
497
|
}
|
|
478
498
|
async _end() {
|
|
479
499
|
const experiment = this._experiment;
|
|
@@ -572,9 +592,7 @@ async function _evaluate(target, fields) {
|
|
|
572
592
|
runs: newRuns ?? undefined,
|
|
573
593
|
}).start();
|
|
574
594
|
if (_isCallable(target)) {
|
|
575
|
-
manager = await manager.withPredictions(convertInvokeToTopLevel(target), {
|
|
576
|
-
maxConcurrency: fields.maxConcurrency,
|
|
577
|
-
});
|
|
595
|
+
manager = await manager.withPredictions(convertInvokeToTopLevel(target), { maxConcurrency: fields.maxConcurrency });
|
|
578
596
|
}
|
|
579
597
|
if (fields.evaluators) {
|
|
580
598
|
manager = await manager.withEvaluators(fields.evaluators, {
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import { Client } from "../index.js";
|
|
2
2
|
import { Example, KVMap, Run, TracerSession } from "../schemas.js";
|
|
3
3
|
import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluator.js";
|
|
4
|
-
type TargetT = ((input:
|
|
5
|
-
invoke: (input:
|
|
4
|
+
type TargetT<TInput = any, TOutput = KVMap> = ((input: TInput, config?: KVMap) => Promise<TOutput>) | ((input: TInput, config?: KVMap) => TOutput) | {
|
|
5
|
+
invoke: (input: TInput, config?: KVMap) => TOutput;
|
|
6
6
|
} | {
|
|
7
|
-
invoke: (input:
|
|
7
|
+
invoke: (input: TInput, config?: KVMap) => Promise<TOutput>;
|
|
8
8
|
};
|
|
9
|
-
type TargetNoInvoke = ((input:
|
|
9
|
+
type TargetNoInvoke<TInput = any, TOutput = KVMap> = ((input: TInput, config?: KVMap) => Promise<TOutput>) | ((input: TInput, config?: KVMap) => TOutput);
|
|
10
10
|
type DataT = string | AsyncIterable<Example> | Example[];
|
|
11
11
|
type SummaryEvaluatorT = ((runs: Array<Run>, examples: Array<Example>) => Promise<EvaluationResult | EvaluationResults>) | ((runs: Array<Run>, examples: Array<Example>) => EvaluationResult | EvaluationResults);
|
|
12
12
|
type EvaluatorT = RunEvaluator | ((run: Run, example?: Example) => EvaluationResult) | ((run: Run, example?: Example) => Promise<EvaluationResult>);
|
|
@@ -466,10 +466,30 @@ class _ExperimentManager {
|
|
|
466
466
|
async _getDatasetVersion() {
|
|
467
467
|
const examples = await this.getExamples();
|
|
468
468
|
const modifiedAt = examples.map((ex) => ex.modified_at);
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
469
|
+
// Python might return microseconds, which we need
|
|
470
|
+
// to account for when comparing dates.
|
|
471
|
+
const modifiedAtTime = modifiedAt.map((date) => {
|
|
472
|
+
function getMiliseconds(isoString) {
|
|
473
|
+
const time = isoString.split("T").at(1);
|
|
474
|
+
if (!time)
|
|
475
|
+
return "";
|
|
476
|
+
const regex = /[0-9]{2}:[0-9]{2}:[0-9]{2}.([0-9]+)/;
|
|
477
|
+
const strMiliseconds = time.match(regex)?.[1];
|
|
478
|
+
return strMiliseconds ?? "";
|
|
479
|
+
}
|
|
480
|
+
const jsDate = new Date(date);
|
|
481
|
+
let source = getMiliseconds(date);
|
|
482
|
+
let parsed = getMiliseconds(jsDate.toISOString());
|
|
483
|
+
const length = Math.max(source.length, parsed.length);
|
|
484
|
+
source = source.padEnd(length, "0");
|
|
485
|
+
parsed = parsed.padEnd(length, "0");
|
|
486
|
+
const microseconds = (Number.parseInt(source, 10) - Number.parseInt(parsed, 10)) / 1000;
|
|
487
|
+
const time = jsDate.getTime() + microseconds;
|
|
488
|
+
return { date, time };
|
|
489
|
+
});
|
|
490
|
+
if (modifiedAtTime.length === 0)
|
|
491
|
+
return undefined;
|
|
492
|
+
return modifiedAtTime.reduce((max, current) => (current.time > max.time ? current : max), modifiedAtTime[0]).date;
|
|
473
493
|
}
|
|
474
494
|
async _end() {
|
|
475
495
|
const experiment = this._experiment;
|
|
@@ -568,9 +588,7 @@ async function _evaluate(target, fields) {
|
|
|
568
588
|
runs: newRuns ?? undefined,
|
|
569
589
|
}).start();
|
|
570
590
|
if (_isCallable(target)) {
|
|
571
|
-
manager = await manager.withPredictions(convertInvokeToTopLevel(target), {
|
|
572
|
-
maxConcurrency: fields.maxConcurrency,
|
|
573
|
-
});
|
|
591
|
+
manager = await manager.withPredictions(convertInvokeToTopLevel(target), { maxConcurrency: fields.maxConcurrency });
|
|
574
592
|
}
|
|
575
593
|
if (fields.evaluators) {
|
|
576
594
|
manager = await manager.withEvaluators(fields.evaluators, {
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.evaluateComparative = void 0;
|
|
7
|
+
const uuid_1 = require("uuid");
|
|
8
|
+
const index_js_1 = require("../index.cjs");
|
|
9
|
+
const shuffle_js_1 = require("../utils/shuffle.cjs");
|
|
10
|
+
const async_caller_js_1 = require("../utils/async_caller.cjs");
|
|
11
|
+
const p_retry_1 = __importDefault(require("p-retry"));
|
|
12
|
+
const traceable_js_1 = require("../traceable.cjs");
|
|
13
|
+
/**
 * Tells whether the list contains at least one entry that is not a string.
 * An empty list, or a list made only of strings, yields `false`.
 */
function isExperimentResultsList(value) {
    const onlyStrings = value.every((entry) => typeof entry === "string");
    return !onlyStrings;
}
|
|
16
|
+
/**
 * Reads the project behind an experiment.
 *
 * `experiment` is either a string or an object exposing `experimentName`.
 * The resulting identifier is passed to `client.readProject` as `projectId`
 * when it validates as a UUID, and as `projectName` otherwise.
 */
async function loadExperiment(client, experiment) {
    let identifier;
    if (typeof experiment === "string") {
        identifier = experiment;
    }
    else {
        identifier = experiment.experimentName;
    }
    if ((0, uuid_1.validate)(identifier)) {
        return client.readProject({ projectId: identifier });
    }
    return client.readProject({ projectName: identifier });
}
|
|
20
|
+
/**
 * Lists the runs of an experiment and returns those without a parent run.
 *
 * When `options.loadNested` is falsy the listing is restricted with
 * `executionOrder: 1`; otherwise no execution-order filter is applied.
 * Runs that have a parent are attached to that parent's `child_runs`,
 * ordered by `dotted_order` (pairs where either value is missing compare
 * as equal).
 *
 * @param client Client used to list the runs.
 * @param experiment Project id (when it validates as a UUID) or project name.
 * @param options Object carrying the `loadNested` flag.
 * @returns The runs whose `parent_run_id` is null or undefined.
 */
async function loadTraces(client, experiment, options) {
    const executionOrder = options.loadNested ? undefined : 1;
    const isProjectId = (0, uuid_1.validate)(experiment);
    const filter = isProjectId
        ? { projectId: experiment, executionOrder }
        : { projectName: experiment, executionOrder };
    const runs = await client.listRuns(filter);
    const childrenByParentId = {};
    const runsById = {};
    const rootRuns = [];
    for await (const run of runs) {
        const parentId = run.parent_run_id;
        if (parentId == null) {
            rootRuns.push(run);
        }
        else {
            (childrenByParentId[parentId] ??= []).push(run);
        }
        runsById[run.id] = run;
    }
    const compareDottedOrder = (left, right) => {
        const comparable = left.dotted_order != null && right.dotted_order != null;
        return comparable ? left.dotted_order.localeCompare(right.dotted_order) : 0;
    };
    Object.keys(childrenByParentId).forEach((parentId) => {
        runsById[parentId].child_runs = childrenByParentId[parentId].sort(compareDottedOrder);
    });
    return rootRuns;
}
|
|
48
|
+
/**
 * Runs comparative evaluators over the runs of several experiments and
 * records the resulting scores as feedback.
 *
 * Steps, in order:
 *  1. validate the arguments and resolve the experiments to projects;
 *  2. check that all projects share one reference dataset;
 *  3. create the comparative experiment and log a comparison URL;
 *  4. load the runs of every project and keep the examples common to all;
 *  5. call every evaluator once per common example and submit one feedback
 *     entry per scored run.
 *
 * @param experiments At least two experiment identifiers (strings) or
 *   experiment result objects / promises of them.
 * @param options Evaluators plus optional `client`, `experimentPrefix`,
 *   `description`, `metadata`, `loadNested`, `randomizeOrder` and
 *   `maxConcurrency`.
 * @returns An object with the generated `experimentName` and the list of
 *   evaluator `results`.
 * @throws Error when fewer than two experiments or no evaluators are given,
 *   when `maxConcurrency` is negative, when the reference datasets differ or
 *   are missing, when the experiments share no examples, when an example
 *   cannot be found, or when an evaluator scores an unknown run id.
 */
async function evaluateComparative(experiments, options) {
    if (experiments.length < 2) {
        throw new Error("Comparative evaluation requires at least 2 experiments.");
    }
    if (!options.evaluators.length) {
        throw new Error("At least one evaluator is required for comparative evaluation.");
    }
    if (options.maxConcurrency && options.maxConcurrency < 0) {
        throw new Error("maxConcurrency must be a positive number.");
    }
    const client = options.client ?? new index_js_1.Client();
    const resolvedExperiments = await Promise.all(experiments);
    const projects = await (() => {
        if (!isExperimentResultsList(resolvedExperiments)) {
            return Promise.all(resolvedExperiments.map((experiment) => loadExperiment(client, experiment)));
        }
        // if we know the number of runs beforehand, check if the
        // number of runs in the project matches the expected number of runs
        return Promise.all(resolvedExperiments.map((experiment) => (0, p_retry_1.default)(async () => {
            const project = await loadExperiment(client, experiment);
            if (project.run_count !== experiment?.results.length) {
                throw new Error("Experiment is missing runs. Retrying.");
            }
            return project;
        }, { factor: 2, minTimeout: 1000, retries: 10 })));
    })();
    if (new Set(projects.map((p) => p.reference_dataset_id)).size > 1) {
        throw new Error("All experiments must have the same reference dataset.");
    }
    const referenceDatasetId = projects.at(0)?.reference_dataset_id;
    if (!referenceDatasetId) {
        throw new Error("Reference dataset is required for comparative evaluation.");
    }
    if (new Set(projects.map((p) => p.extra?.metadata?.dataset_version)).size > 1) {
        console.warn("Detected multiple dataset versions used by experiments, which may lead to inaccurate results.");
    }
    const datasetVersion = projects.at(0)?.extra?.metadata?.dataset_version;
    const id = (0, uuid_1.v4)();
    const experimentName = (() => {
        if (!options.experimentPrefix) {
            const names = projects
                .map((p) => p.name)
                .filter(Boolean)
                .join(" vs. ");
            return `${names}-${(0, uuid_1.v4)().slice(0, 4)}`;
        }
        return `${options.experimentPrefix}-${(0, uuid_1.v4)().slice(0, 4)}`;
    })();
    // TODO: add URL to the comparative experiment
    console.log(`Starting pairwise evaluation of: ${experimentName}`);
    const comparativeExperiment = await client.createComparativeExperiment({
        id,
        name: experimentName,
        experimentIds: projects.map((p) => p.id),
        description: options.description,
        metadata: options.metadata,
        // Reuse the id validated above instead of reading it from the project again.
        referenceDatasetId,
    });
    const viewUrl = await (async () => {
        const projectId = projects.at(0)?.id ?? projects.at(1)?.id;
        const datasetId = comparativeExperiment?.reference_dataset_id;
        if (projectId && datasetId) {
            // Keep only the part of the project URL that precedes "/projects/p/".
            const hostUrl = (await client.getProjectUrl({ projectId }))
                .split("/projects/p/")
                .at(0);
            const result = new URL(`${hostUrl}/datasets/${datasetId}/compare`);
            result.searchParams.set("selectedSessions", projects.map((p) => p.id).join(","));
            result.searchParams.set("comparativeExperiment", comparativeExperiment.id);
            return result.toString();
        }
        return null;
    })();
    if (viewUrl != null) {
        console.log(`View results at: ${viewUrl}`);
    }
    const experimentRuns = await Promise.all(projects.map((p) => loadTraces(client, p.id, { loadNested: !!options.loadNested })));
    // Intersect the example ids referenced by the runs of every experiment.
    let exampleIdsIntersect;
    for (const runs of experimentRuns) {
        const exampleIdsSet = new Set(runs
            .map((r) => r.reference_example_id)
            .filter((x) => x != null));
        if (!exampleIdsIntersect) {
            exampleIdsIntersect = exampleIdsSet;
        }
        else {
            exampleIdsIntersect = new Set([...exampleIdsIntersect].filter((x) => exampleIdsSet.has(x)));
        }
    }
    const exampleIds = [...(exampleIdsIntersect ?? [])];
    if (!exampleIds.length) {
        throw new Error("No examples found in common between experiments.");
    }
    // Examples are requested in fixed-size batches of ids.
    const exampleChunkSize = 99;
    const exampleMap = {};
    for (let start = 0; start < exampleIds.length; start += exampleChunkSize) {
        const exampleIdsChunk = exampleIds.slice(start, start + exampleChunkSize);
        for await (const example of client.listExamples({
            datasetId: referenceDatasetId,
            exampleIds: exampleIdsChunk,
            asOf: datasetVersion,
        })) {
            exampleMap[example.id] = example;
        }
    }
    // Set lookup keeps the grouping below linear in the number of runs.
    const commonExampleIds = new Set(exampleIds);
    const runMapByExampleId = {};
    for (const runs of experimentRuns) {
        for (const run of runs) {
            if (run.reference_example_id == null ||
                !commonExampleIds.has(run.reference_example_id)) {
                continue;
            }
            runMapByExampleId[run.reference_example_id] ??= [];
            runMapByExampleId[run.reference_example_id].push(run);
        }
    }
    const caller = new async_caller_js_1.AsyncCaller({ maxConcurrency: options.maxConcurrency });
    async function evaluateAndSubmitFeedback(runs, example, evaluator) {
        const expectedRunIds = new Set(runs.map((r) => r.id));
        const result = await evaluator(options.randomizeOrder ? (0, shuffle_js_1.shuffle)(runs) : runs, example);
        for (const [runId, score] of Object.entries(result.scores)) {
            // Reject scores for run ids that were not part of this comparison.
            if (!expectedRunIds.has(runId)) {
                throw new Error(`Returning an invalid run id ${runId} from evaluator.`);
            }
            await client.createFeedback(runId, result.key, {
                score,
                sourceRunId: result.source_run_id,
                comparativeExperimentId: comparativeExperiment.id,
            });
        }
        return result;
    }
    const tracedEvaluators = options.evaluators.map((evaluator) => (0, traceable_js_1.traceable)(async (runs, example) => {
        const evaluatorRun = (0, traceable_js_1.getCurrentRunTree)();
        const result = await evaluator(runs, example);
        // Record the evaluator's inputs and outputs on its run tree.
        evaluatorRun.inputs = { runs: runs, example: example };
        evaluatorRun.outputs = result;
        return {
            ...result,
            // Default the feedback source to the evaluator's own run.
            source_run_id: result.source_run_id ?? evaluatorRun.id,
        };
    }, {
        project_name: "evaluators",
        name: evaluator.name || "evaluator",
    }));
    const promises = Object.entries(runMapByExampleId).flatMap(([exampleId, runs]) => {
        const example = exampleMap[exampleId];
        if (!example)
            throw new Error(`Example ${exampleId} not found.`);
        return tracedEvaluators.map((evaluator) => caller.call(evaluateAndSubmitFeedback, runs, example, evaluator));
    });
    const results = await Promise.all(promises);
    return { experimentName, results };
}
|
|
202
|
+
exports.evaluateComparative = evaluateComparative;
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { Client } from "../index.js";
|
|
2
|
+
import { ComparisonEvaluationResult as ComparisonEvaluationResultRow, Example, Run } from "../schemas.js";
|
|
3
|
+
import { evaluate } from "./index.js";
|
|
4
|
+
type ExperimentResults = Awaited<ReturnType<typeof evaluate>>;
|
|
5
|
+
export interface EvaluateComparativeOptions {
|
|
6
|
+
/**
|
|
7
|
+
* A list of evaluators to use for comparative evaluation.
|
|
8
|
+
*/
|
|
9
|
+
evaluators: Array<(runs: Run[], example: Example) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>>;
|
|
10
|
+
/**
|
|
11
|
+
* Randomize the order of outputs for each evaluation
|
|
12
|
+
* @default false
|
|
13
|
+
*/
|
|
14
|
+
randomizeOrder?: boolean;
|
|
15
|
+
/**
|
|
16
|
+
* The LangSmith client to use.
|
|
17
|
+
* @default undefined
|
|
18
|
+
*/
|
|
19
|
+
client?: Client;
|
|
20
|
+
/**
|
|
21
|
+
* Metadata to attach to the experiment.
|
|
22
|
+
* @default undefined
|
|
23
|
+
*/
|
|
24
|
+
metadata?: Record<string, unknown>;
|
|
25
|
+
/**
|
|
26
|
+
* A prefix to use for your experiment name.
|
|
27
|
+
* @default undefined
|
|
28
|
+
*/
|
|
29
|
+
experimentPrefix?: string;
|
|
30
|
+
/**
|
|
31
|
+
* A free-form description of the experiment.
|
|
32
|
+
* @default undefined
|
|
33
|
+
*/
|
|
34
|
+
description?: string;
|
|
35
|
+
/**
|
|
36
|
+
* Whether to load all child runs for the experiment.
|
|
37
|
+
* @default false
|
|
38
|
+
*/
|
|
39
|
+
loadNested?: boolean;
|
|
40
|
+
/**
|
|
41
|
+
* The maximum number of concurrent evaluators to run.
|
|
42
|
+
* @default undefined
|
|
43
|
+
*/
|
|
44
|
+
maxConcurrency?: number;
|
|
45
|
+
}
|
|
46
|
+
export interface ComparisonEvaluationResults {
|
|
47
|
+
experimentName: string;
|
|
48
|
+
results: ComparisonEvaluationResultRow[];
|
|
49
|
+
}
|
|
50
|
+
export declare function evaluateComparative(experiments: Array<string> | Array<Promise<ExperimentResults> | ExperimentResults>, options: EvaluateComparativeOptions): Promise<ComparisonEvaluationResults>;
|
|
51
|
+
export {};
|