langsmith 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/_runner.cjs +44 -12
- package/dist/evaluation/_runner.d.ts +79 -27
- package/dist/evaluation/_runner.js +44 -12
- package/dist/evaluation/evaluate_comparative.cjs +19 -2
- package/dist/evaluation/evaluate_comparative.d.ts +11 -1
- package/dist/evaluation/evaluate_comparative.js +19 -2
- package/dist/evaluation/evaluator.cjs +8 -1
- package/dist/evaluation/evaluator.d.ts +13 -1
- package/dist/evaluation/evaluator.js +8 -1
- package/dist/index.cjs +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
|
@@ -12,11 +12,9 @@ const error_js_1 = require("../utils/error.cjs");
|
|
|
12
12
|
const _random_name_js_1 = require("./_random_name.cjs");
|
|
13
13
|
const evaluator_js_1 = require("./evaluator.cjs");
|
|
14
14
|
const uuid_1 = require("uuid");
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
*/
|
|
19
|
-
target, options) {
|
|
15
|
+
const evaluate_comparative_js_1 = require("./evaluate_comparative.cjs");
|
|
16
|
+
// Implementation signature
|
|
17
|
+
function evaluate(target, options) {
|
|
20
18
|
return _evaluate(target, options);
|
|
21
19
|
}
|
|
22
20
|
exports.evaluate = evaluate;
|
|
@@ -381,7 +379,7 @@ class _ExperimentManager {
|
|
|
381
379
|
// Private methods
|
|
382
380
|
/**
|
|
383
381
|
* Run the target function or runnable on the examples.
|
|
384
|
-
* @param {
|
|
382
|
+
* @param {StandardTargetT} target The target function or runnable to evaluate.
|
|
385
383
|
* @param options
|
|
386
384
|
* @returns {AsyncGenerator<_ForwardResults>} An async generator of the results.
|
|
387
385
|
*/
|
|
@@ -630,12 +628,32 @@ class ExperimentResults {
|
|
|
630
628
|
}
|
|
631
629
|
}
|
|
632
630
|
async function _evaluate(target, fields) {
|
|
631
|
+
// Add check for comparative evaluation
|
|
632
|
+
if (Array.isArray(target)) {
|
|
633
|
+
const comparativeOptions = fields;
|
|
634
|
+
if (!comparativeOptions.evaluators) {
|
|
635
|
+
throw new Error("Evaluators are required for comparative evaluation");
|
|
636
|
+
}
|
|
637
|
+
return (0, evaluate_comparative_js_1.evaluateComparative)(target, {
|
|
638
|
+
evaluators: comparativeOptions.evaluators,
|
|
639
|
+
client: comparativeOptions.client,
|
|
640
|
+
metadata: comparativeOptions.metadata,
|
|
641
|
+
experimentPrefix: comparativeOptions.experimentPrefix,
|
|
642
|
+
description: comparativeOptions.description,
|
|
643
|
+
maxConcurrency: comparativeOptions.maxConcurrency,
|
|
644
|
+
loadNested: comparativeOptions.loadNested ?? false,
|
|
645
|
+
randomizeOrder: comparativeOptions.randomizeOrder ?? false,
|
|
646
|
+
});
|
|
647
|
+
}
|
|
633
648
|
const client = fields.client ?? new index_js_1.Client();
|
|
634
649
|
const runs = _isCallable(target) ? null : target;
|
|
650
|
+
const standardFields = fields;
|
|
635
651
|
const [experiment_, newRuns] = await _resolveExperiment(fields.experiment ?? null, runs, client);
|
|
636
652
|
let manager = await new _ExperimentManager({
|
|
637
|
-
data: Array.isArray(
|
|
638
|
-
examples: Array.isArray(
|
|
653
|
+
data: Array.isArray(standardFields.data) ? undefined : standardFields.data,
|
|
654
|
+
examples: Array.isArray(standardFields.data)
|
|
655
|
+
? standardFields.data
|
|
656
|
+
: undefined,
|
|
639
657
|
client,
|
|
640
658
|
metadata: fields.metadata,
|
|
641
659
|
experiment: experiment_ ?? fields.experimentPrefix,
|
|
@@ -647,13 +665,13 @@ async function _evaluate(target, fields) {
|
|
|
647
665
|
maxConcurrency: fields.maxConcurrency,
|
|
648
666
|
});
|
|
649
667
|
}
|
|
650
|
-
if (
|
|
651
|
-
manager = await manager.withEvaluators(
|
|
668
|
+
if (standardFields.evaluators) {
|
|
669
|
+
manager = await manager.withEvaluators(standardFields.evaluators, {
|
|
652
670
|
maxConcurrency: fields.maxConcurrency,
|
|
653
671
|
});
|
|
654
672
|
}
|
|
655
|
-
if (
|
|
656
|
-
manager = await manager.withSummaryEvaluators(
|
|
673
|
+
if (standardFields.summaryEvaluators) {
|
|
674
|
+
manager = await manager.withSummaryEvaluators(standardFields.summaryEvaluators);
|
|
657
675
|
}
|
|
658
676
|
// Start consuming the results.
|
|
659
677
|
const results = new ExperimentResults(manager);
|
|
@@ -743,6 +761,20 @@ async function wrapSummaryEvaluators(evaluators, optionsArray) {
|
|
|
743
761
|
const evalName = evaluator.name || "BatchEvaluator";
|
|
744
762
|
const wrapperInner = (runs, examples) => {
|
|
745
763
|
const wrapperSuperInner = (0, traceable_js_1.traceable)((_runs_, _examples_) => {
|
|
764
|
+
// Check if the evaluator expects an object parameter
|
|
765
|
+
if (evaluator.length === 1) {
|
|
766
|
+
const inputs = examples.map((ex) => ex.inputs);
|
|
767
|
+
const outputs = runs.map((run) => run.outputs || {});
|
|
768
|
+
const referenceOutputs = examples.map((ex) => ex.outputs || {});
|
|
769
|
+
return Promise.resolve(evaluator({
|
|
770
|
+
runs,
|
|
771
|
+
examples,
|
|
772
|
+
inputs,
|
|
773
|
+
outputs,
|
|
774
|
+
referenceOutputs,
|
|
775
|
+
}));
|
|
776
|
+
}
|
|
777
|
+
// Otherwise use the traditional (runs, examples) signature
|
|
746
778
|
return Promise.resolve(evaluator(runs, examples));
|
|
747
779
|
}, { ...optionsArray, name: evalName });
|
|
748
780
|
return Promise.resolve(wrapperSuperInner(`Runs[] (Length=${runs.length})`, `Examples[] (Length=${examples.length})`));
|
|
@@ -1,14 +1,51 @@
|
|
|
1
1
|
import { Client } from "../index.js";
|
|
2
2
|
import { Example, KVMap, Run, TracerSession } from "../schemas.js";
|
|
3
3
|
import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluator.js";
|
|
4
|
-
|
|
4
|
+
import { ComparisonEvaluationResults, ComparativeEvaluator } from "./evaluate_comparative.js";
|
|
5
|
+
type StandardTargetT<TInput = any, TOutput = KVMap> = ((input: TInput, config?: KVMap) => Promise<TOutput>) | ((input: TInput, config?: KVMap) => TOutput) | {
|
|
5
6
|
invoke: (input: TInput, config?: KVMap) => TOutput;
|
|
6
7
|
} | {
|
|
7
8
|
invoke: (input: TInput, config?: KVMap) => Promise<TOutput>;
|
|
8
9
|
};
|
|
10
|
+
type ComparativeTargetT = Array<string> | Array<Promise<ExperimentResults> | ExperimentResults>;
|
|
11
|
+
export type TargetT<TInput = any, TOutput = KVMap> = StandardTargetT<TInput, TOutput> | ComparativeTargetT;
|
|
9
12
|
export type DataT = string | AsyncIterable<Example> | Example[];
|
|
10
|
-
|
|
11
|
-
|
|
13
|
+
/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */
|
|
14
|
+
type DeprecatedSyncSummaryEvaluator = (runs: Array<Run>, examples: Array<Example>) => EvaluationResult | EvaluationResults;
|
|
15
|
+
/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */
|
|
16
|
+
type DeprecatedAsyncSummaryEvaluator = (runs: Array<Run>, examples: Array<Example>) => Promise<EvaluationResult | EvaluationResults>;
|
|
17
|
+
export type SummaryEvaluatorT = DeprecatedSyncSummaryEvaluator | DeprecatedAsyncSummaryEvaluator | ((args: {
|
|
18
|
+
runs: Array<Run>;
|
|
19
|
+
examples: Array<Example>;
|
|
20
|
+
inputs: Array<Record<string, any>>;
|
|
21
|
+
outputs: Array<Record<string, any>>;
|
|
22
|
+
referenceOutputs?: Array<Record<string, any>>;
|
|
23
|
+
}) => EvaluationResult | EvaluationResults) | ((args: {
|
|
24
|
+
runs: Array<Run>;
|
|
25
|
+
examples: Array<Example>;
|
|
26
|
+
inputs: Array<Record<string, any>>;
|
|
27
|
+
outputs: Array<Record<string, any>>;
|
|
28
|
+
referenceOutputs?: Array<Record<string, any>>;
|
|
29
|
+
}) => Promise<EvaluationResult | EvaluationResults>);
|
|
30
|
+
/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
|
|
31
|
+
type DeprecatedRunEvaluator = RunEvaluator;
|
|
32
|
+
/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
|
|
33
|
+
type DeprecatedFunctionEvaluator = (run: Run, example?: Example) => EvaluationResult | EvaluationResults;
|
|
34
|
+
/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
|
|
35
|
+
type DeprecatedAsyncFunctionEvaluator = (run: Run, example?: Example) => Promise<EvaluationResult | EvaluationResults>;
|
|
36
|
+
export type EvaluatorT = DeprecatedRunEvaluator | DeprecatedFunctionEvaluator | DeprecatedAsyncFunctionEvaluator | ((args: {
|
|
37
|
+
run: Run;
|
|
38
|
+
example: Example;
|
|
39
|
+
inputs: Record<string, any>;
|
|
40
|
+
outputs: Record<string, any>;
|
|
41
|
+
referenceOutputs?: Record<string, any>;
|
|
42
|
+
}) => EvaluationResult | EvaluationResults) | ((args: {
|
|
43
|
+
run: Run;
|
|
44
|
+
example: Example;
|
|
45
|
+
inputs: Record<string, any>;
|
|
46
|
+
outputs: Record<string, any>;
|
|
47
|
+
referenceOutputs?: Record<string, any>;
|
|
48
|
+
}) => Promise<EvaluationResult | EvaluationResults>);
|
|
12
49
|
interface _ForwardResults {
|
|
13
50
|
run: Run;
|
|
14
51
|
example: Example;
|
|
@@ -25,22 +62,7 @@ interface _ExperimentManagerArgs {
|
|
|
25
62
|
numRepetitions?: number;
|
|
26
63
|
_runsArray?: Run[];
|
|
27
64
|
}
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* The dataset to evaluate on. Can be a dataset name, a list of
|
|
31
|
-
* examples, or a generator of examples.
|
|
32
|
-
*/
|
|
33
|
-
data: DataT;
|
|
34
|
-
/**
|
|
35
|
-
* A list of evaluators to run on each example.
|
|
36
|
-
* @default undefined
|
|
37
|
-
*/
|
|
38
|
-
evaluators?: Array<EvaluatorT>;
|
|
39
|
-
/**
|
|
40
|
-
* A list of summary evaluators to run on the entire dataset.
|
|
41
|
-
* @default undefined
|
|
42
|
-
*/
|
|
43
|
-
summaryEvaluators?: Array<SummaryEvaluatorT>;
|
|
65
|
+
type BaseEvaluateOptions = {
|
|
44
66
|
/**
|
|
45
67
|
* Metadata to attach to the experiment.
|
|
46
68
|
* @default undefined
|
|
@@ -71,12 +93,42 @@ export interface EvaluateOptions {
|
|
|
71
93
|
* @default 1
|
|
72
94
|
*/
|
|
73
95
|
numRepetitions?: number;
|
|
96
|
+
};
|
|
97
|
+
export interface EvaluateOptions extends BaseEvaluateOptions {
|
|
98
|
+
/**
|
|
99
|
+
* A list of evaluators to run on each example.
|
|
100
|
+
* @default undefined
|
|
101
|
+
*/
|
|
102
|
+
evaluators?: Array<EvaluatorT>;
|
|
103
|
+
/**
|
|
104
|
+
* A list of summary evaluators to run on the entire dataset.
|
|
105
|
+
* @default undefined
|
|
106
|
+
*/
|
|
107
|
+
summaryEvaluators?: Array<SummaryEvaluatorT>;
|
|
108
|
+
/**
|
|
109
|
+
* The dataset to evaluate on. Can be a dataset name, a list of
|
|
110
|
+
* examples, or a generator of examples.
|
|
111
|
+
*/
|
|
112
|
+
data: DataT;
|
|
74
113
|
}
|
|
75
|
-
export
|
|
76
|
-
/**
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
114
|
+
export interface ComparativeEvaluateOptions extends BaseEvaluateOptions {
|
|
115
|
+
/**
|
|
116
|
+
* A list of evaluators to run on each example.
|
|
117
|
+
*/
|
|
118
|
+
evaluators: Array<ComparativeEvaluator>;
|
|
119
|
+
/**
|
|
120
|
+
* Whether to load all child runs for the experiment.
|
|
121
|
+
* @default false
|
|
122
|
+
*/
|
|
123
|
+
loadNested?: boolean;
|
|
124
|
+
/**
|
|
125
|
+
* Randomize the order of outputs for each evaluation
|
|
126
|
+
* @default false
|
|
127
|
+
*/
|
|
128
|
+
randomizeOrder?: boolean;
|
|
129
|
+
}
|
|
130
|
+
export declare function evaluate(target: ComparativeTargetT, options: ComparativeEvaluateOptions): Promise<ComparisonEvaluationResults>;
|
|
131
|
+
export declare function evaluate(target: StandardTargetT, options: EvaluateOptions): Promise<ExperimentResults>;
|
|
80
132
|
export interface ExperimentResultRow {
|
|
81
133
|
run: Run;
|
|
82
134
|
example: Example;
|
|
@@ -114,7 +166,7 @@ export declare class _ExperimentManager {
|
|
|
114
166
|
_getProject(firstExample: Example): Promise<TracerSession>;
|
|
115
167
|
protected _printExperimentStart(): Promise<void>;
|
|
116
168
|
start(): Promise<_ExperimentManager>;
|
|
117
|
-
withPredictions(target:
|
|
169
|
+
withPredictions(target: StandardTargetT, options?: {
|
|
118
170
|
maxConcurrency?: number;
|
|
119
171
|
}): Promise<_ExperimentManager>;
|
|
120
172
|
withEvaluators(evaluators: Array<EvaluatorT | RunEvaluator>, options?: {
|
|
@@ -125,11 +177,11 @@ export declare class _ExperimentManager {
|
|
|
125
177
|
getSummaryScores(): Promise<EvaluationResults>;
|
|
126
178
|
/**
|
|
127
179
|
* Run the target function or runnable on the examples.
|
|
128
|
-
* @param {
|
|
180
|
+
* @param {StandardTargetT} target The target function or runnable to evaluate.
|
|
129
181
|
* @param options
|
|
130
182
|
* @returns {AsyncGenerator<_ForwardResults>} An async generator of the results.
|
|
131
183
|
*/
|
|
132
|
-
_predict(target:
|
|
184
|
+
_predict(target: StandardTargetT, options?: {
|
|
133
185
|
maxConcurrency?: number;
|
|
134
186
|
}): AsyncGenerator<_ForwardResults>;
|
|
135
187
|
_runEvaluators(evaluators: Array<RunEvaluator>, currentResults: ExperimentResultRow, fields: {
|
|
@@ -9,11 +9,9 @@ import { printErrorStackTrace } from "../utils/error.js";
|
|
|
9
9
|
import { randomName } from "./_random_name.js";
|
|
10
10
|
import { runEvaluator, } from "./evaluator.js";
|
|
11
11
|
import { v4 as uuidv4 } from "uuid";
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
*/
|
|
16
|
-
target, options) {
|
|
12
|
+
import { evaluateComparative, } from "./evaluate_comparative.js";
|
|
13
|
+
// Implementation signature
|
|
14
|
+
export function evaluate(target, options) {
|
|
17
15
|
return _evaluate(target, options);
|
|
18
16
|
}
|
|
19
17
|
/**
|
|
@@ -377,7 +375,7 @@ export class _ExperimentManager {
|
|
|
377
375
|
// Private methods
|
|
378
376
|
/**
|
|
379
377
|
* Run the target function or runnable on the examples.
|
|
380
|
-
* @param {
|
|
378
|
+
* @param {StandardTargetT} target The target function or runnable to evaluate.
|
|
381
379
|
* @param options
|
|
382
380
|
* @returns {AsyncGenerator<_ForwardResults>} An async generator of the results.
|
|
383
381
|
*/
|
|
@@ -625,12 +623,32 @@ class ExperimentResults {
|
|
|
625
623
|
}
|
|
626
624
|
}
|
|
627
625
|
async function _evaluate(target, fields) {
|
|
626
|
+
// Add check for comparative evaluation
|
|
627
|
+
if (Array.isArray(target)) {
|
|
628
|
+
const comparativeOptions = fields;
|
|
629
|
+
if (!comparativeOptions.evaluators) {
|
|
630
|
+
throw new Error("Evaluators are required for comparative evaluation");
|
|
631
|
+
}
|
|
632
|
+
return evaluateComparative(target, {
|
|
633
|
+
evaluators: comparativeOptions.evaluators,
|
|
634
|
+
client: comparativeOptions.client,
|
|
635
|
+
metadata: comparativeOptions.metadata,
|
|
636
|
+
experimentPrefix: comparativeOptions.experimentPrefix,
|
|
637
|
+
description: comparativeOptions.description,
|
|
638
|
+
maxConcurrency: comparativeOptions.maxConcurrency,
|
|
639
|
+
loadNested: comparativeOptions.loadNested ?? false,
|
|
640
|
+
randomizeOrder: comparativeOptions.randomizeOrder ?? false,
|
|
641
|
+
});
|
|
642
|
+
}
|
|
628
643
|
const client = fields.client ?? new Client();
|
|
629
644
|
const runs = _isCallable(target) ? null : target;
|
|
645
|
+
const standardFields = fields;
|
|
630
646
|
const [experiment_, newRuns] = await _resolveExperiment(fields.experiment ?? null, runs, client);
|
|
631
647
|
let manager = await new _ExperimentManager({
|
|
632
|
-
data: Array.isArray(
|
|
633
|
-
examples: Array.isArray(
|
|
648
|
+
data: Array.isArray(standardFields.data) ? undefined : standardFields.data,
|
|
649
|
+
examples: Array.isArray(standardFields.data)
|
|
650
|
+
? standardFields.data
|
|
651
|
+
: undefined,
|
|
634
652
|
client,
|
|
635
653
|
metadata: fields.metadata,
|
|
636
654
|
experiment: experiment_ ?? fields.experimentPrefix,
|
|
@@ -642,13 +660,13 @@ async function _evaluate(target, fields) {
|
|
|
642
660
|
maxConcurrency: fields.maxConcurrency,
|
|
643
661
|
});
|
|
644
662
|
}
|
|
645
|
-
if (
|
|
646
|
-
manager = await manager.withEvaluators(
|
|
663
|
+
if (standardFields.evaluators) {
|
|
664
|
+
manager = await manager.withEvaluators(standardFields.evaluators, {
|
|
647
665
|
maxConcurrency: fields.maxConcurrency,
|
|
648
666
|
});
|
|
649
667
|
}
|
|
650
|
-
if (
|
|
651
|
-
manager = await manager.withSummaryEvaluators(
|
|
668
|
+
if (standardFields.summaryEvaluators) {
|
|
669
|
+
manager = await manager.withSummaryEvaluators(standardFields.summaryEvaluators);
|
|
652
670
|
}
|
|
653
671
|
// Start consuming the results.
|
|
654
672
|
const results = new ExperimentResults(manager);
|
|
@@ -738,6 +756,20 @@ async function wrapSummaryEvaluators(evaluators, optionsArray) {
|
|
|
738
756
|
const evalName = evaluator.name || "BatchEvaluator";
|
|
739
757
|
const wrapperInner = (runs, examples) => {
|
|
740
758
|
const wrapperSuperInner = traceable((_runs_, _examples_) => {
|
|
759
|
+
// Check if the evaluator expects an object parameter
|
|
760
|
+
if (evaluator.length === 1) {
|
|
761
|
+
const inputs = examples.map((ex) => ex.inputs);
|
|
762
|
+
const outputs = runs.map((run) => run.outputs || {});
|
|
763
|
+
const referenceOutputs = examples.map((ex) => ex.outputs || {});
|
|
764
|
+
return Promise.resolve(evaluator({
|
|
765
|
+
runs,
|
|
766
|
+
examples,
|
|
767
|
+
inputs,
|
|
768
|
+
outputs,
|
|
769
|
+
referenceOutputs,
|
|
770
|
+
}));
|
|
771
|
+
}
|
|
772
|
+
// Otherwise use the traditional (runs, examples) signature
|
|
741
773
|
return Promise.resolve(evaluator(runs, examples));
|
|
742
774
|
}, { ...optionsArray, name: evalName });
|
|
743
775
|
return Promise.resolve(wrapperSuperInner(`Runs[] (Length=${runs.length})`, `Examples[] (Length=${examples.length})`));
|
|
@@ -162,7 +162,16 @@ async function evaluateComparative(experiments, options) {
|
|
|
162
162
|
const caller = new async_caller_js_1.AsyncCaller({ maxConcurrency: options.maxConcurrency });
|
|
163
163
|
async function evaluateAndSubmitFeedback(runs, example, evaluator) {
|
|
164
164
|
const expectedRunIds = new Set(runs.map((r) => r.id));
|
|
165
|
-
|
|
165
|
+
// Check if evaluator expects an object parameter
|
|
166
|
+
const result = evaluator.length === 1
|
|
167
|
+
? await evaluator({
|
|
168
|
+
runs: options.randomizeOrder ? (0, shuffle_js_1.shuffle)(runs) : runs,
|
|
169
|
+
example,
|
|
170
|
+
inputs: example.inputs,
|
|
171
|
+
outputs: runs.map((run) => run.outputs || {}),
|
|
172
|
+
referenceOutputs: example.outputs || {},
|
|
173
|
+
})
|
|
174
|
+
: await evaluator(runs, example);
|
|
166
175
|
for (const [runId, score] of Object.entries(result.scores)) {
|
|
167
176
|
// validate if the run id
|
|
168
177
|
if (!expectedRunIds.has(runId)) {
|
|
@@ -178,7 +187,15 @@ async function evaluateComparative(experiments, options) {
|
|
|
178
187
|
}
|
|
179
188
|
const tracedEvaluators = options.evaluators.map((evaluator) => (0, traceable_js_1.traceable)(async (runs, example) => {
|
|
180
189
|
const evaluatorRun = (0, traceable_js_1.getCurrentRunTree)();
|
|
181
|
-
const result =
|
|
190
|
+
const result = evaluator.length === 1
|
|
191
|
+
? await evaluator({
|
|
192
|
+
runs: options.randomizeOrder ? (0, shuffle_js_1.shuffle)(runs) : runs,
|
|
193
|
+
example,
|
|
194
|
+
inputs: example.inputs,
|
|
195
|
+
outputs: runs.map((run) => run.outputs || {}),
|
|
196
|
+
referenceOutputs: example.outputs || {},
|
|
197
|
+
})
|
|
198
|
+
: await evaluator(runs, example);
|
|
182
199
|
// sanitise the payload before sending to LangSmith
|
|
183
200
|
evaluatorRun.inputs = { runs: runs, example: example };
|
|
184
201
|
evaluatorRun.outputs = result;
|
|
@@ -2,11 +2,21 @@ import { Client } from "../index.js";
|
|
|
2
2
|
import { ComparisonEvaluationResult as ComparisonEvaluationResultRow, Example, Run } from "../schemas.js";
|
|
3
3
|
import { evaluate } from "./index.js";
|
|
4
4
|
type ExperimentResults = Awaited<ReturnType<typeof evaluate>>;
|
|
5
|
+
/** @deprecated Use ComparativeEvaluatorNew instead: (args: { runs, example, inputs, outputs, referenceOutputs }) => ... */
|
|
6
|
+
export type _ComparativeEvaluatorLegacy = (runs: Run[], example: Example) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;
|
|
7
|
+
export type _ComparativeEvaluator = (args: {
|
|
8
|
+
runs: Run[];
|
|
9
|
+
example: Example;
|
|
10
|
+
inputs: Record<string, any>;
|
|
11
|
+
outputs: Record<string, any>[];
|
|
12
|
+
referenceOutputs?: Record<string, any>;
|
|
13
|
+
}) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;
|
|
14
|
+
export type ComparativeEvaluator = _ComparativeEvaluatorLegacy | _ComparativeEvaluator;
|
|
5
15
|
export interface EvaluateComparativeOptions {
|
|
6
16
|
/**
|
|
7
17
|
* A list of evaluators to use for comparative evaluation.
|
|
8
18
|
*/
|
|
9
|
-
evaluators: Array<
|
|
19
|
+
evaluators: Array<ComparativeEvaluator>;
|
|
10
20
|
/**
|
|
11
21
|
* Randomize the order of outputs for each evaluation
|
|
12
22
|
* @default false
|
|
@@ -156,7 +156,16 @@ export async function evaluateComparative(experiments, options) {
|
|
|
156
156
|
const caller = new AsyncCaller({ maxConcurrency: options.maxConcurrency });
|
|
157
157
|
async function evaluateAndSubmitFeedback(runs, example, evaluator) {
|
|
158
158
|
const expectedRunIds = new Set(runs.map((r) => r.id));
|
|
159
|
-
|
|
159
|
+
// Check if evaluator expects an object parameter
|
|
160
|
+
const result = evaluator.length === 1
|
|
161
|
+
? await evaluator({
|
|
162
|
+
runs: options.randomizeOrder ? shuffle(runs) : runs,
|
|
163
|
+
example,
|
|
164
|
+
inputs: example.inputs,
|
|
165
|
+
outputs: runs.map((run) => run.outputs || {}),
|
|
166
|
+
referenceOutputs: example.outputs || {},
|
|
167
|
+
})
|
|
168
|
+
: await evaluator(runs, example);
|
|
160
169
|
for (const [runId, score] of Object.entries(result.scores)) {
|
|
161
170
|
// validate if the run id
|
|
162
171
|
if (!expectedRunIds.has(runId)) {
|
|
@@ -172,7 +181,15 @@ export async function evaluateComparative(experiments, options) {
|
|
|
172
181
|
}
|
|
173
182
|
const tracedEvaluators = options.evaluators.map((evaluator) => traceable(async (runs, example) => {
|
|
174
183
|
const evaluatorRun = getCurrentRunTree();
|
|
175
|
-
const result =
|
|
184
|
+
const result = evaluator.length === 1
|
|
185
|
+
? await evaluator({
|
|
186
|
+
runs: options.randomizeOrder ? shuffle(runs) : runs,
|
|
187
|
+
example,
|
|
188
|
+
inputs: example.inputs,
|
|
189
|
+
outputs: runs.map((run) => run.outputs || {}),
|
|
190
|
+
referenceOutputs: example.outputs || {},
|
|
191
|
+
})
|
|
192
|
+
: await evaluator(runs, example);
|
|
176
193
|
// sanitise the payload before sending to LangSmith
|
|
177
194
|
evaluatorRun.inputs = { runs: runs, example: example };
|
|
178
195
|
evaluatorRun.outputs = result;
|
|
@@ -16,7 +16,14 @@ class DynamicRunEvaluator {
|
|
|
16
16
|
});
|
|
17
17
|
this.func = ((input) => {
|
|
18
18
|
const { run, example } = input.langSmithRunAndExample;
|
|
19
|
-
return evaluator(
|
|
19
|
+
return evaluator({
|
|
20
|
+
...run,
|
|
21
|
+
run,
|
|
22
|
+
example,
|
|
23
|
+
inputs: example?.inputs,
|
|
24
|
+
outputs: run?.outputs,
|
|
25
|
+
referenceOutputs: example?.outputs,
|
|
26
|
+
}, example);
|
|
20
27
|
});
|
|
21
28
|
}
|
|
22
29
|
isEvaluationResults(x) {
|
|
@@ -72,7 +72,19 @@ export type EvaluationResults = {
|
|
|
72
72
|
export interface RunEvaluator {
|
|
73
73
|
evaluateRun(run: Run, example?: Example, options?: Partial<RunTreeConfig>): Promise<EvaluationResult | EvaluationResults>;
|
|
74
74
|
}
|
|
75
|
-
export type RunEvaluatorLike = ((run: Run, example?: Example) => Promise<EvaluationResult | EvaluationResults>) | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
|
|
75
|
+
export type RunEvaluatorLike = ((run: Run, example?: Example) => Promise<EvaluationResult | EvaluationResults>) | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) | ((run: Run, example: Example) => Promise<EvaluationResult | EvaluationResults>) | ((run: Run, example: Example) => EvaluationResult | EvaluationResults) | ((args: {
|
|
76
|
+
run: Run;
|
|
77
|
+
example: Example;
|
|
78
|
+
inputs: Record<string, any>;
|
|
79
|
+
outputs: Record<string, any>;
|
|
80
|
+
referenceOutputs?: Record<string, any>;
|
|
81
|
+
}) => EvaluationResult | EvaluationResults) | ((args: {
|
|
82
|
+
run: Run;
|
|
83
|
+
example: Example;
|
|
84
|
+
inputs: Record<string, any>;
|
|
85
|
+
outputs: Record<string, any>;
|
|
86
|
+
referenceOutputs?: Record<string, any>;
|
|
87
|
+
}) => Promise<EvaluationResult | EvaluationResults>);
|
|
76
88
|
/**
|
|
77
89
|
* Wraps an evaluator function + implements the RunEvaluator interface.
|
|
78
90
|
*/
|
|
@@ -13,7 +13,14 @@ export class DynamicRunEvaluator {
|
|
|
13
13
|
});
|
|
14
14
|
this.func = ((input) => {
|
|
15
15
|
const { run, example } = input.langSmithRunAndExample;
|
|
16
|
-
return evaluator(
|
|
16
|
+
return evaluator({
|
|
17
|
+
...run,
|
|
18
|
+
run,
|
|
19
|
+
example,
|
|
20
|
+
inputs: example?.inputs,
|
|
21
|
+
outputs: run?.outputs,
|
|
22
|
+
referenceOutputs: example?.outputs,
|
|
23
|
+
}, example);
|
|
17
24
|
});
|
|
18
25
|
}
|
|
19
26
|
isEvaluationResults(x) {
|
package/dist/index.cjs
CHANGED
|
@@ -8,4 +8,4 @@ Object.defineProperty(exports, "RunTree", { enumerable: true, get: function () {
|
|
|
8
8
|
var fetch_js_1 = require("./singletons/fetch.cjs");
|
|
9
9
|
Object.defineProperty(exports, "overrideFetchImplementation", { enumerable: true, get: function () { return fetch_js_1.overrideFetchImplementation; } });
|
|
10
10
|
// Update using yarn bump-version
|
|
11
|
-
exports.__version__ = "0.2.8";
|
|
11
|
+
exports.__version__ = "0.2.9";
|
package/dist/index.d.ts
CHANGED
|
@@ -2,4 +2,4 @@ export { Client, type ClientConfig, type LangSmithTracingClientInterface, } from
|
|
|
2
2
|
export type { Dataset, Example, TracerSession, Run, Feedback, RetrieverOutput, } from "./schemas.js";
|
|
3
3
|
export { RunTree, type RunTreeConfig } from "./run_trees.js";
|
|
4
4
|
export { overrideFetchImplementation } from "./singletons/fetch.js";
|
|
5
|
-
export declare const __version__ = "0.2.8";
|
|
5
|
+
export declare const __version__ = "0.2.9";
|
package/dist/index.js
CHANGED