braintrust 0.0.147 → 0.0.149
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.d.mts +4 -1
- package/dist/browser.d.ts +4 -1
- package/dist/browser.js +42 -6
- package/dist/browser.mjs +41 -6
- package/dist/cli.js +41 -10
- package/dist/index.d.mts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +45 -8
- package/dist/index.mjs +44 -8
- package/package.json +2 -2
package/dist/browser.d.mts
CHANGED
|
@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
290
290
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
291
291
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
292
292
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
293
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
293
294
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
294
295
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
295
296
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -625,6 +626,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
|
|
|
625
626
|
* Mainly for internal use. Return the parent object for starting a span in a global context.
|
|
626
627
|
*/
|
|
627
628
|
declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
|
|
629
|
+
declare function logError(span: Span, error: unknown): void;
|
|
628
630
|
/**
|
|
629
631
|
* Toplevel function for starting a span. It checks the following (in precedence order):
|
|
630
632
|
* * Currently-active span
|
|
@@ -743,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
743
745
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
744
746
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
745
747
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
748
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
746
749
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
747
750
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
748
751
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1248,4 +1251,4 @@ declare const LEGACY_CACHED_HEADER = "x-cached";
|
|
|
1248
1251
|
declare const X_CACHED_HEADER = "x-bt-cached";
|
|
1249
1252
|
declare function parseCachedHeader(value: string | null | undefined): number | undefined;
|
|
1250
1253
|
|
|
1251
|
-
export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
|
1254
|
+
export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
package/dist/browser.d.ts
CHANGED
|
@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
290
290
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
291
291
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
292
292
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
293
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
293
294
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
294
295
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
295
296
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -625,6 +626,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
|
|
|
625
626
|
* Mainly for internal use. Return the parent object for starting a span in a global context.
|
|
626
627
|
*/
|
|
627
628
|
declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
|
|
629
|
+
declare function logError(span: Span, error: unknown): void;
|
|
628
630
|
/**
|
|
629
631
|
* Toplevel function for starting a span. It checks the following (in precedence order):
|
|
630
632
|
* * Currently-active span
|
|
@@ -743,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
743
745
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
744
746
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
745
747
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
748
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
746
749
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
747
750
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
748
751
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1248,4 +1251,4 @@ declare const LEGACY_CACHED_HEADER = "x-cached";
|
|
|
1248
1251
|
declare const X_CACHED_HEADER = "x-bt-cached";
|
|
1249
1252
|
declare function parseCachedHeader(value: string | null | undefined): number | undefined;
|
|
1250
1253
|
|
|
1251
|
-
export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
|
1254
|
+
export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
package/dist/browser.js
CHANGED
|
@@ -59,6 +59,7 @@ __export(browser_exports, {
|
|
|
59
59
|
invoke: () => invoke,
|
|
60
60
|
loadPrompt: () => loadPrompt,
|
|
61
61
|
log: () => log,
|
|
62
|
+
logError: () => logError,
|
|
62
63
|
login: () => login,
|
|
63
64
|
loginToState: () => loginToState,
|
|
64
65
|
newId: () => newId,
|
|
@@ -109,16 +110,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
|
|
|
109
110
|
|
|
110
111
|
// src/util.ts
|
|
111
112
|
var GLOBAL_PROJECT = "Global";
|
|
112
|
-
function
|
|
113
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
113
114
|
let runSyncCleanup = true;
|
|
114
115
|
try {
|
|
115
116
|
const ret = f();
|
|
116
117
|
if (ret instanceof Promise) {
|
|
117
118
|
runSyncCleanup = false;
|
|
118
|
-
return ret.finally(finallyF);
|
|
119
|
+
return ret.catch(catchF).finally(finallyF);
|
|
119
120
|
} else {
|
|
120
121
|
return ret;
|
|
121
122
|
}
|
|
123
|
+
} catch (e) {
|
|
124
|
+
return catchF(e);
|
|
122
125
|
} finally {
|
|
123
126
|
if (runSyncCleanup) {
|
|
124
127
|
finallyF();
|
|
@@ -918,6 +921,7 @@ var Logger = class {
|
|
|
918
921
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
919
922
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
920
923
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
924
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
921
925
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
922
926
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
923
927
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -952,7 +956,7 @@ var Logger = class {
|
|
|
952
956
|
traced(callback, args) {
|
|
953
957
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
954
958
|
const span = this.startSpan(argsRest);
|
|
955
|
-
const ret =
|
|
959
|
+
const ret = runCatchFinally(
|
|
956
960
|
() => {
|
|
957
961
|
if (setCurrent ?? true) {
|
|
958
962
|
return withCurrent(span, callback);
|
|
@@ -960,6 +964,10 @@ var Logger = class {
|
|
|
960
964
|
return callback(span);
|
|
961
965
|
}
|
|
962
966
|
},
|
|
967
|
+
(e) => {
|
|
968
|
+
logError(span, e);
|
|
969
|
+
throw e;
|
|
970
|
+
},
|
|
963
971
|
() => span.end()
|
|
964
972
|
);
|
|
965
973
|
if (this.asyncFlush) {
|
|
@@ -1893,9 +1901,22 @@ function getSpanParentObject(options) {
|
|
|
1893
1901
|
}
|
|
1894
1902
|
return NOOP_SPAN;
|
|
1895
1903
|
}
|
|
1904
|
+
function logError(span, error) {
|
|
1905
|
+
let errorMessage = "<error>";
|
|
1906
|
+
let stackTrace = "";
|
|
1907
|
+
if (error instanceof Error) {
|
|
1908
|
+
errorMessage = error.message;
|
|
1909
|
+
stackTrace = error.stack || "";
|
|
1910
|
+
} else {
|
|
1911
|
+
errorMessage = String(error);
|
|
1912
|
+
}
|
|
1913
|
+
span.log({ error: `${errorMessage}
|
|
1914
|
+
|
|
1915
|
+
${stackTrace}` });
|
|
1916
|
+
}
|
|
1896
1917
|
function traced(callback, args) {
|
|
1897
1918
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
1898
|
-
const ret =
|
|
1919
|
+
const ret = runCatchFinally(
|
|
1899
1920
|
() => {
|
|
1900
1921
|
if (args?.setCurrent ?? true) {
|
|
1901
1922
|
return withCurrent(span, callback);
|
|
@@ -1903,6 +1924,10 @@ function traced(callback, args) {
|
|
|
1903
1924
|
return callback(span);
|
|
1904
1925
|
}
|
|
1905
1926
|
},
|
|
1927
|
+
(e) => {
|
|
1928
|
+
logError(span, e);
|
|
1929
|
+
throw e;
|
|
1930
|
+
},
|
|
1906
1931
|
() => span.end()
|
|
1907
1932
|
);
|
|
1908
1933
|
if (args?.asyncFlush) {
|
|
@@ -2218,6 +2243,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2218
2243
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2219
2244
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2220
2245
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2246
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2221
2247
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2222
2248
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2223
2249
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2247,7 +2273,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2247
2273
|
traced(callback, args) {
|
|
2248
2274
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2249
2275
|
const span = this.startSpan(argsRest);
|
|
2250
|
-
|
|
2276
|
+
const ret = runCatchFinally(
|
|
2251
2277
|
() => {
|
|
2252
2278
|
if (setCurrent ?? true) {
|
|
2253
2279
|
return withCurrent(span, callback);
|
|
@@ -2255,8 +2281,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2255
2281
|
return callback(span);
|
|
2256
2282
|
}
|
|
2257
2283
|
},
|
|
2284
|
+
(e) => {
|
|
2285
|
+
logError(span, e);
|
|
2286
|
+
throw e;
|
|
2287
|
+
},
|
|
2258
2288
|
() => span.end()
|
|
2259
2289
|
);
|
|
2290
|
+
return ret;
|
|
2260
2291
|
}
|
|
2261
2292
|
/**
|
|
2262
2293
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2597,7 +2628,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2597
2628
|
traced(callback, args) {
|
|
2598
2629
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2599
2630
|
const span = this.startSpan(argsRest);
|
|
2600
|
-
return
|
|
2631
|
+
return runCatchFinally(
|
|
2601
2632
|
() => {
|
|
2602
2633
|
if (setCurrent ?? true) {
|
|
2603
2634
|
return withCurrent(span, callback);
|
|
@@ -2605,6 +2636,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2605
2636
|
return callback(span);
|
|
2606
2637
|
}
|
|
2607
2638
|
},
|
|
2639
|
+
(e) => {
|
|
2640
|
+
logError(span, e);
|
|
2641
|
+
throw e;
|
|
2642
|
+
},
|
|
2608
2643
|
() => span.end()
|
|
2609
2644
|
);
|
|
2610
2645
|
}
|
|
@@ -3424,6 +3459,7 @@ configureBrowser();
|
|
|
3424
3459
|
invoke,
|
|
3425
3460
|
loadPrompt,
|
|
3426
3461
|
log,
|
|
3462
|
+
logError,
|
|
3427
3463
|
login,
|
|
3428
3464
|
loginToState,
|
|
3429
3465
|
newId,
|
package/dist/browser.mjs
CHANGED
|
@@ -53,16 +53,18 @@ import {
|
|
|
53
53
|
|
|
54
54
|
// src/util.ts
|
|
55
55
|
var GLOBAL_PROJECT = "Global";
|
|
56
|
-
function
|
|
56
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
57
57
|
let runSyncCleanup = true;
|
|
58
58
|
try {
|
|
59
59
|
const ret = f();
|
|
60
60
|
if (ret instanceof Promise) {
|
|
61
61
|
runSyncCleanup = false;
|
|
62
|
-
return ret.finally(finallyF);
|
|
62
|
+
return ret.catch(catchF).finally(finallyF);
|
|
63
63
|
} else {
|
|
64
64
|
return ret;
|
|
65
65
|
}
|
|
66
|
+
} catch (e) {
|
|
67
|
+
return catchF(e);
|
|
66
68
|
} finally {
|
|
67
69
|
if (runSyncCleanup) {
|
|
68
70
|
finallyF();
|
|
@@ -864,6 +866,7 @@ var Logger = class {
|
|
|
864
866
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
865
867
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
866
868
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
869
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
867
870
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
868
871
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
869
872
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -898,7 +901,7 @@ var Logger = class {
|
|
|
898
901
|
traced(callback, args) {
|
|
899
902
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
900
903
|
const span = this.startSpan(argsRest);
|
|
901
|
-
const ret =
|
|
904
|
+
const ret = runCatchFinally(
|
|
902
905
|
() => {
|
|
903
906
|
if (setCurrent ?? true) {
|
|
904
907
|
return withCurrent(span, callback);
|
|
@@ -906,6 +909,10 @@ var Logger = class {
|
|
|
906
909
|
return callback(span);
|
|
907
910
|
}
|
|
908
911
|
},
|
|
912
|
+
(e) => {
|
|
913
|
+
logError(span, e);
|
|
914
|
+
throw e;
|
|
915
|
+
},
|
|
909
916
|
() => span.end()
|
|
910
917
|
);
|
|
911
918
|
if (this.asyncFlush) {
|
|
@@ -1839,9 +1846,22 @@ function getSpanParentObject(options) {
|
|
|
1839
1846
|
}
|
|
1840
1847
|
return NOOP_SPAN;
|
|
1841
1848
|
}
|
|
1849
|
+
function logError(span, error) {
|
|
1850
|
+
let errorMessage = "<error>";
|
|
1851
|
+
let stackTrace = "";
|
|
1852
|
+
if (error instanceof Error) {
|
|
1853
|
+
errorMessage = error.message;
|
|
1854
|
+
stackTrace = error.stack || "";
|
|
1855
|
+
} else {
|
|
1856
|
+
errorMessage = String(error);
|
|
1857
|
+
}
|
|
1858
|
+
span.log({ error: `${errorMessage}
|
|
1859
|
+
|
|
1860
|
+
${stackTrace}` });
|
|
1861
|
+
}
|
|
1842
1862
|
function traced(callback, args) {
|
|
1843
1863
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
1844
|
-
const ret =
|
|
1864
|
+
const ret = runCatchFinally(
|
|
1845
1865
|
() => {
|
|
1846
1866
|
if (args?.setCurrent ?? true) {
|
|
1847
1867
|
return withCurrent(span, callback);
|
|
@@ -1849,6 +1869,10 @@ function traced(callback, args) {
|
|
|
1849
1869
|
return callback(span);
|
|
1850
1870
|
}
|
|
1851
1871
|
},
|
|
1872
|
+
(e) => {
|
|
1873
|
+
logError(span, e);
|
|
1874
|
+
throw e;
|
|
1875
|
+
},
|
|
1852
1876
|
() => span.end()
|
|
1853
1877
|
);
|
|
1854
1878
|
if (args?.asyncFlush) {
|
|
@@ -2164,6 +2188,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2164
2188
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2165
2189
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2166
2190
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2191
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2167
2192
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2168
2193
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2169
2194
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2193,7 +2218,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2193
2218
|
traced(callback, args) {
|
|
2194
2219
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2195
2220
|
const span = this.startSpan(argsRest);
|
|
2196
|
-
|
|
2221
|
+
const ret = runCatchFinally(
|
|
2197
2222
|
() => {
|
|
2198
2223
|
if (setCurrent ?? true) {
|
|
2199
2224
|
return withCurrent(span, callback);
|
|
@@ -2201,8 +2226,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2201
2226
|
return callback(span);
|
|
2202
2227
|
}
|
|
2203
2228
|
},
|
|
2229
|
+
(e) => {
|
|
2230
|
+
logError(span, e);
|
|
2231
|
+
throw e;
|
|
2232
|
+
},
|
|
2204
2233
|
() => span.end()
|
|
2205
2234
|
);
|
|
2235
|
+
return ret;
|
|
2206
2236
|
}
|
|
2207
2237
|
/**
|
|
2208
2238
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2543,7 +2573,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2543
2573
|
traced(callback, args) {
|
|
2544
2574
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2545
2575
|
const span = this.startSpan(argsRest);
|
|
2546
|
-
return
|
|
2576
|
+
return runCatchFinally(
|
|
2547
2577
|
() => {
|
|
2548
2578
|
if (setCurrent ?? true) {
|
|
2549
2579
|
return withCurrent(span, callback);
|
|
@@ -2551,6 +2581,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2551
2581
|
return callback(span);
|
|
2552
2582
|
}
|
|
2553
2583
|
},
|
|
2584
|
+
(e) => {
|
|
2585
|
+
logError(span, e);
|
|
2586
|
+
throw e;
|
|
2587
|
+
},
|
|
2554
2588
|
() => span.end()
|
|
2555
2589
|
);
|
|
2556
2590
|
}
|
|
@@ -3371,6 +3405,7 @@ export {
|
|
|
3371
3405
|
invoke,
|
|
3372
3406
|
loadPrompt,
|
|
3373
3407
|
log,
|
|
3408
|
+
logError,
|
|
3374
3409
|
login,
|
|
3375
3410
|
loginToState,
|
|
3376
3411
|
newId,
|
package/dist/cli.js
CHANGED
|
@@ -1232,7 +1232,7 @@ var require_package = __commonJS({
|
|
|
1232
1232
|
"package.json"(exports2, module2) {
|
|
1233
1233
|
module2.exports = {
|
|
1234
1234
|
name: "braintrust",
|
|
1235
|
-
version: "0.0.
|
|
1235
|
+
version: "0.0.149",
|
|
1236
1236
|
description: "SDK for integrating Braintrust",
|
|
1237
1237
|
repository: {
|
|
1238
1238
|
type: "git",
|
|
@@ -1302,7 +1302,7 @@ var require_package = __commonJS({
|
|
|
1302
1302
|
},
|
|
1303
1303
|
dependencies: {
|
|
1304
1304
|
"@ai-sdk/provider": "^0.0.11",
|
|
1305
|
-
"@braintrust/core": "0.0.
|
|
1305
|
+
"@braintrust/core": "0.0.50",
|
|
1306
1306
|
"@next/env": "^14.2.3",
|
|
1307
1307
|
"@vercel/functions": "^1.0.2",
|
|
1308
1308
|
ai: "^3.2.16",
|
|
@@ -1373,16 +1373,18 @@ var isomorph_default = iso;
|
|
|
1373
1373
|
|
|
1374
1374
|
// src/util.ts
|
|
1375
1375
|
var GLOBAL_PROJECT = "Global";
|
|
1376
|
-
function
|
|
1376
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
1377
1377
|
let runSyncCleanup = true;
|
|
1378
1378
|
try {
|
|
1379
1379
|
const ret = f();
|
|
1380
1380
|
if (ret instanceof Promise) {
|
|
1381
1381
|
runSyncCleanup = false;
|
|
1382
|
-
return ret.finally(finallyF);
|
|
1382
|
+
return ret.catch(catchF).finally(finallyF);
|
|
1383
1383
|
} else {
|
|
1384
1384
|
return ret;
|
|
1385
1385
|
}
|
|
1386
|
+
} catch (e) {
|
|
1387
|
+
return catchF(e);
|
|
1386
1388
|
} finally {
|
|
1387
1389
|
if (runSyncCleanup) {
|
|
1388
1390
|
finallyF();
|
|
@@ -2164,6 +2166,7 @@ var Logger = class {
|
|
|
2164
2166
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
2165
2167
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2166
2168
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2169
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2167
2170
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
2168
2171
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2169
2172
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2198,7 +2201,7 @@ var Logger = class {
|
|
|
2198
2201
|
traced(callback, args) {
|
|
2199
2202
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2200
2203
|
const span = this.startSpan(argsRest);
|
|
2201
|
-
const ret =
|
|
2204
|
+
const ret = runCatchFinally(
|
|
2202
2205
|
() => {
|
|
2203
2206
|
if (setCurrent ?? true) {
|
|
2204
2207
|
return withCurrent(span, callback);
|
|
@@ -2206,6 +2209,10 @@ var Logger = class {
|
|
|
2206
2209
|
return callback(span);
|
|
2207
2210
|
}
|
|
2208
2211
|
},
|
|
2212
|
+
(e) => {
|
|
2213
|
+
logError(span, e);
|
|
2214
|
+
throw e;
|
|
2215
|
+
},
|
|
2209
2216
|
() => span.end()
|
|
2210
2217
|
);
|
|
2211
2218
|
if (this.asyncFlush) {
|
|
@@ -2899,6 +2906,19 @@ async function loginToState(options = {}) {
|
|
|
2899
2906
|
state.loginReplaceApiConn(conn);
|
|
2900
2907
|
return state;
|
|
2901
2908
|
}
|
|
2909
|
+
function logError(span, error2) {
|
|
2910
|
+
let errorMessage = "<error>";
|
|
2911
|
+
let stackTrace = "";
|
|
2912
|
+
if (error2 instanceof Error) {
|
|
2913
|
+
errorMessage = error2.message;
|
|
2914
|
+
stackTrace = error2.stack || "";
|
|
2915
|
+
} else {
|
|
2916
|
+
errorMessage = String(error2);
|
|
2917
|
+
}
|
|
2918
|
+
span.log({ error: `${errorMessage}
|
|
2919
|
+
|
|
2920
|
+
${stackTrace}` });
|
|
2921
|
+
}
|
|
2902
2922
|
function withCurrent(span, callback, state = _globalState) {
|
|
2903
2923
|
return state.currentSpan.run(span, () => callback(span));
|
|
2904
2924
|
}
|
|
@@ -3113,6 +3133,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
3113
3133
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
3114
3134
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
3115
3135
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
3136
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
3116
3137
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
3117
3138
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
3118
3139
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -3142,7 +3163,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
3142
3163
|
traced(callback, args) {
|
|
3143
3164
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
3144
3165
|
const span = this.startSpan(argsRest);
|
|
3145
|
-
|
|
3166
|
+
const ret = runCatchFinally(
|
|
3146
3167
|
() => {
|
|
3147
3168
|
if (setCurrent ?? true) {
|
|
3148
3169
|
return withCurrent(span, callback);
|
|
@@ -3150,8 +3171,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
3150
3171
|
return callback(span);
|
|
3151
3172
|
}
|
|
3152
3173
|
},
|
|
3174
|
+
(e) => {
|
|
3175
|
+
logError(span, e);
|
|
3176
|
+
throw e;
|
|
3177
|
+
},
|
|
3153
3178
|
() => span.end()
|
|
3154
3179
|
);
|
|
3180
|
+
return ret;
|
|
3155
3181
|
}
|
|
3156
3182
|
/**
|
|
3157
3183
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -3489,7 +3515,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
3489
3515
|
traced(callback, args) {
|
|
3490
3516
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
3491
3517
|
const span = this.startSpan(argsRest);
|
|
3492
|
-
return
|
|
3518
|
+
return runCatchFinally(
|
|
3493
3519
|
() => {
|
|
3494
3520
|
if (setCurrent ?? true) {
|
|
3495
3521
|
return withCurrent(span, callback);
|
|
@@ -3497,6 +3523,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
3497
3523
|
return callback(span);
|
|
3498
3524
|
}
|
|
3499
3525
|
},
|
|
3526
|
+
(e) => {
|
|
3527
|
+
logError(span, e);
|
|
3528
|
+
throw e;
|
|
3529
|
+
},
|
|
3500
3530
|
() => span.end()
|
|
3501
3531
|
);
|
|
3502
3532
|
}
|
|
@@ -5234,6 +5264,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
5234
5264
|
);
|
|
5235
5265
|
}
|
|
5236
5266
|
} catch (e) {
|
|
5267
|
+
logError(rootSpan, e);
|
|
5237
5268
|
error2 = e;
|
|
5238
5269
|
} finally {
|
|
5239
5270
|
progressReporter.increment(evaluator.evalName);
|
|
@@ -5273,7 +5304,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
5273
5304
|
}
|
|
5274
5305
|
var error = import_chalk.default.bold.red;
|
|
5275
5306
|
var warning = import_chalk.default.hex("#FFA500");
|
|
5276
|
-
function
|
|
5307
|
+
function logError2(e, verbose) {
|
|
5277
5308
|
if (!verbose) {
|
|
5278
5309
|
console.error(`${e}`);
|
|
5279
5310
|
} else {
|
|
@@ -5329,7 +5360,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
|
|
|
5329
5360
|
);
|
|
5330
5361
|
} else {
|
|
5331
5362
|
for (const result of failingResults) {
|
|
5332
|
-
|
|
5363
|
+
logError2(result.error, verbose);
|
|
5333
5364
|
}
|
|
5334
5365
|
}
|
|
5335
5366
|
if (!verbose && !jsonl) {
|
|
@@ -6397,7 +6428,7 @@ async function main() {
|
|
|
6397
6428
|
try {
|
|
6398
6429
|
await parsed.func(parsed);
|
|
6399
6430
|
} catch (e) {
|
|
6400
|
-
|
|
6431
|
+
logError2(e, parsed.verbose);
|
|
6401
6432
|
process.exit(1);
|
|
6402
6433
|
}
|
|
6403
6434
|
}
|
package/dist/index.d.mts
CHANGED
|
@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
291
291
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
292
292
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
293
293
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
294
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
294
295
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
295
296
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
296
297
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -626,6 +627,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
|
|
|
626
627
|
* Mainly for internal use. Return the parent object for starting a span in a global context.
|
|
627
628
|
*/
|
|
628
629
|
declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
|
|
630
|
+
declare function logError(span: Span, error: unknown): void;
|
|
629
631
|
/**
|
|
630
632
|
* Toplevel function for starting a span. It checks the following (in precedence order):
|
|
631
633
|
* * Currently-active span
|
|
@@ -744,6 +746,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
744
746
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
745
747
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
746
748
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
749
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
747
750
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
748
751
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
749
752
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1415,4 +1418,4 @@ declare function parseCachedHeader(value: string | null | undefined): number | u
|
|
|
1415
1418
|
*/
|
|
1416
1419
|
declare function wrapAISDKModel<T extends object>(model: T): T;
|
|
1417
1420
|
|
|
1418
|
-
export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
|
1421
|
+
export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
package/dist/index.d.ts
CHANGED
|
@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
291
291
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
292
292
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
293
293
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
294
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
294
295
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
295
296
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
296
297
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -626,6 +627,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
|
|
|
626
627
|
* Mainly for internal use. Return the parent object for starting a span in a global context.
|
|
627
628
|
*/
|
|
628
629
|
declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
|
|
630
|
+
declare function logError(span: Span, error: unknown): void;
|
|
629
631
|
/**
|
|
630
632
|
* Toplevel function for starting a span. It checks the following (in precedence order):
|
|
631
633
|
* * Currently-active span
|
|
@@ -744,6 +746,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
744
746
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
745
747
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
746
748
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
749
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
747
750
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
748
751
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
749
752
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1415,4 +1418,4 @@ declare function parseCachedHeader(value: string | null | undefined): number | u
|
|
|
1415
1418
|
*/
|
|
1416
1419
|
declare function wrapAISDKModel<T extends object>(model: T): T;
|
|
1417
1420
|
|
|
1418
|
-
export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
|
1421
|
+
export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
|
package/dist/index.js
CHANGED
|
@@ -64,6 +64,7 @@ __export(src_exports, {
|
|
|
64
64
|
invoke: () => invoke,
|
|
65
65
|
loadPrompt: () => loadPrompt,
|
|
66
66
|
log: () => log,
|
|
67
|
+
logError: () => logError,
|
|
67
68
|
login: () => login,
|
|
68
69
|
loginToState: () => loginToState,
|
|
69
70
|
newId: () => newId,
|
|
@@ -335,16 +336,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
|
|
|
335
336
|
|
|
336
337
|
// src/util.ts
|
|
337
338
|
var GLOBAL_PROJECT = "Global";
|
|
338
|
-
function
|
|
339
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
339
340
|
let runSyncCleanup = true;
|
|
340
341
|
try {
|
|
341
342
|
const ret = f();
|
|
342
343
|
if (ret instanceof Promise) {
|
|
343
344
|
runSyncCleanup = false;
|
|
344
|
-
return ret.finally(finallyF);
|
|
345
|
+
return ret.catch(catchF).finally(finallyF);
|
|
345
346
|
} else {
|
|
346
347
|
return ret;
|
|
347
348
|
}
|
|
349
|
+
} catch (e) {
|
|
350
|
+
return catchF(e);
|
|
348
351
|
} finally {
|
|
349
352
|
if (runSyncCleanup) {
|
|
350
353
|
finallyF();
|
|
@@ -1144,6 +1147,7 @@ var Logger = class {
|
|
|
1144
1147
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
1145
1148
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
1146
1149
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
1150
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
1147
1151
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
1148
1152
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
1149
1153
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1178,7 +1182,7 @@ var Logger = class {
|
|
|
1178
1182
|
traced(callback, args) {
|
|
1179
1183
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
1180
1184
|
const span = this.startSpan(argsRest);
|
|
1181
|
-
const ret =
|
|
1185
|
+
const ret = runCatchFinally(
|
|
1182
1186
|
() => {
|
|
1183
1187
|
if (setCurrent ?? true) {
|
|
1184
1188
|
return withCurrent(span, callback);
|
|
@@ -1186,6 +1190,10 @@ var Logger = class {
|
|
|
1186
1190
|
return callback(span);
|
|
1187
1191
|
}
|
|
1188
1192
|
},
|
|
1193
|
+
(e) => {
|
|
1194
|
+
logError(span, e);
|
|
1195
|
+
throw e;
|
|
1196
|
+
},
|
|
1189
1197
|
() => span.end()
|
|
1190
1198
|
);
|
|
1191
1199
|
if (this.asyncFlush) {
|
|
@@ -2119,9 +2127,22 @@ function getSpanParentObject(options) {
|
|
|
2119
2127
|
}
|
|
2120
2128
|
return NOOP_SPAN;
|
|
2121
2129
|
}
|
|
2130
|
+
function logError(span, error2) {
|
|
2131
|
+
let errorMessage = "<error>";
|
|
2132
|
+
let stackTrace = "";
|
|
2133
|
+
if (error2 instanceof Error) {
|
|
2134
|
+
errorMessage = error2.message;
|
|
2135
|
+
stackTrace = error2.stack || "";
|
|
2136
|
+
} else {
|
|
2137
|
+
errorMessage = String(error2);
|
|
2138
|
+
}
|
|
2139
|
+
span.log({ error: `${errorMessage}
|
|
2140
|
+
|
|
2141
|
+
${stackTrace}` });
|
|
2142
|
+
}
|
|
2122
2143
|
function traced(callback, args) {
|
|
2123
2144
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
2124
|
-
const ret =
|
|
2145
|
+
const ret = runCatchFinally(
|
|
2125
2146
|
() => {
|
|
2126
2147
|
if (args?.setCurrent ?? true) {
|
|
2127
2148
|
return withCurrent(span, callback);
|
|
@@ -2129,6 +2150,10 @@ function traced(callback, args) {
|
|
|
2129
2150
|
return callback(span);
|
|
2130
2151
|
}
|
|
2131
2152
|
},
|
|
2153
|
+
(e) => {
|
|
2154
|
+
logError(span, e);
|
|
2155
|
+
throw e;
|
|
2156
|
+
},
|
|
2132
2157
|
() => span.end()
|
|
2133
2158
|
);
|
|
2134
2159
|
if (args?.asyncFlush) {
|
|
@@ -2444,6 +2469,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2444
2469
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2445
2470
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2446
2471
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2472
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2447
2473
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2448
2474
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2449
2475
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2473,7 +2499,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2473
2499
|
traced(callback, args) {
|
|
2474
2500
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2475
2501
|
const span = this.startSpan(argsRest);
|
|
2476
|
-
|
|
2502
|
+
const ret = runCatchFinally(
|
|
2477
2503
|
() => {
|
|
2478
2504
|
if (setCurrent ?? true) {
|
|
2479
2505
|
return withCurrent(span, callback);
|
|
@@ -2481,8 +2507,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2481
2507
|
return callback(span);
|
|
2482
2508
|
}
|
|
2483
2509
|
},
|
|
2510
|
+
(e) => {
|
|
2511
|
+
logError(span, e);
|
|
2512
|
+
throw e;
|
|
2513
|
+
},
|
|
2484
2514
|
() => span.end()
|
|
2485
2515
|
);
|
|
2516
|
+
return ret;
|
|
2486
2517
|
}
|
|
2487
2518
|
/**
|
|
2488
2519
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2823,7 +2854,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2823
2854
|
traced(callback, args) {
|
|
2824
2855
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2825
2856
|
const span = this.startSpan(argsRest);
|
|
2826
|
-
return
|
|
2857
|
+
return runCatchFinally(
|
|
2827
2858
|
() => {
|
|
2828
2859
|
if (setCurrent ?? true) {
|
|
2829
2860
|
return withCurrent(span, callback);
|
|
@@ -2831,6 +2862,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2831
2862
|
return callback(span);
|
|
2832
2863
|
}
|
|
2833
2864
|
},
|
|
2865
|
+
(e) => {
|
|
2866
|
+
logError(span, e);
|
|
2867
|
+
throw e;
|
|
2868
|
+
},
|
|
2834
2869
|
() => span.end()
|
|
2835
2870
|
);
|
|
2836
2871
|
}
|
|
@@ -4745,6 +4780,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
4745
4780
|
);
|
|
4746
4781
|
}
|
|
4747
4782
|
} catch (e) {
|
|
4783
|
+
logError(rootSpan, e);
|
|
4748
4784
|
error2 = e;
|
|
4749
4785
|
} finally {
|
|
4750
4786
|
progressReporter.increment(evaluator.evalName);
|
|
@@ -4784,7 +4820,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
4784
4820
|
}
|
|
4785
4821
|
var error = import_chalk.default.bold.red;
|
|
4786
4822
|
var warning = import_chalk.default.hex("#FFA500");
|
|
4787
|
-
function
|
|
4823
|
+
function logError2(e, verbose) {
|
|
4788
4824
|
if (!verbose) {
|
|
4789
4825
|
console.error(`${e}`);
|
|
4790
4826
|
} else {
|
|
@@ -4840,7 +4876,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
|
|
|
4840
4876
|
);
|
|
4841
4877
|
} else {
|
|
4842
4878
|
for (const result of failingResults) {
|
|
4843
|
-
|
|
4879
|
+
logError2(result.error, verbose);
|
|
4844
4880
|
}
|
|
4845
4881
|
}
|
|
4846
4882
|
if (!verbose && !jsonl) {
|
|
@@ -5465,6 +5501,7 @@ configureNode();
|
|
|
5465
5501
|
invoke,
|
|
5466
5502
|
loadPrompt,
|
|
5467
5503
|
log,
|
|
5504
|
+
logError,
|
|
5468
5505
|
login,
|
|
5469
5506
|
loginToState,
|
|
5470
5507
|
newId,
|
package/dist/index.mjs
CHANGED
|
@@ -272,16 +272,18 @@ import {
|
|
|
272
272
|
|
|
273
273
|
// src/util.ts
|
|
274
274
|
var GLOBAL_PROJECT = "Global";
|
|
275
|
-
function
|
|
275
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
276
276
|
let runSyncCleanup = true;
|
|
277
277
|
try {
|
|
278
278
|
const ret = f();
|
|
279
279
|
if (ret instanceof Promise) {
|
|
280
280
|
runSyncCleanup = false;
|
|
281
|
-
return ret.finally(finallyF);
|
|
281
|
+
return ret.catch(catchF).finally(finallyF);
|
|
282
282
|
} else {
|
|
283
283
|
return ret;
|
|
284
284
|
}
|
|
285
|
+
} catch (e) {
|
|
286
|
+
return catchF(e);
|
|
285
287
|
} finally {
|
|
286
288
|
if (runSyncCleanup) {
|
|
287
289
|
finallyF();
|
|
@@ -1083,6 +1085,7 @@ var Logger = class {
|
|
|
1083
1085
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
1084
1086
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
1085
1087
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
1088
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
1086
1089
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
1087
1090
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
1088
1091
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1117,7 +1120,7 @@ var Logger = class {
|
|
|
1117
1120
|
traced(callback, args) {
|
|
1118
1121
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
1119
1122
|
const span = this.startSpan(argsRest);
|
|
1120
|
-
const ret =
|
|
1123
|
+
const ret = runCatchFinally(
|
|
1121
1124
|
() => {
|
|
1122
1125
|
if (setCurrent ?? true) {
|
|
1123
1126
|
return withCurrent(span, callback);
|
|
@@ -1125,6 +1128,10 @@ var Logger = class {
|
|
|
1125
1128
|
return callback(span);
|
|
1126
1129
|
}
|
|
1127
1130
|
},
|
|
1131
|
+
(e) => {
|
|
1132
|
+
logError(span, e);
|
|
1133
|
+
throw e;
|
|
1134
|
+
},
|
|
1128
1135
|
() => span.end()
|
|
1129
1136
|
);
|
|
1130
1137
|
if (this.asyncFlush) {
|
|
@@ -2058,9 +2065,22 @@ function getSpanParentObject(options) {
|
|
|
2058
2065
|
}
|
|
2059
2066
|
return NOOP_SPAN;
|
|
2060
2067
|
}
|
|
2068
|
+
function logError(span, error2) {
|
|
2069
|
+
let errorMessage = "<error>";
|
|
2070
|
+
let stackTrace = "";
|
|
2071
|
+
if (error2 instanceof Error) {
|
|
2072
|
+
errorMessage = error2.message;
|
|
2073
|
+
stackTrace = error2.stack || "";
|
|
2074
|
+
} else {
|
|
2075
|
+
errorMessage = String(error2);
|
|
2076
|
+
}
|
|
2077
|
+
span.log({ error: `${errorMessage}
|
|
2078
|
+
|
|
2079
|
+
${stackTrace}` });
|
|
2080
|
+
}
|
|
2061
2081
|
function traced(callback, args) {
|
|
2062
2082
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
2063
|
-
const ret =
|
|
2083
|
+
const ret = runCatchFinally(
|
|
2064
2084
|
() => {
|
|
2065
2085
|
if (args?.setCurrent ?? true) {
|
|
2066
2086
|
return withCurrent(span, callback);
|
|
@@ -2068,6 +2088,10 @@ function traced(callback, args) {
|
|
|
2068
2088
|
return callback(span);
|
|
2069
2089
|
}
|
|
2070
2090
|
},
|
|
2091
|
+
(e) => {
|
|
2092
|
+
logError(span, e);
|
|
2093
|
+
throw e;
|
|
2094
|
+
},
|
|
2071
2095
|
() => span.end()
|
|
2072
2096
|
);
|
|
2073
2097
|
if (args?.asyncFlush) {
|
|
@@ -2383,6 +2407,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2383
2407
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2384
2408
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2385
2409
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2410
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2386
2411
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2387
2412
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2388
2413
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2412,7 +2437,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2412
2437
|
traced(callback, args) {
|
|
2413
2438
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2414
2439
|
const span = this.startSpan(argsRest);
|
|
2415
|
-
|
|
2440
|
+
const ret = runCatchFinally(
|
|
2416
2441
|
() => {
|
|
2417
2442
|
if (setCurrent ?? true) {
|
|
2418
2443
|
return withCurrent(span, callback);
|
|
@@ -2420,8 +2445,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2420
2445
|
return callback(span);
|
|
2421
2446
|
}
|
|
2422
2447
|
},
|
|
2448
|
+
(e) => {
|
|
2449
|
+
logError(span, e);
|
|
2450
|
+
throw e;
|
|
2451
|
+
},
|
|
2423
2452
|
() => span.end()
|
|
2424
2453
|
);
|
|
2454
|
+
return ret;
|
|
2425
2455
|
}
|
|
2426
2456
|
/**
|
|
2427
2457
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2762,7 +2792,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2762
2792
|
traced(callback, args) {
|
|
2763
2793
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2764
2794
|
const span = this.startSpan(argsRest);
|
|
2765
|
-
return
|
|
2795
|
+
return runCatchFinally(
|
|
2766
2796
|
() => {
|
|
2767
2797
|
if (setCurrent ?? true) {
|
|
2768
2798
|
return withCurrent(span, callback);
|
|
@@ -2770,6 +2800,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2770
2800
|
return callback(span);
|
|
2771
2801
|
}
|
|
2772
2802
|
},
|
|
2803
|
+
(e) => {
|
|
2804
|
+
logError(span, e);
|
|
2805
|
+
throw e;
|
|
2806
|
+
},
|
|
2773
2807
|
() => span.end()
|
|
2774
2808
|
);
|
|
2775
2809
|
}
|
|
@@ -4686,6 +4720,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
4686
4720
|
);
|
|
4687
4721
|
}
|
|
4688
4722
|
} catch (e) {
|
|
4723
|
+
logError(rootSpan, e);
|
|
4689
4724
|
error2 = e;
|
|
4690
4725
|
} finally {
|
|
4691
4726
|
progressReporter.increment(evaluator.evalName);
|
|
@@ -4725,7 +4760,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
4725
4760
|
}
|
|
4726
4761
|
var error = chalk.bold.red;
|
|
4727
4762
|
var warning = chalk.hex("#FFA500");
|
|
4728
|
-
function
|
|
4763
|
+
function logError2(e, verbose) {
|
|
4729
4764
|
if (!verbose) {
|
|
4730
4765
|
console.error(`${e}`);
|
|
4731
4766
|
} else {
|
|
@@ -4781,7 +4816,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
|
|
|
4781
4816
|
);
|
|
4782
4817
|
} else {
|
|
4783
4818
|
for (const result of failingResults) {
|
|
4784
|
-
|
|
4819
|
+
logError2(result.error, verbose);
|
|
4785
4820
|
}
|
|
4786
4821
|
}
|
|
4787
4822
|
if (!verbose && !jsonl) {
|
|
@@ -5405,6 +5440,7 @@ export {
|
|
|
5405
5440
|
invoke,
|
|
5406
5441
|
loadPrompt,
|
|
5407
5442
|
log,
|
|
5443
|
+
logError,
|
|
5408
5444
|
login,
|
|
5409
5445
|
loginToState,
|
|
5410
5446
|
newId,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "braintrust",
|
|
3
|
-
"version": "0.0.147",
|
|
3
|
+
"version": "0.0.149",
|
|
4
4
|
"description": "SDK for integrating Braintrust",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -70,7 +70,7 @@
|
|
|
70
70
|
},
|
|
71
71
|
"dependencies": {
|
|
72
72
|
"@ai-sdk/provider": "^0.0.11",
|
|
73
|
-
"@braintrust/core": "0.0.
|
|
73
|
+
"@braintrust/core": "0.0.50",
|
|
74
74
|
"@next/env": "^14.2.3",
|
|
75
75
|
"@vercel/functions": "^1.0.2",
|
|
76
76
|
"ai": "^3.2.16",
|