braintrust 0.0.147 → 0.0.149

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
290
290
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
291
291
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
292
292
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
293
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
293
294
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
294
295
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
295
296
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -625,6 +626,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
625
626
  * Mainly for internal use. Return the parent object for starting a span in a global context.
626
627
  */
627
628
  declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
629
+ declare function logError(span: Span, error: unknown): void;
628
630
  /**
629
631
  * Toplevel function for starting a span. It checks the following (in precedence order):
630
632
  * * Currently-active span
@@ -743,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
743
745
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
744
746
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
745
747
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
748
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
746
749
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
747
750
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
748
751
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1248,4 +1251,4 @@ declare const LEGACY_CACHED_HEADER = "x-cached";
1248
1251
  declare const X_CACHED_HEADER = "x-bt-cached";
1249
1252
  declare function parseCachedHeader(value: string | null | undefined): number | undefined;
1250
1253
 
1251
- export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
1254
+ export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
package/dist/browser.d.ts CHANGED
@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
290
290
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
291
291
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
292
292
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
293
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
293
294
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
294
295
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
295
296
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -625,6 +626,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
625
626
  * Mainly for internal use. Return the parent object for starting a span in a global context.
626
627
  */
627
628
  declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
629
+ declare function logError(span: Span, error: unknown): void;
628
630
  /**
629
631
  * Toplevel function for starting a span. It checks the following (in precedence order):
630
632
  * * Currently-active span
@@ -743,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
743
745
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
744
746
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
745
747
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
748
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
746
749
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
747
750
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
748
751
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1248,4 +1251,4 @@ declare const LEGACY_CACHED_HEADER = "x-cached";
1248
1251
  declare const X_CACHED_HEADER = "x-bt-cached";
1249
1252
  declare function parseCachedHeader(value: string | null | undefined): number | undefined;
1250
1253
 
1251
- export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
1254
+ export { type AnyDataset, type BackgroundLoggerOpts, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, type EvalCase, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapOpenAI, wrapOpenAIv4, wrapTraced };
package/dist/browser.js CHANGED
@@ -59,6 +59,7 @@ __export(browser_exports, {
59
59
  invoke: () => invoke,
60
60
  loadPrompt: () => loadPrompt,
61
61
  log: () => log,
62
+ logError: () => logError,
62
63
  login: () => login,
63
64
  loginToState: () => loginToState,
64
65
  newId: () => newId,
@@ -109,16 +110,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
109
110
 
110
111
  // src/util.ts
111
112
  var GLOBAL_PROJECT = "Global";
112
- function runFinally(f, finallyF) {
113
+ function runCatchFinally(f, catchF, finallyF) {
113
114
  let runSyncCleanup = true;
114
115
  try {
115
116
  const ret = f();
116
117
  if (ret instanceof Promise) {
117
118
  runSyncCleanup = false;
118
- return ret.finally(finallyF);
119
+ return ret.catch(catchF).finally(finallyF);
119
120
  } else {
120
121
  return ret;
121
122
  }
123
+ } catch (e) {
124
+ return catchF(e);
122
125
  } finally {
123
126
  if (runSyncCleanup) {
124
127
  finallyF();
@@ -918,6 +921,7 @@ var Logger = class {
918
921
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
919
922
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
920
923
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
924
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
921
925
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
922
926
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
923
927
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -952,7 +956,7 @@ var Logger = class {
952
956
  traced(callback, args) {
953
957
  const { setCurrent, ...argsRest } = args ?? {};
954
958
  const span = this.startSpan(argsRest);
955
- const ret = runFinally(
959
+ const ret = runCatchFinally(
956
960
  () => {
957
961
  if (setCurrent ?? true) {
958
962
  return withCurrent(span, callback);
@@ -960,6 +964,10 @@ var Logger = class {
960
964
  return callback(span);
961
965
  }
962
966
  },
967
+ (e) => {
968
+ logError(span, e);
969
+ throw e;
970
+ },
963
971
  () => span.end()
964
972
  );
965
973
  if (this.asyncFlush) {
@@ -1893,9 +1901,22 @@ function getSpanParentObject(options) {
1893
1901
  }
1894
1902
  return NOOP_SPAN;
1895
1903
  }
1904
+ function logError(span, error) {
1905
+ let errorMessage = "<error>";
1906
+ let stackTrace = "";
1907
+ if (error instanceof Error) {
1908
+ errorMessage = error.message;
1909
+ stackTrace = error.stack || "";
1910
+ } else {
1911
+ errorMessage = String(error);
1912
+ }
1913
+ span.log({ error: `${errorMessage}
1914
+
1915
+ ${stackTrace}` });
1916
+ }
1896
1917
  function traced(callback, args) {
1897
1918
  const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
1898
- const ret = runFinally(
1919
+ const ret = runCatchFinally(
1899
1920
  () => {
1900
1921
  if (args?.setCurrent ?? true) {
1901
1922
  return withCurrent(span, callback);
@@ -1903,6 +1924,10 @@ function traced(callback, args) {
1903
1924
  return callback(span);
1904
1925
  }
1905
1926
  },
1927
+ (e) => {
1928
+ logError(span, e);
1929
+ throw e;
1930
+ },
1906
1931
  () => span.end()
1907
1932
  );
1908
1933
  if (args?.asyncFlush) {
@@ -2218,6 +2243,7 @@ var Experiment = class extends ObjectFetcher {
2218
2243
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
2219
2244
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
2220
2245
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
2246
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
2221
2247
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
2222
2248
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
2223
2249
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2247,7 +2273,7 @@ var Experiment = class extends ObjectFetcher {
2247
2273
  traced(callback, args) {
2248
2274
  const { setCurrent, ...argsRest } = args ?? {};
2249
2275
  const span = this.startSpan(argsRest);
2250
- return runFinally(
2276
+ const ret = runCatchFinally(
2251
2277
  () => {
2252
2278
  if (setCurrent ?? true) {
2253
2279
  return withCurrent(span, callback);
@@ -2255,8 +2281,13 @@ var Experiment = class extends ObjectFetcher {
2255
2281
  return callback(span);
2256
2282
  }
2257
2283
  },
2284
+ (e) => {
2285
+ logError(span, e);
2286
+ throw e;
2287
+ },
2258
2288
  () => span.end()
2259
2289
  );
2290
+ return ret;
2260
2291
  }
2261
2292
  /**
2262
2293
  * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2597,7 +2628,7 @@ var SpanImpl = class _SpanImpl {
2597
2628
  traced(callback, args) {
2598
2629
  const { setCurrent, ...argsRest } = args ?? {};
2599
2630
  const span = this.startSpan(argsRest);
2600
- return runFinally(
2631
+ return runCatchFinally(
2601
2632
  () => {
2602
2633
  if (setCurrent ?? true) {
2603
2634
  return withCurrent(span, callback);
@@ -2605,6 +2636,10 @@ var SpanImpl = class _SpanImpl {
2605
2636
  return callback(span);
2606
2637
  }
2607
2638
  },
2639
+ (e) => {
2640
+ logError(span, e);
2641
+ throw e;
2642
+ },
2608
2643
  () => span.end()
2609
2644
  );
2610
2645
  }
@@ -3424,6 +3459,7 @@ configureBrowser();
3424
3459
  invoke,
3425
3460
  loadPrompt,
3426
3461
  log,
3462
+ logError,
3427
3463
  login,
3428
3464
  loginToState,
3429
3465
  newId,
package/dist/browser.mjs CHANGED
@@ -53,16 +53,18 @@ import {
53
53
 
54
54
  // src/util.ts
55
55
  var GLOBAL_PROJECT = "Global";
56
- function runFinally(f, finallyF) {
56
+ function runCatchFinally(f, catchF, finallyF) {
57
57
  let runSyncCleanup = true;
58
58
  try {
59
59
  const ret = f();
60
60
  if (ret instanceof Promise) {
61
61
  runSyncCleanup = false;
62
- return ret.finally(finallyF);
62
+ return ret.catch(catchF).finally(finallyF);
63
63
  } else {
64
64
  return ret;
65
65
  }
66
+ } catch (e) {
67
+ return catchF(e);
66
68
  } finally {
67
69
  if (runSyncCleanup) {
68
70
  finallyF();
@@ -864,6 +866,7 @@ var Logger = class {
864
866
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
865
867
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
866
868
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
869
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
867
870
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
868
871
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
869
872
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -898,7 +901,7 @@ var Logger = class {
898
901
  traced(callback, args) {
899
902
  const { setCurrent, ...argsRest } = args ?? {};
900
903
  const span = this.startSpan(argsRest);
901
- const ret = runFinally(
904
+ const ret = runCatchFinally(
902
905
  () => {
903
906
  if (setCurrent ?? true) {
904
907
  return withCurrent(span, callback);
@@ -906,6 +909,10 @@ var Logger = class {
906
909
  return callback(span);
907
910
  }
908
911
  },
912
+ (e) => {
913
+ logError(span, e);
914
+ throw e;
915
+ },
909
916
  () => span.end()
910
917
  );
911
918
  if (this.asyncFlush) {
@@ -1839,9 +1846,22 @@ function getSpanParentObject(options) {
1839
1846
  }
1840
1847
  return NOOP_SPAN;
1841
1848
  }
1849
+ function logError(span, error) {
1850
+ let errorMessage = "<error>";
1851
+ let stackTrace = "";
1852
+ if (error instanceof Error) {
1853
+ errorMessage = error.message;
1854
+ stackTrace = error.stack || "";
1855
+ } else {
1856
+ errorMessage = String(error);
1857
+ }
1858
+ span.log({ error: `${errorMessage}
1859
+
1860
+ ${stackTrace}` });
1861
+ }
1842
1862
  function traced(callback, args) {
1843
1863
  const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
1844
- const ret = runFinally(
1864
+ const ret = runCatchFinally(
1845
1865
  () => {
1846
1866
  if (args?.setCurrent ?? true) {
1847
1867
  return withCurrent(span, callback);
@@ -1849,6 +1869,10 @@ function traced(callback, args) {
1849
1869
  return callback(span);
1850
1870
  }
1851
1871
  },
1872
+ (e) => {
1873
+ logError(span, e);
1874
+ throw e;
1875
+ },
1852
1876
  () => span.end()
1853
1877
  );
1854
1878
  if (args?.asyncFlush) {
@@ -2164,6 +2188,7 @@ var Experiment = class extends ObjectFetcher {
2164
2188
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
2165
2189
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
2166
2190
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
2191
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
2167
2192
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
2168
2193
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
2169
2194
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2193,7 +2218,7 @@ var Experiment = class extends ObjectFetcher {
2193
2218
  traced(callback, args) {
2194
2219
  const { setCurrent, ...argsRest } = args ?? {};
2195
2220
  const span = this.startSpan(argsRest);
2196
- return runFinally(
2221
+ const ret = runCatchFinally(
2197
2222
  () => {
2198
2223
  if (setCurrent ?? true) {
2199
2224
  return withCurrent(span, callback);
@@ -2201,8 +2226,13 @@ var Experiment = class extends ObjectFetcher {
2201
2226
  return callback(span);
2202
2227
  }
2203
2228
  },
2229
+ (e) => {
2230
+ logError(span, e);
2231
+ throw e;
2232
+ },
2204
2233
  () => span.end()
2205
2234
  );
2235
+ return ret;
2206
2236
  }
2207
2237
  /**
2208
2238
  * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2543,7 +2573,7 @@ var SpanImpl = class _SpanImpl {
2543
2573
  traced(callback, args) {
2544
2574
  const { setCurrent, ...argsRest } = args ?? {};
2545
2575
  const span = this.startSpan(argsRest);
2546
- return runFinally(
2576
+ return runCatchFinally(
2547
2577
  () => {
2548
2578
  if (setCurrent ?? true) {
2549
2579
  return withCurrent(span, callback);
@@ -2551,6 +2581,10 @@ var SpanImpl = class _SpanImpl {
2551
2581
  return callback(span);
2552
2582
  }
2553
2583
  },
2584
+ (e) => {
2585
+ logError(span, e);
2586
+ throw e;
2587
+ },
2554
2588
  () => span.end()
2555
2589
  );
2556
2590
  }
@@ -3371,6 +3405,7 @@ export {
3371
3405
  invoke,
3372
3406
  loadPrompt,
3373
3407
  log,
3408
+ logError,
3374
3409
  login,
3375
3410
  loginToState,
3376
3411
  newId,
package/dist/cli.js CHANGED
@@ -1232,7 +1232,7 @@ var require_package = __commonJS({
1232
1232
  "package.json"(exports2, module2) {
1233
1233
  module2.exports = {
1234
1234
  name: "braintrust",
1235
- version: "0.0.147",
1235
+ version: "0.0.149",
1236
1236
  description: "SDK for integrating Braintrust",
1237
1237
  repository: {
1238
1238
  type: "git",
@@ -1302,7 +1302,7 @@ var require_package = __commonJS({
1302
1302
  },
1303
1303
  dependencies: {
1304
1304
  "@ai-sdk/provider": "^0.0.11",
1305
- "@braintrust/core": "0.0.48",
1305
+ "@braintrust/core": "0.0.50",
1306
1306
  "@next/env": "^14.2.3",
1307
1307
  "@vercel/functions": "^1.0.2",
1308
1308
  ai: "^3.2.16",
@@ -1373,16 +1373,18 @@ var isomorph_default = iso;
1373
1373
 
1374
1374
  // src/util.ts
1375
1375
  var GLOBAL_PROJECT = "Global";
1376
- function runFinally(f, finallyF) {
1376
+ function runCatchFinally(f, catchF, finallyF) {
1377
1377
  let runSyncCleanup = true;
1378
1378
  try {
1379
1379
  const ret = f();
1380
1380
  if (ret instanceof Promise) {
1381
1381
  runSyncCleanup = false;
1382
- return ret.finally(finallyF);
1382
+ return ret.catch(catchF).finally(finallyF);
1383
1383
  } else {
1384
1384
  return ret;
1385
1385
  }
1386
+ } catch (e) {
1387
+ return catchF(e);
1386
1388
  } finally {
1387
1389
  if (runSyncCleanup) {
1388
1390
  finallyF();
@@ -2164,6 +2166,7 @@ var Logger = class {
2164
2166
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
2165
2167
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
2166
2168
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
2169
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
2167
2170
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
2168
2171
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
2169
2172
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2198,7 +2201,7 @@ var Logger = class {
2198
2201
  traced(callback, args) {
2199
2202
  const { setCurrent, ...argsRest } = args ?? {};
2200
2203
  const span = this.startSpan(argsRest);
2201
- const ret = runFinally(
2204
+ const ret = runCatchFinally(
2202
2205
  () => {
2203
2206
  if (setCurrent ?? true) {
2204
2207
  return withCurrent(span, callback);
@@ -2206,6 +2209,10 @@ var Logger = class {
2206
2209
  return callback(span);
2207
2210
  }
2208
2211
  },
2212
+ (e) => {
2213
+ logError(span, e);
2214
+ throw e;
2215
+ },
2209
2216
  () => span.end()
2210
2217
  );
2211
2218
  if (this.asyncFlush) {
@@ -2899,6 +2906,19 @@ async function loginToState(options = {}) {
2899
2906
  state.loginReplaceApiConn(conn);
2900
2907
  return state;
2901
2908
  }
2909
+ function logError(span, error2) {
2910
+ let errorMessage = "<error>";
2911
+ let stackTrace = "";
2912
+ if (error2 instanceof Error) {
2913
+ errorMessage = error2.message;
2914
+ stackTrace = error2.stack || "";
2915
+ } else {
2916
+ errorMessage = String(error2);
2917
+ }
2918
+ span.log({ error: `${errorMessage}
2919
+
2920
+ ${stackTrace}` });
2921
+ }
2902
2922
  function withCurrent(span, callback, state = _globalState) {
2903
2923
  return state.currentSpan.run(span, () => callback(span));
2904
2924
  }
@@ -3113,6 +3133,7 @@ var Experiment = class extends ObjectFetcher {
3113
3133
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
3114
3134
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
3115
3135
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
3136
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
3116
3137
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
3117
3138
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
3118
3139
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -3142,7 +3163,7 @@ var Experiment = class extends ObjectFetcher {
3142
3163
  traced(callback, args) {
3143
3164
  const { setCurrent, ...argsRest } = args ?? {};
3144
3165
  const span = this.startSpan(argsRest);
3145
- return runFinally(
3166
+ const ret = runCatchFinally(
3146
3167
  () => {
3147
3168
  if (setCurrent ?? true) {
3148
3169
  return withCurrent(span, callback);
@@ -3150,8 +3171,13 @@ var Experiment = class extends ObjectFetcher {
3150
3171
  return callback(span);
3151
3172
  }
3152
3173
  },
3174
+ (e) => {
3175
+ logError(span, e);
3176
+ throw e;
3177
+ },
3153
3178
  () => span.end()
3154
3179
  );
3180
+ return ret;
3155
3181
  }
3156
3182
  /**
3157
3183
  * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -3489,7 +3515,7 @@ var SpanImpl = class _SpanImpl {
3489
3515
  traced(callback, args) {
3490
3516
  const { setCurrent, ...argsRest } = args ?? {};
3491
3517
  const span = this.startSpan(argsRest);
3492
- return runFinally(
3518
+ return runCatchFinally(
3493
3519
  () => {
3494
3520
  if (setCurrent ?? true) {
3495
3521
  return withCurrent(span, callback);
@@ -3497,6 +3523,10 @@ var SpanImpl = class _SpanImpl {
3497
3523
  return callback(span);
3498
3524
  }
3499
3525
  },
3526
+ (e) => {
3527
+ logError(span, e);
3528
+ throw e;
3529
+ },
3500
3530
  () => span.end()
3501
3531
  );
3502
3532
  }
@@ -5234,6 +5264,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
5234
5264
  );
5235
5265
  }
5236
5266
  } catch (e) {
5267
+ logError(rootSpan, e);
5237
5268
  error2 = e;
5238
5269
  } finally {
5239
5270
  progressReporter.increment(evaluator.evalName);
@@ -5273,7 +5304,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
5273
5304
  }
5274
5305
  var error = import_chalk.default.bold.red;
5275
5306
  var warning = import_chalk.default.hex("#FFA500");
5276
- function logError(e, verbose) {
5307
+ function logError2(e, verbose) {
5277
5308
  if (!verbose) {
5278
5309
  console.error(`${e}`);
5279
5310
  } else {
@@ -5329,7 +5360,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
5329
5360
  );
5330
5361
  } else {
5331
5362
  for (const result of failingResults) {
5332
- logError(result.error, verbose);
5363
+ logError2(result.error, verbose);
5333
5364
  }
5334
5365
  }
5335
5366
  if (!verbose && !jsonl) {
@@ -6397,7 +6428,7 @@ async function main() {
6397
6428
  try {
6398
6429
  await parsed.func(parsed);
6399
6430
  } catch (e) {
6400
- logError(e, parsed.verbose);
6431
+ logError2(e, parsed.verbose);
6401
6432
  process.exit(1);
6402
6433
  }
6403
6434
  }
package/dist/index.d.mts CHANGED
@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
291
291
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
292
292
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
293
293
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
294
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
294
295
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
295
296
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
296
297
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -626,6 +627,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
626
627
  * Mainly for internal use. Return the parent object for starting a span in a global context.
627
628
  */
628
629
  declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
630
+ declare function logError(span: Span, error: unknown): void;
629
631
  /**
630
632
  * Toplevel function for starting a span. It checks the following (in precedence order):
631
633
  * * Currently-active span
@@ -744,6 +746,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
744
746
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
745
747
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
746
748
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
749
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
747
750
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
748
751
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
749
752
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1415,4 +1418,4 @@ declare function parseCachedHeader(value: string | null | undefined): number | u
1415
1418
  */
1416
1419
  declare function wrapAISDKModel<T extends object>(model: T): T;
1417
1420
 
1418
- export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
1421
+ export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
package/dist/index.d.ts CHANGED
@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
291
291
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
292
292
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
293
293
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
294
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
294
295
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
295
296
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
296
297
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -626,6 +627,7 @@ declare function currentSpan(options?: OptionalStateArg): Span;
626
627
  * Mainly for internal use. Return the parent object for starting a span in a global context.
627
628
  */
628
629
  declare function getSpanParentObject<IsAsyncFlush extends boolean>(options?: AsyncFlushArg<IsAsyncFlush> & OptionalStateArg): Span | Experiment | Logger<IsAsyncFlush>;
630
+ declare function logError(span: Span, error: unknown): void;
629
631
  /**
630
632
  * Toplevel function for starting a span. It checks the following (in precedence order):
631
633
  * * Currently-active span
@@ -744,6 +746,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
744
746
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
745
747
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
746
748
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
749
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
747
750
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
748
751
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
749
752
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1415,4 +1418,4 @@ declare function parseCachedHeader(value: string | null | undefined): number | u
1415
1418
  */
1416
1419
  declare function wrapAISDKModel<T extends object>(model: T): T;
1417
1420
 
1418
- export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
1421
+ export { type AnyDataset, type BackgroundLoggerOpts, BaseExperiment, type BaseMetadata, BraintrustState, BraintrustStream, type BraintrustStreamChunk, type ChatPrompt, type CompiledPrompt, type CompiledPromptParams, type CompletionPrompt, type DataSummary, Dataset, type DatasetSummary, type DefaultMetadataType, type DefaultPromptArgs, type EndSpanArgs, Eval, type EvalCase, type EvalScorerArgs, type EvalTask, type Evaluator, Experiment, type ExperimentSummary, type Exportable, type FullInitOptions, type FullLoginOptions, type InitOptions, type InvokeFunctionArgs, type InvokeReturn, LEGACY_CACHED_HEADER, LazyValue, type LogOptions, Logger, type LoginOptions, type MetricSummary, NOOP_SPAN, NoopSpan, type ObjectMetadata, type PromiseUnless, Prompt, ReadonlyExperiment, Reporter, type ReporterBody, type ScoreSummary, type SerializedBraintrustState, type SetCurrentArg, type Span, SpanImpl, type StartSpanArgs, type WithTransactionId, X_CACHED_HEADER, _internalGetGlobalState, _internalSetInitialState, braintrustStreamChunkSchema, buildLocalSummary, createFinalValuePassThroughStream, currentExperiment, currentLogger, currentSpan, devNullWritableStream, flush, getSpanParentObject, init, initDataset, initExperiment, initLogger, invoke, loadPrompt, log, logError, login, loginToState, newId, parseCachedHeader, reportFailures, setFetch, startSpan, summarize, traceable, traced, updateSpan, withDataset, withExperiment, withLogger, wrapAISDKModel, wrapOpenAI, wrapOpenAIv4, wrapTraced };
package/dist/index.js CHANGED
@@ -64,6 +64,7 @@ __export(src_exports, {
64
64
  invoke: () => invoke,
65
65
  loadPrompt: () => loadPrompt,
66
66
  log: () => log,
67
+ logError: () => logError,
67
68
  login: () => login,
68
69
  loginToState: () => loginToState,
69
70
  newId: () => newId,
@@ -335,16 +336,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
335
336
 
336
337
  // src/util.ts
337
338
  var GLOBAL_PROJECT = "Global";
338
- function runFinally(f, finallyF) {
339
+ function runCatchFinally(f, catchF, finallyF) {
339
340
  let runSyncCleanup = true;
340
341
  try {
341
342
  const ret = f();
342
343
  if (ret instanceof Promise) {
343
344
  runSyncCleanup = false;
344
- return ret.finally(finallyF);
345
+ return ret.catch(catchF).finally(finallyF);
345
346
  } else {
346
347
  return ret;
347
348
  }
349
+ } catch (e) {
350
+ return catchF(e);
348
351
  } finally {
349
352
  if (runSyncCleanup) {
350
353
  finallyF();
@@ -1144,6 +1147,7 @@ var Logger = class {
1144
1147
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
1145
1148
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
1146
1149
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
1150
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
1147
1151
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
1148
1152
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
1149
1153
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1178,7 +1182,7 @@ var Logger = class {
1178
1182
  traced(callback, args) {
1179
1183
  const { setCurrent, ...argsRest } = args ?? {};
1180
1184
  const span = this.startSpan(argsRest);
1181
- const ret = runFinally(
1185
+ const ret = runCatchFinally(
1182
1186
  () => {
1183
1187
  if (setCurrent ?? true) {
1184
1188
  return withCurrent(span, callback);
@@ -1186,6 +1190,10 @@ var Logger = class {
1186
1190
  return callback(span);
1187
1191
  }
1188
1192
  },
1193
+ (e) => {
1194
+ logError(span, e);
1195
+ throw e;
1196
+ },
1189
1197
  () => span.end()
1190
1198
  );
1191
1199
  if (this.asyncFlush) {
@@ -2119,9 +2127,22 @@ function getSpanParentObject(options) {
2119
2127
  }
2120
2128
  return NOOP_SPAN;
2121
2129
  }
2130
+ function logError(span, error2) {
2131
+ let errorMessage = "<error>";
2132
+ let stackTrace = "";
2133
+ if (error2 instanceof Error) {
2134
+ errorMessage = error2.message;
2135
+ stackTrace = error2.stack || "";
2136
+ } else {
2137
+ errorMessage = String(error2);
2138
+ }
2139
+ span.log({ error: `${errorMessage}
2140
+
2141
+ ${stackTrace}` });
2142
+ }
2122
2143
  function traced(callback, args) {
2123
2144
  const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
2124
- const ret = runFinally(
2145
+ const ret = runCatchFinally(
2125
2146
  () => {
2126
2147
  if (args?.setCurrent ?? true) {
2127
2148
  return withCurrent(span, callback);
@@ -2129,6 +2150,10 @@ function traced(callback, args) {
2129
2150
  return callback(span);
2130
2151
  }
2131
2152
  },
2153
+ (e) => {
2154
+ logError(span, e);
2155
+ throw e;
2156
+ },
2132
2157
  () => span.end()
2133
2158
  );
2134
2159
  if (args?.asyncFlush) {
@@ -2444,6 +2469,7 @@ var Experiment = class extends ObjectFetcher {
2444
2469
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
2445
2470
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
2446
2471
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
2472
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
2447
2473
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
2448
2474
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
2449
2475
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2473,7 +2499,7 @@ var Experiment = class extends ObjectFetcher {
2473
2499
  traced(callback, args) {
2474
2500
  const { setCurrent, ...argsRest } = args ?? {};
2475
2501
  const span = this.startSpan(argsRest);
2476
- return runFinally(
2502
+ const ret = runCatchFinally(
2477
2503
  () => {
2478
2504
  if (setCurrent ?? true) {
2479
2505
  return withCurrent(span, callback);
@@ -2481,8 +2507,13 @@ var Experiment = class extends ObjectFetcher {
2481
2507
  return callback(span);
2482
2508
  }
2483
2509
  },
2510
+ (e) => {
2511
+ logError(span, e);
2512
+ throw e;
2513
+ },
2484
2514
  () => span.end()
2485
2515
  );
2516
+ return ret;
2486
2517
  }
2487
2518
  /**
2488
2519
  * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2823,7 +2854,7 @@ var SpanImpl = class _SpanImpl {
2823
2854
  traced(callback, args) {
2824
2855
  const { setCurrent, ...argsRest } = args ?? {};
2825
2856
  const span = this.startSpan(argsRest);
2826
- return runFinally(
2857
+ return runCatchFinally(
2827
2858
  () => {
2828
2859
  if (setCurrent ?? true) {
2829
2860
  return withCurrent(span, callback);
@@ -2831,6 +2862,10 @@ var SpanImpl = class _SpanImpl {
2831
2862
  return callback(span);
2832
2863
  }
2833
2864
  },
2865
+ (e) => {
2866
+ logError(span, e);
2867
+ throw e;
2868
+ },
2834
2869
  () => span.end()
2835
2870
  );
2836
2871
  }
@@ -4745,6 +4780,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
4745
4780
  );
4746
4781
  }
4747
4782
  } catch (e) {
4783
+ logError(rootSpan, e);
4748
4784
  error2 = e;
4749
4785
  } finally {
4750
4786
  progressReporter.increment(evaluator.evalName);
@@ -4784,7 +4820,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
4784
4820
  }
4785
4821
  var error = import_chalk.default.bold.red;
4786
4822
  var warning = import_chalk.default.hex("#FFA500");
4787
- function logError(e, verbose) {
4823
+ function logError2(e, verbose) {
4788
4824
  if (!verbose) {
4789
4825
  console.error(`${e}`);
4790
4826
  } else {
@@ -4840,7 +4876,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
4840
4876
  );
4841
4877
  } else {
4842
4878
  for (const result of failingResults) {
4843
- logError(result.error, verbose);
4879
+ logError2(result.error, verbose);
4844
4880
  }
4845
4881
  }
4846
4882
  if (!verbose && !jsonl) {
@@ -5465,6 +5501,7 @@ configureNode();
5465
5501
  invoke,
5466
5502
  loadPrompt,
5467
5503
  log,
5504
+ logError,
5468
5505
  login,
5469
5506
  loginToState,
5470
5507
  newId,
package/dist/index.mjs CHANGED
@@ -272,16 +272,18 @@ import {
272
272
 
273
273
  // src/util.ts
274
274
  var GLOBAL_PROJECT = "Global";
275
- function runFinally(f, finallyF) {
275
+ function runCatchFinally(f, catchF, finallyF) {
276
276
  let runSyncCleanup = true;
277
277
  try {
278
278
  const ret = f();
279
279
  if (ret instanceof Promise) {
280
280
  runSyncCleanup = false;
281
- return ret.finally(finallyF);
281
+ return ret.catch(catchF).finally(finallyF);
282
282
  } else {
283
283
  return ret;
284
284
  }
285
+ } catch (e) {
286
+ return catchF(e);
285
287
  } finally {
286
288
  if (runSyncCleanup) {
287
289
  finallyF();
@@ -1083,6 +1085,7 @@ var Logger = class {
1083
1085
  * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
1084
1086
  * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
1085
1087
  * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
1088
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
1086
1089
  * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
1087
1090
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
1088
1091
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1117,7 +1120,7 @@ var Logger = class {
1117
1120
  traced(callback, args) {
1118
1121
  const { setCurrent, ...argsRest } = args ?? {};
1119
1122
  const span = this.startSpan(argsRest);
1120
- const ret = runFinally(
1123
+ const ret = runCatchFinally(
1121
1124
  () => {
1122
1125
  if (setCurrent ?? true) {
1123
1126
  return withCurrent(span, callback);
@@ -1125,6 +1128,10 @@ var Logger = class {
1125
1128
  return callback(span);
1126
1129
  }
1127
1130
  },
1131
+ (e) => {
1132
+ logError(span, e);
1133
+ throw e;
1134
+ },
1128
1135
  () => span.end()
1129
1136
  );
1130
1137
  if (this.asyncFlush) {
@@ -2058,9 +2065,22 @@ function getSpanParentObject(options) {
2058
2065
  }
2059
2066
  return NOOP_SPAN;
2060
2067
  }
2068
+ function logError(span, error2) {
2069
+ let errorMessage = "<error>";
2070
+ let stackTrace = "";
2071
+ if (error2 instanceof Error) {
2072
+ errorMessage = error2.message;
2073
+ stackTrace = error2.stack || "";
2074
+ } else {
2075
+ errorMessage = String(error2);
2076
+ }
2077
+ span.log({ error: `${errorMessage}
2078
+
2079
+ ${stackTrace}` });
2080
+ }
2061
2081
  function traced(callback, args) {
2062
2082
  const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
2063
- const ret = runFinally(
2083
+ const ret = runCatchFinally(
2064
2084
  () => {
2065
2085
  if (args?.setCurrent ?? true) {
2066
2086
  return withCurrent(span, callback);
@@ -2068,6 +2088,10 @@ function traced(callback, args) {
2068
2088
  return callback(span);
2069
2089
  }
2070
2090
  },
2091
+ (e) => {
2092
+ logError(span, e);
2093
+ throw e;
2094
+ },
2071
2095
  () => span.end()
2072
2096
  );
2073
2097
  if (args?.asyncFlush) {
@@ -2383,6 +2407,7 @@ var Experiment = class extends ObjectFetcher {
2383
2407
  * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
2384
2408
  * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
2385
2409
  * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
2410
+ * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
2386
2411
  * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
2387
2412
  * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
2388
2413
  * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2412,7 +2437,7 @@ var Experiment = class extends ObjectFetcher {
2412
2437
  traced(callback, args) {
2413
2438
  const { setCurrent, ...argsRest } = args ?? {};
2414
2439
  const span = this.startSpan(argsRest);
2415
- return runFinally(
2440
+ const ret = runCatchFinally(
2416
2441
  () => {
2417
2442
  if (setCurrent ?? true) {
2418
2443
  return withCurrent(span, callback);
@@ -2420,8 +2445,13 @@ var Experiment = class extends ObjectFetcher {
2420
2445
  return callback(span);
2421
2446
  }
2422
2447
  },
2448
+ (e) => {
2449
+ logError(span, e);
2450
+ throw e;
2451
+ },
2423
2452
  () => span.end()
2424
2453
  );
2454
+ return ret;
2425
2455
  }
2426
2456
  /**
2427
2457
  * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2762,7 +2792,7 @@ var SpanImpl = class _SpanImpl {
2762
2792
  traced(callback, args) {
2763
2793
  const { setCurrent, ...argsRest } = args ?? {};
2764
2794
  const span = this.startSpan(argsRest);
2765
- return runFinally(
2795
+ return runCatchFinally(
2766
2796
  () => {
2767
2797
  if (setCurrent ?? true) {
2768
2798
  return withCurrent(span, callback);
@@ -2770,6 +2800,10 @@ var SpanImpl = class _SpanImpl {
2770
2800
  return callback(span);
2771
2801
  }
2772
2802
  },
2803
+ (e) => {
2804
+ logError(span, e);
2805
+ throw e;
2806
+ },
2773
2807
  () => span.end()
2774
2808
  );
2775
2809
  }
@@ -4686,6 +4720,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
4686
4720
  );
4687
4721
  }
4688
4722
  } catch (e) {
4723
+ logError(rootSpan, e);
4689
4724
  error2 = e;
4690
4725
  } finally {
4691
4726
  progressReporter.increment(evaluator.evalName);
@@ -4725,7 +4760,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
4725
4760
  }
4726
4761
  var error = chalk.bold.red;
4727
4762
  var warning = chalk.hex("#FFA500");
4728
- function logError(e, verbose) {
4763
+ function logError2(e, verbose) {
4729
4764
  if (!verbose) {
4730
4765
  console.error(`${e}`);
4731
4766
  } else {
@@ -4781,7 +4816,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
4781
4816
  );
4782
4817
  } else {
4783
4818
  for (const result of failingResults) {
4784
- logError(result.error, verbose);
4819
+ logError2(result.error, verbose);
4785
4820
  }
4786
4821
  }
4787
4822
  if (!verbose && !jsonl) {
@@ -5405,6 +5440,7 @@ export {
5405
5440
  invoke,
5406
5441
  loadPrompt,
5407
5442
  log,
5443
+ logError,
5408
5444
  login,
5409
5445
  loginToState,
5410
5446
  newId,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "braintrust",
3
- "version": "0.0.147",
3
+ "version": "0.0.149",
4
4
  "description": "SDK for integrating Braintrust",
5
5
  "repository": {
6
6
  "type": "git",
@@ -70,7 +70,7 @@
70
70
  },
71
71
  "dependencies": {
72
72
  "@ai-sdk/provider": "^0.0.11",
73
- "@braintrust/core": "0.0.48",
73
+ "@braintrust/core": "0.0.50",
74
74
  "@next/env": "^14.2.3",
75
75
  "@vercel/functions": "^1.0.2",
76
76
  "ai": "^3.2.16",