npm - braintrust - Versions diffs - 0.0.147 → 0.0.148 - Mend

braintrust 0.0.147 → 0.0.148

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/browser.d.mts CHANGED Viewed

@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
      * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
      * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -743,6 +744,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
      * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
      * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".

package/dist/browser.d.ts CHANGED Viewed

@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
      * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
      * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -743,6 +744,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
      * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
      * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".

package/dist/browser.js CHANGED Viewed

@@ -109,16 +109,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
 // src/util.ts
 var GLOBAL_PROJECT = "Global";
-function runFinally(f, finallyF) {
+function runCatchFinally(f, catchF, finallyF) {
   let runSyncCleanup = true;
   try {
     const ret = f();
     if (ret instanceof Promise) {
       runSyncCleanup = false;
-      return ret.finally(finallyF);
+      return ret.catch(catchF).finally(finallyF);
     } else {
       return ret;
     }
+  } catch (e) {
+    return catchF(e);
   } finally {
     if (runSyncCleanup) {
       finallyF();
@@ -918,6 +920,7 @@ var Logger = class {
    * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
    * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -952,7 +955,7 @@ var Logger = class {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    const ret = runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -960,6 +963,10 @@ var Logger = class {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
     if (this.asyncFlush) {
@@ -1893,9 +1900,22 @@ function getSpanParentObject(options) {
   }
   return NOOP_SPAN;
 }
+function logError(span, error) {
+  let errorMessage = "<error>";
+  let stackTrace = "";
+  if (error instanceof Error) {
+    errorMessage = error.message;
+    stackTrace = error.stack || "";
+  } else {
+    errorMessage = String(error);
+  }
+  span.log({ error: `${errorMessage}
+${stackTrace}` });
+}
 function traced(callback, args) {
   const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
-  const ret = runFinally(
+  const ret = runCatchFinally(
     () => {
       if (args?.setCurrent ?? true) {
         return withCurrent(span, callback);
@@ -1903,6 +1923,10 @@ function traced(callback, args) {
         return callback(span);
       }
     },
+    (e) => {
+      logError(span, e);
+      throw e;
+    },
     () => span.end()
   );
   if (args?.asyncFlush) {
@@ -2218,6 +2242,7 @@ var Experiment = class extends ObjectFetcher {
    * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
    * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2247,7 +2272,7 @@ var Experiment = class extends ObjectFetcher {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2255,8 +2280,13 @@ var Experiment = class extends ObjectFetcher {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
+    return ret;
   }
   /**
    * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2597,7 +2627,7 @@ var SpanImpl = class _SpanImpl {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    return runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2605,6 +2635,10 @@ var SpanImpl = class _SpanImpl {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
   }

package/dist/browser.mjs CHANGED Viewed

@@ -53,16 +53,18 @@ import {
 // src/util.ts
 var GLOBAL_PROJECT = "Global";
-function runFinally(f, finallyF) {
+function runCatchFinally(f, catchF, finallyF) {
   let runSyncCleanup = true;
   try {
     const ret = f();
     if (ret instanceof Promise) {
       runSyncCleanup = false;
-      return ret.finally(finallyF);
+      return ret.catch(catchF).finally(finallyF);
     } else {
       return ret;
     }
+  } catch (e) {
+    return catchF(e);
   } finally {
     if (runSyncCleanup) {
       finallyF();
@@ -864,6 +866,7 @@ var Logger = class {
    * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
    * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -898,7 +901,7 @@ var Logger = class {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    const ret = runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -906,6 +909,10 @@ var Logger = class {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
     if (this.asyncFlush) {
@@ -1839,9 +1846,22 @@ function getSpanParentObject(options) {
   }
   return NOOP_SPAN;
 }
+function logError(span, error) {
+  let errorMessage = "<error>";
+  let stackTrace = "";
+  if (error instanceof Error) {
+    errorMessage = error.message;
+    stackTrace = error.stack || "";
+  } else {
+    errorMessage = String(error);
+  }
+  span.log({ error: `${errorMessage}
+${stackTrace}` });
+}
 function traced(callback, args) {
   const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
-  const ret = runFinally(
+  const ret = runCatchFinally(
     () => {
       if (args?.setCurrent ?? true) {
         return withCurrent(span, callback);
@@ -1849,6 +1869,10 @@ function traced(callback, args) {
         return callback(span);
       }
     },
+    (e) => {
+      logError(span, e);
+      throw e;
+    },
     () => span.end()
   );
   if (args?.asyncFlush) {
@@ -2164,6 +2188,7 @@ var Experiment = class extends ObjectFetcher {
    * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
    * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2193,7 +2218,7 @@ var Experiment = class extends ObjectFetcher {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2201,8 +2226,13 @@ var Experiment = class extends ObjectFetcher {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
+    return ret;
   }
   /**
    * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2543,7 +2573,7 @@ var SpanImpl = class _SpanImpl {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    return runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2551,6 +2581,10 @@ var SpanImpl = class _SpanImpl {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
   }

package/dist/cli.js CHANGED Viewed

@@ -1232,7 +1232,7 @@ var require_package = __commonJS({
   "package.json"(exports2, module2) {
     module2.exports = {
       name: "braintrust",
-      version: "0.0.147",
+      version: "0.0.148",
       description: "SDK for integrating Braintrust",
       repository: {
         type: "git",
@@ -1302,7 +1302,7 @@ var require_package = __commonJS({
       },
       dependencies: {
         "@ai-sdk/provider": "^0.0.11",
-        "@braintrust/core": "0.0.48",
+        "@braintrust/core": "0.0.49",
         "@next/env": "^14.2.3",
         "@vercel/functions": "^1.0.2",
         ai: "^3.2.16",
@@ -1373,16 +1373,18 @@ var isomorph_default = iso;
 // src/util.ts
 var GLOBAL_PROJECT = "Global";
-function runFinally(f, finallyF) {
+function runCatchFinally(f, catchF, finallyF) {
   let runSyncCleanup = true;
   try {
     const ret = f();
     if (ret instanceof Promise) {
       runSyncCleanup = false;
-      return ret.finally(finallyF);
+      return ret.catch(catchF).finally(finallyF);
     } else {
       return ret;
     }
+  } catch (e) {
+    return catchF(e);
   } finally {
     if (runSyncCleanup) {
       finallyF();
@@ -2164,6 +2166,7 @@ var Logger = class {
    * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
    * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2198,7 +2201,7 @@ var Logger = class {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    const ret = runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2206,6 +2209,10 @@ var Logger = class {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
     if (this.asyncFlush) {
@@ -2899,6 +2906,19 @@ async function loginToState(options = {}) {
   state.loginReplaceApiConn(conn);
   return state;
 }
+function logError(span, error2) {
+  let errorMessage = "<error>";
+  let stackTrace = "";
+  if (error2 instanceof Error) {
+    errorMessage = error2.message;
+    stackTrace = error2.stack || "";
+  } else {
+    errorMessage = String(error2);
+  }
+  span.log({ error: `${errorMessage}
+${stackTrace}` });
+}
 function withCurrent(span, callback, state = _globalState) {
   return state.currentSpan.run(span, () => callback(span));
 }
@@ -3113,6 +3133,7 @@ var Experiment = class extends ObjectFetcher {
    * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
    * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -3142,7 +3163,7 @@ var Experiment = class extends ObjectFetcher {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -3150,8 +3171,13 @@ var Experiment = class extends ObjectFetcher {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
+    return ret;
   }
   /**
    * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -3489,7 +3515,7 @@ var SpanImpl = class _SpanImpl {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    return runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -3497,6 +3523,10 @@ var SpanImpl = class _SpanImpl {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
   }
@@ -5273,7 +5303,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
 }
 var error = import_chalk.default.bold.red;
 var warning = import_chalk.default.hex("#FFA500");
-function logError(e, verbose) {
+function logError2(e, verbose) {
   if (!verbose) {
     console.error(`${e}`);
   } else {
@@ -5329,7 +5359,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
       );
     } else {
       for (const result of failingResults) {
-        logError(result.error, verbose);
+        logError2(result.error, verbose);
       }
     }
     if (!verbose && !jsonl) {
@@ -6397,7 +6427,7 @@ async function main() {
   try {
     await parsed.func(parsed);
   } catch (e) {
-    logError(e, parsed.verbose);
+    logError2(e, parsed.verbose);
     process.exit(1);
   }
 }

package/dist/index.d.mts CHANGED Viewed

@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
      * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
      * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -744,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
      * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
      * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".

package/dist/index.d.ts CHANGED Viewed

@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
      * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
      * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -744,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
      * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
      * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
      * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+     * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
      * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
      * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
      * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".

package/dist/index.js CHANGED Viewed

@@ -335,16 +335,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
 // src/util.ts
 var GLOBAL_PROJECT = "Global";
-function runFinally(f, finallyF) {
+function runCatchFinally(f, catchF, finallyF) {
   let runSyncCleanup = true;
   try {
     const ret = f();
     if (ret instanceof Promise) {
       runSyncCleanup = false;
-      return ret.finally(finallyF);
+      return ret.catch(catchF).finally(finallyF);
     } else {
       return ret;
     }
+  } catch (e) {
+    return catchF(e);
   } finally {
     if (runSyncCleanup) {
       finallyF();
@@ -1144,6 +1146,7 @@ var Logger = class {
    * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
    * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1178,7 +1181,7 @@ var Logger = class {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    const ret = runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -1186,6 +1189,10 @@ var Logger = class {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
     if (this.asyncFlush) {
@@ -2119,9 +2126,22 @@ function getSpanParentObject(options) {
   }
   return NOOP_SPAN;
 }
+function logError(span, error2) {
+  let errorMessage = "<error>";
+  let stackTrace = "";
+  if (error2 instanceof Error) {
+    errorMessage = error2.message;
+    stackTrace = error2.stack || "";
+  } else {
+    errorMessage = String(error2);
+  }
+  span.log({ error: `${errorMessage}
+${stackTrace}` });
+}
 function traced(callback, args) {
   const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
-  const ret = runFinally(
+  const ret = runCatchFinally(
     () => {
       if (args?.setCurrent ?? true) {
         return withCurrent(span, callback);
@@ -2129,6 +2149,10 @@ function traced(callback, args) {
         return callback(span);
       }
     },
+    (e) => {
+      logError(span, e);
+      throw e;
+    },
     () => span.end()
   );
   if (args?.asyncFlush) {
@@ -2444,6 +2468,7 @@ var Experiment = class extends ObjectFetcher {
    * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
    * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2473,7 +2498,7 @@ var Experiment = class extends ObjectFetcher {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2481,8 +2506,13 @@ var Experiment = class extends ObjectFetcher {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
+    return ret;
   }
   /**
    * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2823,7 +2853,7 @@ var SpanImpl = class _SpanImpl {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    return runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2831,6 +2861,10 @@ var SpanImpl = class _SpanImpl {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
   }
@@ -4784,7 +4818,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
 }
 var error = import_chalk.default.bold.red;
 var warning = import_chalk.default.hex("#FFA500");
-function logError(e, verbose) {
+function logError2(e, verbose) {
   if (!verbose) {
     console.error(`${e}`);
   } else {
@@ -4840,7 +4874,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
       );
     } else {
       for (const result of failingResults) {
-        logError(result.error, verbose);
+        logError2(result.error, verbose);
       }
     }
     if (!verbose && !jsonl) {

package/dist/index.mjs CHANGED Viewed

@@ -272,16 +272,18 @@ import {
 // src/util.ts
 var GLOBAL_PROJECT = "Global";
-function runFinally(f, finallyF) {
+function runCatchFinally(f, catchF, finallyF) {
   let runSyncCleanup = true;
   try {
     const ret = f();
     if (ret instanceof Promise) {
       runSyncCleanup = false;
-      return ret.finally(finallyF);
+      return ret.catch(catchF).finally(finallyF);
     } else {
       return ret;
     }
+  } catch (e) {
+    return catchF(e);
   } finally {
     if (runSyncCleanup) {
       finallyF();
@@ -1083,6 +1085,7 @@ var Logger = class {
    * @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
    * @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -1117,7 +1120,7 @@ var Logger = class {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    const ret = runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -1125,6 +1128,10 @@ var Logger = class {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
     if (this.asyncFlush) {
@@ -2058,9 +2065,22 @@ function getSpanParentObject(options) {
   }
   return NOOP_SPAN;
 }
+function logError(span, error2) {
+  let errorMessage = "<error>";
+  let stackTrace = "";
+  if (error2 instanceof Error) {
+    errorMessage = error2.message;
+    stackTrace = error2.stack || "";
+  } else {
+    errorMessage = String(error2);
+  }
+  span.log({ error: `${errorMessage}
+${stackTrace}` });
+}
 function traced(callback, args) {
   const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
-  const ret = runFinally(
+  const ret = runCatchFinally(
     () => {
       if (args?.setCurrent ?? true) {
         return withCurrent(span, callback);
@@ -2068,6 +2088,10 @@ function traced(callback, args) {
         return callback(span);
       }
     },
+    (e) => {
+      logError(span, e);
+      throw e;
+    },
     () => span.end()
   );
   if (args?.asyncFlush) {
@@ -2383,6 +2407,7 @@ var Experiment = class extends ObjectFetcher {
    * @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
    * @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
    * @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
+   * @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
    * @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
    * @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
    * @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
@@ -2412,7 +2437,7 @@ var Experiment = class extends ObjectFetcher {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    const ret = runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2420,8 +2445,13 @@ var Experiment = class extends ObjectFetcher {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
+    return ret;
   }
   /**
    * Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
@@ -2762,7 +2792,7 @@ var SpanImpl = class _SpanImpl {
   traced(callback, args) {
     const { setCurrent, ...argsRest } = args ?? {};
     const span = this.startSpan(argsRest);
-    return runFinally(
+    return runCatchFinally(
       () => {
         if (setCurrent ?? true) {
           return withCurrent(span, callback);
@@ -2770,6 +2800,10 @@ var SpanImpl = class _SpanImpl {
           return callback(span);
         }
       },
+      (e) => {
+        logError(span, e);
+        throw e;
+      },
       () => span.end()
     );
   }
@@ -4725,7 +4759,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
 }
 var error = chalk.bold.red;
 var warning = chalk.hex("#FFA500");
-function logError(e, verbose) {
+function logError2(e, verbose) {
   if (!verbose) {
     console.error(`${e}`);
   } else {
@@ -4781,7 +4815,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
       );
     } else {
       for (const result of failingResults) {
-        logError(result.error, verbose);
+        logError2(result.error, verbose);
       }
     }
     if (!verbose && !jsonl) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "braintrust",
-  "version": "0.0.147",
+  "version": "0.0.148",
   "description": "SDK for integrating Braintrust",
   "repository": {
     "type": "git",
@@ -70,7 +70,7 @@
   },
   "dependencies": {
     "@ai-sdk/provider": "^0.0.11",
-    "@braintrust/core": "0.0.48",
+    "@braintrust/core": "0.0.49",
     "@next/env": "^14.2.3",
     "@vercel/functions": "^1.0.2",
     "ai": "^3.2.16",