braintrust 0.0.147 → 0.0.148
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.d.mts +2 -0
- package/dist/browser.d.ts +2 -0
- package/dist/browser.js +40 -6
- package/dist/browser.mjs +40 -6
- package/dist/cli.js +40 -10
- package/dist/index.d.mts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +42 -8
- package/dist/index.mjs +42 -8
- package/package.json +2 -2
package/dist/browser.d.mts
CHANGED
|
@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
290
290
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
291
291
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
292
292
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
293
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
293
294
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
294
295
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
295
296
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -743,6 +744,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
743
744
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
744
745
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
745
746
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
747
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
746
748
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
747
749
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
748
750
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
package/dist/browser.d.ts
CHANGED
|
@@ -290,6 +290,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
290
290
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
291
291
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
292
292
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
293
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
293
294
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
294
295
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
295
296
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -743,6 +744,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
743
744
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
744
745
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
745
746
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
747
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
746
748
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
747
749
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
748
750
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
package/dist/browser.js
CHANGED
|
@@ -109,16 +109,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
|
|
|
109
109
|
|
|
110
110
|
// src/util.ts
|
|
111
111
|
var GLOBAL_PROJECT = "Global";
|
|
112
|
-
function
|
|
112
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
113
113
|
let runSyncCleanup = true;
|
|
114
114
|
try {
|
|
115
115
|
const ret = f();
|
|
116
116
|
if (ret instanceof Promise) {
|
|
117
117
|
runSyncCleanup = false;
|
|
118
|
-
return ret.finally(finallyF);
|
|
118
|
+
return ret.catch(catchF).finally(finallyF);
|
|
119
119
|
} else {
|
|
120
120
|
return ret;
|
|
121
121
|
}
|
|
122
|
+
} catch (e) {
|
|
123
|
+
return catchF(e);
|
|
122
124
|
} finally {
|
|
123
125
|
if (runSyncCleanup) {
|
|
124
126
|
finallyF();
|
|
@@ -918,6 +920,7 @@ var Logger = class {
|
|
|
918
920
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
919
921
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
920
922
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
923
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
921
924
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
922
925
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
923
926
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -952,7 +955,7 @@ var Logger = class {
|
|
|
952
955
|
traced(callback, args) {
|
|
953
956
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
954
957
|
const span = this.startSpan(argsRest);
|
|
955
|
-
const ret =
|
|
958
|
+
const ret = runCatchFinally(
|
|
956
959
|
() => {
|
|
957
960
|
if (setCurrent ?? true) {
|
|
958
961
|
return withCurrent(span, callback);
|
|
@@ -960,6 +963,10 @@ var Logger = class {
|
|
|
960
963
|
return callback(span);
|
|
961
964
|
}
|
|
962
965
|
},
|
|
966
|
+
(e) => {
|
|
967
|
+
logError(span, e);
|
|
968
|
+
throw e;
|
|
969
|
+
},
|
|
963
970
|
() => span.end()
|
|
964
971
|
);
|
|
965
972
|
if (this.asyncFlush) {
|
|
@@ -1893,9 +1900,22 @@ function getSpanParentObject(options) {
|
|
|
1893
1900
|
}
|
|
1894
1901
|
return NOOP_SPAN;
|
|
1895
1902
|
}
|
|
1903
|
+
function logError(span, error) {
|
|
1904
|
+
let errorMessage = "<error>";
|
|
1905
|
+
let stackTrace = "";
|
|
1906
|
+
if (error instanceof Error) {
|
|
1907
|
+
errorMessage = error.message;
|
|
1908
|
+
stackTrace = error.stack || "";
|
|
1909
|
+
} else {
|
|
1910
|
+
errorMessage = String(error);
|
|
1911
|
+
}
|
|
1912
|
+
span.log({ error: `${errorMessage}
|
|
1913
|
+
|
|
1914
|
+
${stackTrace}` });
|
|
1915
|
+
}
|
|
1896
1916
|
function traced(callback, args) {
|
|
1897
1917
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
1898
|
-
const ret =
|
|
1918
|
+
const ret = runCatchFinally(
|
|
1899
1919
|
() => {
|
|
1900
1920
|
if (args?.setCurrent ?? true) {
|
|
1901
1921
|
return withCurrent(span, callback);
|
|
@@ -1903,6 +1923,10 @@ function traced(callback, args) {
|
|
|
1903
1923
|
return callback(span);
|
|
1904
1924
|
}
|
|
1905
1925
|
},
|
|
1926
|
+
(e) => {
|
|
1927
|
+
logError(span, e);
|
|
1928
|
+
throw e;
|
|
1929
|
+
},
|
|
1906
1930
|
() => span.end()
|
|
1907
1931
|
);
|
|
1908
1932
|
if (args?.asyncFlush) {
|
|
@@ -2218,6 +2242,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2218
2242
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2219
2243
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2220
2244
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2245
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2221
2246
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2222
2247
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2223
2248
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2247,7 +2272,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2247
2272
|
traced(callback, args) {
|
|
2248
2273
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2249
2274
|
const span = this.startSpan(argsRest);
|
|
2250
|
-
|
|
2275
|
+
const ret = runCatchFinally(
|
|
2251
2276
|
() => {
|
|
2252
2277
|
if (setCurrent ?? true) {
|
|
2253
2278
|
return withCurrent(span, callback);
|
|
@@ -2255,8 +2280,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2255
2280
|
return callback(span);
|
|
2256
2281
|
}
|
|
2257
2282
|
},
|
|
2283
|
+
(e) => {
|
|
2284
|
+
logError(span, e);
|
|
2285
|
+
throw e;
|
|
2286
|
+
},
|
|
2258
2287
|
() => span.end()
|
|
2259
2288
|
);
|
|
2289
|
+
return ret;
|
|
2260
2290
|
}
|
|
2261
2291
|
/**
|
|
2262
2292
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2597,7 +2627,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2597
2627
|
traced(callback, args) {
|
|
2598
2628
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2599
2629
|
const span = this.startSpan(argsRest);
|
|
2600
|
-
return
|
|
2630
|
+
return runCatchFinally(
|
|
2601
2631
|
() => {
|
|
2602
2632
|
if (setCurrent ?? true) {
|
|
2603
2633
|
return withCurrent(span, callback);
|
|
@@ -2605,6 +2635,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2605
2635
|
return callback(span);
|
|
2606
2636
|
}
|
|
2607
2637
|
},
|
|
2638
|
+
(e) => {
|
|
2639
|
+
logError(span, e);
|
|
2640
|
+
throw e;
|
|
2641
|
+
},
|
|
2608
2642
|
() => span.end()
|
|
2609
2643
|
);
|
|
2610
2644
|
}
|
package/dist/browser.mjs
CHANGED
|
@@ -53,16 +53,18 @@ import {
|
|
|
53
53
|
|
|
54
54
|
// src/util.ts
|
|
55
55
|
var GLOBAL_PROJECT = "Global";
|
|
56
|
-
function
|
|
56
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
57
57
|
let runSyncCleanup = true;
|
|
58
58
|
try {
|
|
59
59
|
const ret = f();
|
|
60
60
|
if (ret instanceof Promise) {
|
|
61
61
|
runSyncCleanup = false;
|
|
62
|
-
return ret.finally(finallyF);
|
|
62
|
+
return ret.catch(catchF).finally(finallyF);
|
|
63
63
|
} else {
|
|
64
64
|
return ret;
|
|
65
65
|
}
|
|
66
|
+
} catch (e) {
|
|
67
|
+
return catchF(e);
|
|
66
68
|
} finally {
|
|
67
69
|
if (runSyncCleanup) {
|
|
68
70
|
finallyF();
|
|
@@ -864,6 +866,7 @@ var Logger = class {
|
|
|
864
866
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
865
867
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
866
868
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
869
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
867
870
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
868
871
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
869
872
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -898,7 +901,7 @@ var Logger = class {
|
|
|
898
901
|
traced(callback, args) {
|
|
899
902
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
900
903
|
const span = this.startSpan(argsRest);
|
|
901
|
-
const ret =
|
|
904
|
+
const ret = runCatchFinally(
|
|
902
905
|
() => {
|
|
903
906
|
if (setCurrent ?? true) {
|
|
904
907
|
return withCurrent(span, callback);
|
|
@@ -906,6 +909,10 @@ var Logger = class {
|
|
|
906
909
|
return callback(span);
|
|
907
910
|
}
|
|
908
911
|
},
|
|
912
|
+
(e) => {
|
|
913
|
+
logError(span, e);
|
|
914
|
+
throw e;
|
|
915
|
+
},
|
|
909
916
|
() => span.end()
|
|
910
917
|
);
|
|
911
918
|
if (this.asyncFlush) {
|
|
@@ -1839,9 +1846,22 @@ function getSpanParentObject(options) {
|
|
|
1839
1846
|
}
|
|
1840
1847
|
return NOOP_SPAN;
|
|
1841
1848
|
}
|
|
1849
|
+
function logError(span, error) {
|
|
1850
|
+
let errorMessage = "<error>";
|
|
1851
|
+
let stackTrace = "";
|
|
1852
|
+
if (error instanceof Error) {
|
|
1853
|
+
errorMessage = error.message;
|
|
1854
|
+
stackTrace = error.stack || "";
|
|
1855
|
+
} else {
|
|
1856
|
+
errorMessage = String(error);
|
|
1857
|
+
}
|
|
1858
|
+
span.log({ error: `${errorMessage}
|
|
1859
|
+
|
|
1860
|
+
${stackTrace}` });
|
|
1861
|
+
}
|
|
1842
1862
|
function traced(callback, args) {
|
|
1843
1863
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
1844
|
-
const ret =
|
|
1864
|
+
const ret = runCatchFinally(
|
|
1845
1865
|
() => {
|
|
1846
1866
|
if (args?.setCurrent ?? true) {
|
|
1847
1867
|
return withCurrent(span, callback);
|
|
@@ -1849,6 +1869,10 @@ function traced(callback, args) {
|
|
|
1849
1869
|
return callback(span);
|
|
1850
1870
|
}
|
|
1851
1871
|
},
|
|
1872
|
+
(e) => {
|
|
1873
|
+
logError(span, e);
|
|
1874
|
+
throw e;
|
|
1875
|
+
},
|
|
1852
1876
|
() => span.end()
|
|
1853
1877
|
);
|
|
1854
1878
|
if (args?.asyncFlush) {
|
|
@@ -2164,6 +2188,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2164
2188
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2165
2189
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2166
2190
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2191
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2167
2192
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2168
2193
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2169
2194
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2193,7 +2218,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2193
2218
|
traced(callback, args) {
|
|
2194
2219
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2195
2220
|
const span = this.startSpan(argsRest);
|
|
2196
|
-
|
|
2221
|
+
const ret = runCatchFinally(
|
|
2197
2222
|
() => {
|
|
2198
2223
|
if (setCurrent ?? true) {
|
|
2199
2224
|
return withCurrent(span, callback);
|
|
@@ -2201,8 +2226,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2201
2226
|
return callback(span);
|
|
2202
2227
|
}
|
|
2203
2228
|
},
|
|
2229
|
+
(e) => {
|
|
2230
|
+
logError(span, e);
|
|
2231
|
+
throw e;
|
|
2232
|
+
},
|
|
2204
2233
|
() => span.end()
|
|
2205
2234
|
);
|
|
2235
|
+
return ret;
|
|
2206
2236
|
}
|
|
2207
2237
|
/**
|
|
2208
2238
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2543,7 +2573,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2543
2573
|
traced(callback, args) {
|
|
2544
2574
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2545
2575
|
const span = this.startSpan(argsRest);
|
|
2546
|
-
return
|
|
2576
|
+
return runCatchFinally(
|
|
2547
2577
|
() => {
|
|
2548
2578
|
if (setCurrent ?? true) {
|
|
2549
2579
|
return withCurrent(span, callback);
|
|
@@ -2551,6 +2581,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2551
2581
|
return callback(span);
|
|
2552
2582
|
}
|
|
2553
2583
|
},
|
|
2584
|
+
(e) => {
|
|
2585
|
+
logError(span, e);
|
|
2586
|
+
throw e;
|
|
2587
|
+
},
|
|
2554
2588
|
() => span.end()
|
|
2555
2589
|
);
|
|
2556
2590
|
}
|
package/dist/cli.js
CHANGED
|
@@ -1232,7 +1232,7 @@ var require_package = __commonJS({
|
|
|
1232
1232
|
"package.json"(exports2, module2) {
|
|
1233
1233
|
module2.exports = {
|
|
1234
1234
|
name: "braintrust",
|
|
1235
|
-
version: "0.0.
|
|
1235
|
+
version: "0.0.148",
|
|
1236
1236
|
description: "SDK for integrating Braintrust",
|
|
1237
1237
|
repository: {
|
|
1238
1238
|
type: "git",
|
|
@@ -1302,7 +1302,7 @@ var require_package = __commonJS({
|
|
|
1302
1302
|
},
|
|
1303
1303
|
dependencies: {
|
|
1304
1304
|
"@ai-sdk/provider": "^0.0.11",
|
|
1305
|
-
"@braintrust/core": "0.0.
|
|
1305
|
+
"@braintrust/core": "0.0.49",
|
|
1306
1306
|
"@next/env": "^14.2.3",
|
|
1307
1307
|
"@vercel/functions": "^1.0.2",
|
|
1308
1308
|
ai: "^3.2.16",
|
|
@@ -1373,16 +1373,18 @@ var isomorph_default = iso;
|
|
|
1373
1373
|
|
|
1374
1374
|
// src/util.ts
|
|
1375
1375
|
var GLOBAL_PROJECT = "Global";
|
|
1376
|
-
function
|
|
1376
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
1377
1377
|
let runSyncCleanup = true;
|
|
1378
1378
|
try {
|
|
1379
1379
|
const ret = f();
|
|
1380
1380
|
if (ret instanceof Promise) {
|
|
1381
1381
|
runSyncCleanup = false;
|
|
1382
|
-
return ret.finally(finallyF);
|
|
1382
|
+
return ret.catch(catchF).finally(finallyF);
|
|
1383
1383
|
} else {
|
|
1384
1384
|
return ret;
|
|
1385
1385
|
}
|
|
1386
|
+
} catch (e) {
|
|
1387
|
+
return catchF(e);
|
|
1386
1388
|
} finally {
|
|
1387
1389
|
if (runSyncCleanup) {
|
|
1388
1390
|
finallyF();
|
|
@@ -2164,6 +2166,7 @@ var Logger = class {
|
|
|
2164
2166
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
2165
2167
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2166
2168
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2169
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2167
2170
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
2168
2171
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2169
2172
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2198,7 +2201,7 @@ var Logger = class {
|
|
|
2198
2201
|
traced(callback, args) {
|
|
2199
2202
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2200
2203
|
const span = this.startSpan(argsRest);
|
|
2201
|
-
const ret =
|
|
2204
|
+
const ret = runCatchFinally(
|
|
2202
2205
|
() => {
|
|
2203
2206
|
if (setCurrent ?? true) {
|
|
2204
2207
|
return withCurrent(span, callback);
|
|
@@ -2206,6 +2209,10 @@ var Logger = class {
|
|
|
2206
2209
|
return callback(span);
|
|
2207
2210
|
}
|
|
2208
2211
|
},
|
|
2212
|
+
(e) => {
|
|
2213
|
+
logError(span, e);
|
|
2214
|
+
throw e;
|
|
2215
|
+
},
|
|
2209
2216
|
() => span.end()
|
|
2210
2217
|
);
|
|
2211
2218
|
if (this.asyncFlush) {
|
|
@@ -2899,6 +2906,19 @@ async function loginToState(options = {}) {
|
|
|
2899
2906
|
state.loginReplaceApiConn(conn);
|
|
2900
2907
|
return state;
|
|
2901
2908
|
}
|
|
2909
|
+
function logError(span, error2) {
|
|
2910
|
+
let errorMessage = "<error>";
|
|
2911
|
+
let stackTrace = "";
|
|
2912
|
+
if (error2 instanceof Error) {
|
|
2913
|
+
errorMessage = error2.message;
|
|
2914
|
+
stackTrace = error2.stack || "";
|
|
2915
|
+
} else {
|
|
2916
|
+
errorMessage = String(error2);
|
|
2917
|
+
}
|
|
2918
|
+
span.log({ error: `${errorMessage}
|
|
2919
|
+
|
|
2920
|
+
${stackTrace}` });
|
|
2921
|
+
}
|
|
2902
2922
|
function withCurrent(span, callback, state = _globalState) {
|
|
2903
2923
|
return state.currentSpan.run(span, () => callback(span));
|
|
2904
2924
|
}
|
|
@@ -3113,6 +3133,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
3113
3133
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
3114
3134
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
3115
3135
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
3136
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
3116
3137
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
3117
3138
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
3118
3139
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -3142,7 +3163,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
3142
3163
|
traced(callback, args) {
|
|
3143
3164
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
3144
3165
|
const span = this.startSpan(argsRest);
|
|
3145
|
-
|
|
3166
|
+
const ret = runCatchFinally(
|
|
3146
3167
|
() => {
|
|
3147
3168
|
if (setCurrent ?? true) {
|
|
3148
3169
|
return withCurrent(span, callback);
|
|
@@ -3150,8 +3171,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
3150
3171
|
return callback(span);
|
|
3151
3172
|
}
|
|
3152
3173
|
},
|
|
3174
|
+
(e) => {
|
|
3175
|
+
logError(span, e);
|
|
3176
|
+
throw e;
|
|
3177
|
+
},
|
|
3153
3178
|
() => span.end()
|
|
3154
3179
|
);
|
|
3180
|
+
return ret;
|
|
3155
3181
|
}
|
|
3156
3182
|
/**
|
|
3157
3183
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -3489,7 +3515,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
3489
3515
|
traced(callback, args) {
|
|
3490
3516
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
3491
3517
|
const span = this.startSpan(argsRest);
|
|
3492
|
-
return
|
|
3518
|
+
return runCatchFinally(
|
|
3493
3519
|
() => {
|
|
3494
3520
|
if (setCurrent ?? true) {
|
|
3495
3521
|
return withCurrent(span, callback);
|
|
@@ -3497,6 +3523,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
3497
3523
|
return callback(span);
|
|
3498
3524
|
}
|
|
3499
3525
|
},
|
|
3526
|
+
(e) => {
|
|
3527
|
+
logError(span, e);
|
|
3528
|
+
throw e;
|
|
3529
|
+
},
|
|
3500
3530
|
() => span.end()
|
|
3501
3531
|
);
|
|
3502
3532
|
}
|
|
@@ -5273,7 +5303,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
5273
5303
|
}
|
|
5274
5304
|
var error = import_chalk.default.bold.red;
|
|
5275
5305
|
var warning = import_chalk.default.hex("#FFA500");
|
|
5276
|
-
function
|
|
5306
|
+
function logError2(e, verbose) {
|
|
5277
5307
|
if (!verbose) {
|
|
5278
5308
|
console.error(`${e}`);
|
|
5279
5309
|
} else {
|
|
@@ -5329,7 +5359,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
|
|
|
5329
5359
|
);
|
|
5330
5360
|
} else {
|
|
5331
5361
|
for (const result of failingResults) {
|
|
5332
|
-
|
|
5362
|
+
logError2(result.error, verbose);
|
|
5333
5363
|
}
|
|
5334
5364
|
}
|
|
5335
5365
|
if (!verbose && !jsonl) {
|
|
@@ -6397,7 +6427,7 @@ async function main() {
|
|
|
6397
6427
|
try {
|
|
6398
6428
|
await parsed.func(parsed);
|
|
6399
6429
|
} catch (e) {
|
|
6400
|
-
|
|
6430
|
+
logError2(e, parsed.verbose);
|
|
6401
6431
|
process.exit(1);
|
|
6402
6432
|
}
|
|
6403
6433
|
}
|
package/dist/index.d.mts
CHANGED
|
@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
291
291
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
292
292
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
293
293
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
294
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
294
295
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
295
296
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
296
297
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -744,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
744
745
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
745
746
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
746
747
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
748
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
747
749
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
748
750
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
749
751
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
package/dist/index.d.ts
CHANGED
|
@@ -291,6 +291,7 @@ declare class Logger<IsAsyncFlush extends boolean> implements Exportable {
|
|
|
291
291
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
292
292
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
293
293
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
294
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
294
295
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
295
296
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
296
297
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -744,6 +745,7 @@ declare class Experiment extends ObjectFetcher<ExperimentEvent> implements Expor
|
|
|
744
745
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
745
746
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
746
747
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
748
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
747
749
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
748
750
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
749
751
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
package/dist/index.js
CHANGED
|
@@ -335,16 +335,18 @@ var import_typespecs2 = require("@braintrust/core/typespecs");
|
|
|
335
335
|
|
|
336
336
|
// src/util.ts
|
|
337
337
|
var GLOBAL_PROJECT = "Global";
|
|
338
|
-
function
|
|
338
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
339
339
|
let runSyncCleanup = true;
|
|
340
340
|
try {
|
|
341
341
|
const ret = f();
|
|
342
342
|
if (ret instanceof Promise) {
|
|
343
343
|
runSyncCleanup = false;
|
|
344
|
-
return ret.finally(finallyF);
|
|
344
|
+
return ret.catch(catchF).finally(finallyF);
|
|
345
345
|
} else {
|
|
346
346
|
return ret;
|
|
347
347
|
}
|
|
348
|
+
} catch (e) {
|
|
349
|
+
return catchF(e);
|
|
348
350
|
} finally {
|
|
349
351
|
if (runSyncCleanup) {
|
|
350
352
|
finallyF();
|
|
@@ -1144,6 +1146,7 @@ var Logger = class {
|
|
|
1144
1146
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
1145
1147
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
1146
1148
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
1149
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
1147
1150
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
1148
1151
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
1149
1152
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1178,7 +1181,7 @@ var Logger = class {
|
|
|
1178
1181
|
traced(callback, args) {
|
|
1179
1182
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
1180
1183
|
const span = this.startSpan(argsRest);
|
|
1181
|
-
const ret =
|
|
1184
|
+
const ret = runCatchFinally(
|
|
1182
1185
|
() => {
|
|
1183
1186
|
if (setCurrent ?? true) {
|
|
1184
1187
|
return withCurrent(span, callback);
|
|
@@ -1186,6 +1189,10 @@ var Logger = class {
|
|
|
1186
1189
|
return callback(span);
|
|
1187
1190
|
}
|
|
1188
1191
|
},
|
|
1192
|
+
(e) => {
|
|
1193
|
+
logError(span, e);
|
|
1194
|
+
throw e;
|
|
1195
|
+
},
|
|
1189
1196
|
() => span.end()
|
|
1190
1197
|
);
|
|
1191
1198
|
if (this.asyncFlush) {
|
|
@@ -2119,9 +2126,22 @@ function getSpanParentObject(options) {
|
|
|
2119
2126
|
}
|
|
2120
2127
|
return NOOP_SPAN;
|
|
2121
2128
|
}
|
|
2129
|
+
function logError(span, error2) {
|
|
2130
|
+
let errorMessage = "<error>";
|
|
2131
|
+
let stackTrace = "";
|
|
2132
|
+
if (error2 instanceof Error) {
|
|
2133
|
+
errorMessage = error2.message;
|
|
2134
|
+
stackTrace = error2.stack || "";
|
|
2135
|
+
} else {
|
|
2136
|
+
errorMessage = String(error2);
|
|
2137
|
+
}
|
|
2138
|
+
span.log({ error: `${errorMessage}
|
|
2139
|
+
|
|
2140
|
+
${stackTrace}` });
|
|
2141
|
+
}
|
|
2122
2142
|
function traced(callback, args) {
|
|
2123
2143
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
2124
|
-
const ret =
|
|
2144
|
+
const ret = runCatchFinally(
|
|
2125
2145
|
() => {
|
|
2126
2146
|
if (args?.setCurrent ?? true) {
|
|
2127
2147
|
return withCurrent(span, callback);
|
|
@@ -2129,6 +2149,10 @@ function traced(callback, args) {
|
|
|
2129
2149
|
return callback(span);
|
|
2130
2150
|
}
|
|
2131
2151
|
},
|
|
2152
|
+
(e) => {
|
|
2153
|
+
logError(span, e);
|
|
2154
|
+
throw e;
|
|
2155
|
+
},
|
|
2132
2156
|
() => span.end()
|
|
2133
2157
|
);
|
|
2134
2158
|
if (args?.asyncFlush) {
|
|
@@ -2444,6 +2468,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2444
2468
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2445
2469
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2446
2470
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2471
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2447
2472
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and ground truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2448
2473
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2449
2474
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2473,7 +2498,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2473
2498
|
traced(callback, args) {
|
|
2474
2499
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2475
2500
|
const span = this.startSpan(argsRest);
|
|
2476
|
-
|
|
2501
|
+
const ret = runCatchFinally(
|
|
2477
2502
|
() => {
|
|
2478
2503
|
if (setCurrent ?? true) {
|
|
2479
2504
|
return withCurrent(span, callback);
|
|
@@ -2481,8 +2506,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2481
2506
|
return callback(span);
|
|
2482
2507
|
}
|
|
2483
2508
|
},
|
|
2509
|
+
(e) => {
|
|
2510
|
+
logError(span, e);
|
|
2511
|
+
throw e;
|
|
2512
|
+
},
|
|
2484
2513
|
() => span.end()
|
|
2485
2514
|
);
|
|
2515
|
+
return ret;
|
|
2486
2516
|
}
|
|
2487
2517
|
/**
|
|
2488
2518
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2823,7 +2853,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2823
2853
|
traced(callback, args) {
|
|
2824
2854
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2825
2855
|
const span = this.startSpan(argsRest);
|
|
2826
|
-
return
|
|
2856
|
+
return runCatchFinally(
|
|
2827
2857
|
() => {
|
|
2828
2858
|
if (setCurrent ?? true) {
|
|
2829
2859
|
return withCurrent(span, callback);
|
|
@@ -2831,6 +2861,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2831
2861
|
return callback(span);
|
|
2832
2862
|
}
|
|
2833
2863
|
},
|
|
2864
|
+
(e) => {
|
|
2865
|
+
logError(span, e);
|
|
2866
|
+
throw e;
|
|
2867
|
+
},
|
|
2834
2868
|
() => span.end()
|
|
2835
2869
|
);
|
|
2836
2870
|
}
|
|
@@ -4784,7 +4818,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
4784
4818
|
}
|
|
4785
4819
|
var error = import_chalk.default.bold.red;
|
|
4786
4820
|
var warning = import_chalk.default.hex("#FFA500");
|
|
4787
|
-
function
|
|
4821
|
+
function logError2(e, verbose) {
|
|
4788
4822
|
if (!verbose) {
|
|
4789
4823
|
console.error(`${e}`);
|
|
4790
4824
|
} else {
|
|
@@ -4840,7 +4874,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
|
|
|
4840
4874
|
);
|
|
4841
4875
|
} else {
|
|
4842
4876
|
for (const result of failingResults) {
|
|
4843
|
-
|
|
4877
|
+
logError2(result.error, verbose);
|
|
4844
4878
|
}
|
|
4845
4879
|
}
|
|
4846
4880
|
if (!verbose && !jsonl) {
|
package/dist/index.mjs
CHANGED
|
@@ -272,16 +272,18 @@ import {
|
|
|
272
272
|
|
|
273
273
|
// src/util.ts
|
|
274
274
|
var GLOBAL_PROJECT = "Global";
|
|
275
|
-
function
|
|
275
|
+
function runCatchFinally(f, catchF, finallyF) {
|
|
276
276
|
let runSyncCleanup = true;
|
|
277
277
|
try {
|
|
278
278
|
const ret = f();
|
|
279
279
|
if (ret instanceof Promise) {
|
|
280
280
|
runSyncCleanup = false;
|
|
281
|
-
return ret.finally(finallyF);
|
|
281
|
+
return ret.catch(catchF).finally(finallyF);
|
|
282
282
|
} else {
|
|
283
283
|
return ret;
|
|
284
284
|
}
|
|
285
|
+
} catch (e) {
|
|
286
|
+
return catchF(e);
|
|
285
287
|
} finally {
|
|
286
288
|
if (runSyncCleanup) {
|
|
287
289
|
finallyF();
|
|
@@ -1083,6 +1085,7 @@ var Logger = class {
|
|
|
1083
1085
|
* @param event.input: (Optional) the arguments that uniquely define a user input (an arbitrary, JSON serializable object).
|
|
1084
1086
|
* @param event.output: (Optional) the output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
1085
1087
|
* @param event.expected: (Optional) the ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
1088
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
1086
1089
|
* @param event.scores: (Optional) a dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare logs.
|
|
1087
1090
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
1088
1091
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -1117,7 +1120,7 @@ var Logger = class {
|
|
|
1117
1120
|
traced(callback, args) {
|
|
1118
1121
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
1119
1122
|
const span = this.startSpan(argsRest);
|
|
1120
|
-
const ret =
|
|
1123
|
+
const ret = runCatchFinally(
|
|
1121
1124
|
() => {
|
|
1122
1125
|
if (setCurrent ?? true) {
|
|
1123
1126
|
return withCurrent(span, callback);
|
|
@@ -1125,6 +1128,10 @@ var Logger = class {
|
|
|
1125
1128
|
return callback(span);
|
|
1126
1129
|
}
|
|
1127
1130
|
},
|
|
1131
|
+
(e) => {
|
|
1132
|
+
logError(span, e);
|
|
1133
|
+
throw e;
|
|
1134
|
+
},
|
|
1128
1135
|
() => span.end()
|
|
1129
1136
|
);
|
|
1130
1137
|
if (this.asyncFlush) {
|
|
@@ -2058,9 +2065,22 @@ function getSpanParentObject(options) {
|
|
|
2058
2065
|
}
|
|
2059
2066
|
return NOOP_SPAN;
|
|
2060
2067
|
}
|
|
2068
|
+
function logError(span, error2) {
|
|
2069
|
+
let errorMessage = "<error>";
|
|
2070
|
+
let stackTrace = "";
|
|
2071
|
+
if (error2 instanceof Error) {
|
|
2072
|
+
errorMessage = error2.message;
|
|
2073
|
+
stackTrace = error2.stack || "";
|
|
2074
|
+
} else {
|
|
2075
|
+
errorMessage = String(error2);
|
|
2076
|
+
}
|
|
2077
|
+
span.log({ error: `${errorMessage}
|
|
2078
|
+
|
|
2079
|
+
${stackTrace}` });
|
|
2080
|
+
}
|
|
2061
2081
|
function traced(callback, args) {
|
|
2062
2082
|
const { span, isSyncFlushLogger } = startSpanAndIsLogger(args);
|
|
2063
|
-
const ret =
|
|
2083
|
+
const ret = runCatchFinally(
|
|
2064
2084
|
() => {
|
|
2065
2085
|
if (args?.setCurrent ?? true) {
|
|
2066
2086
|
return withCurrent(span, callback);
|
|
@@ -2068,6 +2088,10 @@ function traced(callback, args) {
|
|
|
2068
2088
|
return callback(span);
|
|
2069
2089
|
}
|
|
2070
2090
|
},
|
|
2091
|
+
(e) => {
|
|
2092
|
+
logError(span, e);
|
|
2093
|
+
throw e;
|
|
2094
|
+
},
|
|
2071
2095
|
() => span.end()
|
|
2072
2096
|
);
|
|
2073
2097
|
if (args?.asyncFlush) {
|
|
@@ -2383,6 +2407,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2383
2407
|
* @param event.input: The arguments that uniquely define a test case (an arbitrary, JSON serializable object). Later on, Braintrust will use the `input` to know whether two test cases are the same between experiments, so they should not contain experiment-specific state. A simple rule of thumb is that if you run the same experiment twice, the `input` should be identical.
|
|
2384
2408
|
* @param event.output: The output of your application, including post-processing (an arbitrary, JSON serializable object), that allows you to determine whether the result is correct or not. For example, in an app that generates SQL queries, the `output` should be the _result_ of the SQL query generated by the model, not the query itself, because there may be multiple valid queries that answer a single question.
|
|
2385
2409
|
* @param event.expected: (Optional) The ground truth value (an arbitrary, JSON serializable object) that you'd compare to `output` to determine if your `output` value is correct or not. Braintrust currently does not compare `output` to `expected` for you, since there are so many different ways to do that correctly. Instead, these values are just used to help you navigate your experiments while digging into analyses. However, we may later use these values to re-score outputs or fine-tune your models.
|
|
2410
|
+
* @param event.error: (Optional) The error that occurred, if any. If you use tracing to run an experiment, errors are automatically logged when your code throws an exception.
|
|
2386
2411
|
* @param event.scores: A dictionary of numeric values (between 0 and 1) to log. The scores should give you a variety of signals that help you determine how accurate the outputs are compared to what you expect and diagnose failures. For example, a summarization app might have one score that tells you how accurate the summary is, and another that measures the word similarity between the generated and grouth truth summary. The word similarity score could help you determine whether the summarization was covering similar concepts or not. You can use these scores to help you sort, filter, and compare experiments.
|
|
2387
2412
|
* @param event.metadata: (Optional) a dictionary with additional data about the test example, model outputs, or just about anything else that's relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`, example's `id`, or anything else that would be useful to slice/dice later. The values in `metadata` can be any JSON-serializable type, but its keys must be strings.
|
|
2388
2413
|
* @param event.metrics: (Optional) a dictionary of metrics to log. The following keys are populated automatically: "start", "end".
|
|
@@ -2412,7 +2437,7 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2412
2437
|
traced(callback, args) {
|
|
2413
2438
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2414
2439
|
const span = this.startSpan(argsRest);
|
|
2415
|
-
|
|
2440
|
+
const ret = runCatchFinally(
|
|
2416
2441
|
() => {
|
|
2417
2442
|
if (setCurrent ?? true) {
|
|
2418
2443
|
return withCurrent(span, callback);
|
|
@@ -2420,8 +2445,13 @@ var Experiment = class extends ObjectFetcher {
|
|
|
2420
2445
|
return callback(span);
|
|
2421
2446
|
}
|
|
2422
2447
|
},
|
|
2448
|
+
(e) => {
|
|
2449
|
+
logError(span, e);
|
|
2450
|
+
throw e;
|
|
2451
|
+
},
|
|
2423
2452
|
() => span.end()
|
|
2424
2453
|
);
|
|
2454
|
+
return ret;
|
|
2425
2455
|
}
|
|
2426
2456
|
/**
|
|
2427
2457
|
* Lower-level alternative to `traced`. This allows you to start a span yourself, and can be useful in situations
|
|
@@ -2762,7 +2792,7 @@ var SpanImpl = class _SpanImpl {
|
|
|
2762
2792
|
traced(callback, args) {
|
|
2763
2793
|
const { setCurrent, ...argsRest } = args ?? {};
|
|
2764
2794
|
const span = this.startSpan(argsRest);
|
|
2765
|
-
return
|
|
2795
|
+
return runCatchFinally(
|
|
2766
2796
|
() => {
|
|
2767
2797
|
if (setCurrent ?? true) {
|
|
2768
2798
|
return withCurrent(span, callback);
|
|
@@ -2770,6 +2800,10 @@ var SpanImpl = class _SpanImpl {
|
|
|
2770
2800
|
return callback(span);
|
|
2771
2801
|
}
|
|
2772
2802
|
},
|
|
2803
|
+
(e) => {
|
|
2804
|
+
logError(span, e);
|
|
2805
|
+
throw e;
|
|
2806
|
+
},
|
|
2773
2807
|
() => span.end()
|
|
2774
2808
|
);
|
|
2775
2809
|
}
|
|
@@ -4725,7 +4759,7 @@ async function runEvaluatorInternal(experiment, evaluator, progressReporter, fil
|
|
|
4725
4759
|
}
|
|
4726
4760
|
var error = chalk.bold.red;
|
|
4727
4761
|
var warning = chalk.hex("#FFA500");
|
|
4728
|
-
function
|
|
4762
|
+
function logError2(e, verbose) {
|
|
4729
4763
|
if (!verbose) {
|
|
4730
4764
|
console.error(`${e}`);
|
|
4731
4765
|
} else {
|
|
@@ -4781,7 +4815,7 @@ function reportFailures(evaluator, failingResults, { verbose, jsonl }) {
|
|
|
4781
4815
|
);
|
|
4782
4816
|
} else {
|
|
4783
4817
|
for (const result of failingResults) {
|
|
4784
|
-
|
|
4818
|
+
logError2(result.error, verbose);
|
|
4785
4819
|
}
|
|
4786
4820
|
}
|
|
4787
4821
|
if (!verbose && !jsonl) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "braintrust",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.148",
|
|
4
4
|
"description": "SDK for integrating Braintrust",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -70,7 +70,7 @@
|
|
|
70
70
|
},
|
|
71
71
|
"dependencies": {
|
|
72
72
|
"@ai-sdk/provider": "^0.0.11",
|
|
73
|
-
"@braintrust/core": "0.0.
|
|
73
|
+
"@braintrust/core": "0.0.49",
|
|
74
74
|
"@next/env": "^14.2.3",
|
|
75
75
|
"@vercel/functions": "^1.0.2",
|
|
76
76
|
"ai": "^3.2.16",
|