braintrust 0.0.113 → 0.0.114

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -29,8 +29,14 @@ export interface EvalHooks {
29
29
  export type EvalScorerArgs<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType> = EvalCase<Input, Expected, Metadata> & {
30
30
  output: Output;
31
31
  };
32
- type ScoreValue = Score | number | null;
33
- export type EvalScorer<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType> = (args: EvalScorerArgs<Input, Output, Expected, Metadata>) => ScoreValue | Promise<ScoreValue>;
32
+ type ScoreValue = Score | number;
33
+ type OneOrMoreScores = ScoreValue | Array<ScoreValue> | null;
34
+ export type EvalScorer<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType> = (args: EvalScorerArgs<Input, Output, Expected, Metadata>) => OneOrMoreScores | Promise<OneOrMoreScores>;
35
+ export type EvalResult<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType> = EvalCase<Input, Expected, Metadata> & {
36
+ output: Output;
37
+ scores: Record<string, number | null>;
38
+ error: unknown;
39
+ };
34
40
  export interface Evaluator<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType> {
35
41
  /**
36
42
  * A function that returns a list of inputs, expected outputs, and metadata.
@@ -63,18 +69,56 @@ export interface Evaluator<Input, Output, Expected, Metadata extends BaseMetadat
63
69
  */
64
70
  isPublic?: boolean;
65
71
  }
72
+ export type EvalResultWithSummary<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType> = {
73
+ summary: ExperimentSummary;
74
+ results: EvalResult<Input, Output, Expected, Metadata>[];
75
+ };
76
+ export interface ReporterOpts {
77
+ verbose: boolean;
78
+ jsonl: boolean;
79
+ }
80
+ export interface ReporterBody<EvalReport> {
81
+ /**
82
+ * A function that takes an evaluator and its result and returns a report.
83
+ *
84
+ * @param evaluator
85
+ * @param result
86
+ * @param opts
87
+ */
88
+ reportEval(evaluator: EvaluatorDef<any, any, any, any>, result: EvalResultWithSummary<any, any, any, any>, opts: ReporterOpts): Promise<EvalReport> | EvalReport;
89
+ /**
90
+ * A function that takes all evaluator results and returns a boolean indicating
91
+ * whether the run was successful. If you return false, the `braintrust eval`
92
+ * command will exit with a non-zero status code.
93
+ *
94
+ * @param reports
95
+ */
96
+ reportRun(reports: EvalReport[]): boolean | Promise<boolean>;
97
+ }
98
+ export type ReporterDef<EvalReport> = {
99
+ name: string;
100
+ } & ReporterBody<EvalReport>;
66
101
  export type EvaluatorDef<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType> = {
67
102
  projectName: string;
68
103
  evalName: string;
69
104
  } & Evaluator<Input, Output, Expected, Metadata>;
70
105
  export type EvaluatorFile = {
71
- [evalName: string]: EvaluatorDef<any, any, any, any>;
106
+ evaluators: {
107
+ [evalName: string]: {
108
+ evaluator: EvaluatorDef<any, any, any, any>;
109
+ reporter?: ReporterDef<unknown> | string;
110
+ };
111
+ };
112
+ reporters: {
113
+ [reporterName: string]: ReporterDef<unknown>;
114
+ };
72
115
  };
73
116
  declare global {
74
117
  var _evals: EvaluatorFile;
75
118
  var _lazy_load: boolean;
76
119
  }
77
- export declare function Eval<Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType>(name: string, evaluator: Evaluator<Input, Output, Expected, Metadata>): Promise<ExperimentSummary>;
120
+ export declare function Eval<Input, Output, Expected = void, Metadata extends BaseMetadata = DefaultMetadataType, EvalReport = boolean>(name: string, evaluator: Evaluator<Input, Output, Expected, Metadata>, reporter?: ReporterDef<EvalReport> | string): Promise<EvalResultWithSummary<Input, Output, Expected, Metadata>>;
121
+ export declare function Reporter<EvalReport>(name: string, reporter: ReporterBody<EvalReport>): ReporterDef<EvalReport>;
78
122
  export declare function getLoadedEvals(): EvaluatorFile;
79
123
  export interface Filter {
80
124
  path: string[];
@@ -89,26 +133,16 @@ export declare function deserializePlainStringAsJSON(s: string): {
89
133
  error: unknown;
90
134
  };
91
135
  export declare function parseFilters(filters: string[]): Filter[];
92
- export declare function runEvaluator(experiment: Experiment | null, evaluator: EvaluatorDef<any, any, any | void, any | void>, progressReporter: ProgressReporter, filters: Filter[]): Promise<{
93
- results: {
94
- output: any;
95
- metadata: Record<string, unknown>;
96
- scores: Record<string, number | null>;
97
- error: unknown;
98
- }[];
99
- summary: ExperimentSummary | null;
100
- }>;
136
+ export declare function runEvaluator(experiment: Experiment | null, evaluator: EvaluatorDef<any, any, any, any>, progressReporter: ProgressReporter, filters: Filter[]): Promise<EvalResultWithSummary<any, any, any, any>>;
101
137
  export declare const error: chalk.Chalk;
102
138
  export declare const warning: chalk.Chalk;
103
139
  export declare function logError(e: unknown, verbose: boolean): void;
104
- export declare function reportEvaluatorResult(evaluatorName: string | number, evaluatorResult: {
105
- results: {
106
- scores: Record<string, number | null>;
107
- error: unknown;
108
- }[];
109
- summary: unknown;
110
- }, { verbose, jsonl, }: {
111
- verbose: boolean;
112
- jsonl: boolean;
113
- }): void;
140
+ export declare function buildLocalSummary(evaluator: EvaluatorDef<any, any, any, any>, results: EvalResult<any, any, any, any>[]): ExperimentSummary;
141
+ export declare function reportFailures<Input, Output, Expected, Metadata extends BaseMetadata>(evaluator: EvaluatorDef<Input, Output, Expected, Metadata>, failingResults: EvalResult<Input, Output, Expected, Metadata>[], { verbose, jsonl }: ReporterOpts): void;
142
+ /**
143
+ * The default reporter for Braintrust evaluations. This reporter will log the results
144
+ * of each evaluation to the console, and will return false (i.e. fail) if any of the
145
+ * evaluations return an error.
146
+ */
147
+ export declare const defaultReporter: ReporterDef<boolean>;
114
148
  export {};
package/dist/index.d.ts CHANGED
@@ -43,6 +43,6 @@
43
43
  * @module braintrust
44
44
  */
45
45
  export * from "./logger";
46
- export { BaseExperiment, Evaluator, EvalTask, Eval, EvalScorerArgs, } from "./framework";
46
+ export { BaseExperiment, Evaluator, EvalTask, Eval, EvalScorerArgs, ReporterBody, Reporter, buildLocalSummary, reportFailures, } from "./framework";
47
47
  export * from "./oai";
48
48
  export { ParentExperimentIds, ParentProjectLogIds, IdField, InputField, InputsField, OtherExperimentLogFields, ExperimentLogPartialArgs, ExperimentLogFullArgs, LogFeedbackFullArgs, LogCommentFullArgs, CommentEvent, DatasetRecord, } from "@braintrust/core";