@browserbasehq/stagehand 2.5.3 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +69 -28
- package/dist/index.js +24 -40
- package/dist/lib/a11y/utils.d.ts +3 -19
- package/dist/lib/index.d.ts +3 -0
- package/dist/lib/version.d.ts +1 -1
- package/dist/types/context.d.ts +2 -0
- package/dist/types/stagehandErrors.d.ts +1 -0
- package/package.json +25 -27
- package/dist/evals/cli.js +0 -965
- package/dist/evals/evals.config.json +0 -553
- package/dist/examples/accessibility_tests.d.ts +0 -16
- package/dist/examples/download.d.ts +0 -3
- package/dist/examples/evaluator.d.ts +0 -1
- package/dist/examples/mem_test.d.ts +0 -1
- package/dist/examples/multi_page.d.ts +0 -1
- package/dist/examples/perf_test.d.ts +0 -1
- package/dist/examples/pwtest.d.ts +0 -1
- package/dist/examples/test.d.ts +0 -1
package/dist/evals/cli.js
DELETED
|
@@ -1,965 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
var __create = Object.create;
|
|
3
|
-
var __defProp = Object.defineProperty;
|
|
4
|
-
var __defProps = Object.defineProperties;
|
|
5
|
-
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
6
|
-
var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
|
|
7
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
8
|
-
var __getOwnPropSymbols = Object.getOwnPropertySymbols;
|
|
9
|
-
var __getProtoOf = Object.getPrototypeOf;
|
|
10
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
11
|
-
var __propIsEnum = Object.prototype.propertyIsEnumerable;
|
|
12
|
-
var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
|
|
13
|
-
var __spreadValues = (a, b) => {
|
|
14
|
-
for (var prop in b || (b = {}))
|
|
15
|
-
if (__hasOwnProp.call(b, prop))
|
|
16
|
-
__defNormalProp(a, prop, b[prop]);
|
|
17
|
-
if (__getOwnPropSymbols)
|
|
18
|
-
for (var prop of __getOwnPropSymbols(b)) {
|
|
19
|
-
if (__propIsEnum.call(b, prop))
|
|
20
|
-
__defNormalProp(a, prop, b[prop]);
|
|
21
|
-
}
|
|
22
|
-
return a;
|
|
23
|
-
};
|
|
24
|
-
var __spreadProps = (a, b) => __defProps(a, __getOwnPropDescs(b));
|
|
25
|
-
var __copyProps = (to, from, except, desc) => {
|
|
26
|
-
if (from && typeof from === "object" || typeof from === "function") {
|
|
27
|
-
for (let key of __getOwnPropNames(from))
|
|
28
|
-
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
29
|
-
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
30
|
-
}
|
|
31
|
-
return to;
|
|
32
|
-
};
|
|
33
|
-
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
34
|
-
// If the importer is in node compatibility mode or this is not an ESM
|
|
35
|
-
// file that has been converted to a CommonJS file using a Babel-
|
|
36
|
-
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
37
|
-
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
38
|
-
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
39
|
-
mod
|
|
40
|
-
));
|
|
41
|
-
|
|
42
|
-
// cli.ts
|
|
43
|
-
var import_process = __toESM(require("process"));
|
|
44
|
-
|
|
45
|
-
// ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/vendor/ansi-styles/index.js
|
|
46
|
-
var ANSI_BACKGROUND_OFFSET = 10;
|
|
47
|
-
var wrapAnsi16 = (offset = 0) => (code) => `\x1B[${code + offset}m`;
|
|
48
|
-
var wrapAnsi256 = (offset = 0) => (code) => `\x1B[${38 + offset};5;${code}m`;
|
|
49
|
-
var wrapAnsi16m = (offset = 0) => (red, green, blue) => `\x1B[${38 + offset};2;${red};${green};${blue}m`;
|
|
50
|
-
var styles = {
|
|
51
|
-
modifier: {
|
|
52
|
-
reset: [0, 0],
|
|
53
|
-
// 21 isn't widely supported and 22 does the same thing
|
|
54
|
-
bold: [1, 22],
|
|
55
|
-
dim: [2, 22],
|
|
56
|
-
italic: [3, 23],
|
|
57
|
-
underline: [4, 24],
|
|
58
|
-
overline: [53, 55],
|
|
59
|
-
inverse: [7, 27],
|
|
60
|
-
hidden: [8, 28],
|
|
61
|
-
strikethrough: [9, 29]
|
|
62
|
-
},
|
|
63
|
-
color: {
|
|
64
|
-
black: [30, 39],
|
|
65
|
-
red: [31, 39],
|
|
66
|
-
green: [32, 39],
|
|
67
|
-
yellow: [33, 39],
|
|
68
|
-
blue: [34, 39],
|
|
69
|
-
magenta: [35, 39],
|
|
70
|
-
cyan: [36, 39],
|
|
71
|
-
white: [37, 39],
|
|
72
|
-
// Bright color
|
|
73
|
-
blackBright: [90, 39],
|
|
74
|
-
gray: [90, 39],
|
|
75
|
-
// Alias of `blackBright`
|
|
76
|
-
grey: [90, 39],
|
|
77
|
-
// Alias of `blackBright`
|
|
78
|
-
redBright: [91, 39],
|
|
79
|
-
greenBright: [92, 39],
|
|
80
|
-
yellowBright: [93, 39],
|
|
81
|
-
blueBright: [94, 39],
|
|
82
|
-
magentaBright: [95, 39],
|
|
83
|
-
cyanBright: [96, 39],
|
|
84
|
-
whiteBright: [97, 39]
|
|
85
|
-
},
|
|
86
|
-
bgColor: {
|
|
87
|
-
bgBlack: [40, 49],
|
|
88
|
-
bgRed: [41, 49],
|
|
89
|
-
bgGreen: [42, 49],
|
|
90
|
-
bgYellow: [43, 49],
|
|
91
|
-
bgBlue: [44, 49],
|
|
92
|
-
bgMagenta: [45, 49],
|
|
93
|
-
bgCyan: [46, 49],
|
|
94
|
-
bgWhite: [47, 49],
|
|
95
|
-
// Bright color
|
|
96
|
-
bgBlackBright: [100, 49],
|
|
97
|
-
bgGray: [100, 49],
|
|
98
|
-
// Alias of `bgBlackBright`
|
|
99
|
-
bgGrey: [100, 49],
|
|
100
|
-
// Alias of `bgBlackBright`
|
|
101
|
-
bgRedBright: [101, 49],
|
|
102
|
-
bgGreenBright: [102, 49],
|
|
103
|
-
bgYellowBright: [103, 49],
|
|
104
|
-
bgBlueBright: [104, 49],
|
|
105
|
-
bgMagentaBright: [105, 49],
|
|
106
|
-
bgCyanBright: [106, 49],
|
|
107
|
-
bgWhiteBright: [107, 49]
|
|
108
|
-
}
|
|
109
|
-
};
|
|
110
|
-
var modifierNames = Object.keys(styles.modifier);
|
|
111
|
-
var foregroundColorNames = Object.keys(styles.color);
|
|
112
|
-
var backgroundColorNames = Object.keys(styles.bgColor);
|
|
113
|
-
var colorNames = [...foregroundColorNames, ...backgroundColorNames];
|
|
114
|
-
function assembleStyles() {
|
|
115
|
-
const codes = /* @__PURE__ */ new Map();
|
|
116
|
-
for (const [groupName, group] of Object.entries(styles)) {
|
|
117
|
-
for (const [styleName, style] of Object.entries(group)) {
|
|
118
|
-
styles[styleName] = {
|
|
119
|
-
open: `\x1B[${style[0]}m`,
|
|
120
|
-
close: `\x1B[${style[1]}m`
|
|
121
|
-
};
|
|
122
|
-
group[styleName] = styles[styleName];
|
|
123
|
-
codes.set(style[0], style[1]);
|
|
124
|
-
}
|
|
125
|
-
Object.defineProperty(styles, groupName, {
|
|
126
|
-
value: group,
|
|
127
|
-
enumerable: false
|
|
128
|
-
});
|
|
129
|
-
}
|
|
130
|
-
Object.defineProperty(styles, "codes", {
|
|
131
|
-
value: codes,
|
|
132
|
-
enumerable: false
|
|
133
|
-
});
|
|
134
|
-
styles.color.close = "\x1B[39m";
|
|
135
|
-
styles.bgColor.close = "\x1B[49m";
|
|
136
|
-
styles.color.ansi = wrapAnsi16();
|
|
137
|
-
styles.color.ansi256 = wrapAnsi256();
|
|
138
|
-
styles.color.ansi16m = wrapAnsi16m();
|
|
139
|
-
styles.bgColor.ansi = wrapAnsi16(ANSI_BACKGROUND_OFFSET);
|
|
140
|
-
styles.bgColor.ansi256 = wrapAnsi256(ANSI_BACKGROUND_OFFSET);
|
|
141
|
-
styles.bgColor.ansi16m = wrapAnsi16m(ANSI_BACKGROUND_OFFSET);
|
|
142
|
-
Object.defineProperties(styles, {
|
|
143
|
-
rgbToAnsi256: {
|
|
144
|
-
value(red, green, blue) {
|
|
145
|
-
if (red === green && green === blue) {
|
|
146
|
-
if (red < 8) {
|
|
147
|
-
return 16;
|
|
148
|
-
}
|
|
149
|
-
if (red > 248) {
|
|
150
|
-
return 231;
|
|
151
|
-
}
|
|
152
|
-
return Math.round((red - 8) / 247 * 24) + 232;
|
|
153
|
-
}
|
|
154
|
-
return 16 + 36 * Math.round(red / 255 * 5) + 6 * Math.round(green / 255 * 5) + Math.round(blue / 255 * 5);
|
|
155
|
-
},
|
|
156
|
-
enumerable: false
|
|
157
|
-
},
|
|
158
|
-
hexToRgb: {
|
|
159
|
-
value(hex) {
|
|
160
|
-
const matches = /[a-f\d]{6}|[a-f\d]{3}/i.exec(hex.toString(16));
|
|
161
|
-
if (!matches) {
|
|
162
|
-
return [0, 0, 0];
|
|
163
|
-
}
|
|
164
|
-
let [colorString] = matches;
|
|
165
|
-
if (colorString.length === 3) {
|
|
166
|
-
colorString = [...colorString].map((character) => character + character).join("");
|
|
167
|
-
}
|
|
168
|
-
const integer = Number.parseInt(colorString, 16);
|
|
169
|
-
return [
|
|
170
|
-
/* eslint-disable no-bitwise */
|
|
171
|
-
integer >> 16 & 255,
|
|
172
|
-
integer >> 8 & 255,
|
|
173
|
-
integer & 255
|
|
174
|
-
/* eslint-enable no-bitwise */
|
|
175
|
-
];
|
|
176
|
-
},
|
|
177
|
-
enumerable: false
|
|
178
|
-
},
|
|
179
|
-
hexToAnsi256: {
|
|
180
|
-
value: (hex) => styles.rgbToAnsi256(...styles.hexToRgb(hex)),
|
|
181
|
-
enumerable: false
|
|
182
|
-
},
|
|
183
|
-
ansi256ToAnsi: {
|
|
184
|
-
value(code) {
|
|
185
|
-
if (code < 8) {
|
|
186
|
-
return 30 + code;
|
|
187
|
-
}
|
|
188
|
-
if (code < 16) {
|
|
189
|
-
return 90 + (code - 8);
|
|
190
|
-
}
|
|
191
|
-
let red;
|
|
192
|
-
let green;
|
|
193
|
-
let blue;
|
|
194
|
-
if (code >= 232) {
|
|
195
|
-
red = ((code - 232) * 10 + 8) / 255;
|
|
196
|
-
green = red;
|
|
197
|
-
blue = red;
|
|
198
|
-
} else {
|
|
199
|
-
code -= 16;
|
|
200
|
-
const remainder = code % 36;
|
|
201
|
-
red = Math.floor(code / 36) / 5;
|
|
202
|
-
green = Math.floor(remainder / 6) / 5;
|
|
203
|
-
blue = remainder % 6 / 5;
|
|
204
|
-
}
|
|
205
|
-
const value = Math.max(red, green, blue) * 2;
|
|
206
|
-
if (value === 0) {
|
|
207
|
-
return 30;
|
|
208
|
-
}
|
|
209
|
-
let result = 30 + (Math.round(blue) << 2 | Math.round(green) << 1 | Math.round(red));
|
|
210
|
-
if (value === 2) {
|
|
211
|
-
result += 60;
|
|
212
|
-
}
|
|
213
|
-
return result;
|
|
214
|
-
},
|
|
215
|
-
enumerable: false
|
|
216
|
-
},
|
|
217
|
-
rgbToAnsi: {
|
|
218
|
-
value: (red, green, blue) => styles.ansi256ToAnsi(styles.rgbToAnsi256(red, green, blue)),
|
|
219
|
-
enumerable: false
|
|
220
|
-
},
|
|
221
|
-
hexToAnsi: {
|
|
222
|
-
value: (hex) => styles.ansi256ToAnsi(styles.hexToAnsi256(hex)),
|
|
223
|
-
enumerable: false
|
|
224
|
-
}
|
|
225
|
-
});
|
|
226
|
-
return styles;
|
|
227
|
-
}
|
|
228
|
-
var ansiStyles = assembleStyles();
|
|
229
|
-
var ansi_styles_default = ansiStyles;
|
|
230
|
-
|
|
231
|
-
// ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/vendor/supports-color/index.js
|
|
232
|
-
var import_node_process = __toESM(require("process"), 1);
|
|
233
|
-
var import_node_os = __toESM(require("os"), 1);
|
|
234
|
-
var import_node_tty = __toESM(require("tty"), 1);
|
|
235
|
-
function hasFlag(flag, argv = globalThis.Deno ? globalThis.Deno.args : import_node_process.default.argv) {
|
|
236
|
-
const prefix = flag.startsWith("-") ? "" : flag.length === 1 ? "-" : "--";
|
|
237
|
-
const position = argv.indexOf(prefix + flag);
|
|
238
|
-
const terminatorPosition = argv.indexOf("--");
|
|
239
|
-
return position !== -1 && (terminatorPosition === -1 || position < terminatorPosition);
|
|
240
|
-
}
|
|
241
|
-
var { env } = import_node_process.default;
|
|
242
|
-
var flagForceColor;
|
|
243
|
-
if (hasFlag("no-color") || hasFlag("no-colors") || hasFlag("color=false") || hasFlag("color=never")) {
|
|
244
|
-
flagForceColor = 0;
|
|
245
|
-
} else if (hasFlag("color") || hasFlag("colors") || hasFlag("color=true") || hasFlag("color=always")) {
|
|
246
|
-
flagForceColor = 1;
|
|
247
|
-
}
|
|
248
|
-
function envForceColor() {
|
|
249
|
-
if ("FORCE_COLOR" in env) {
|
|
250
|
-
if (env.FORCE_COLOR === "true") {
|
|
251
|
-
return 1;
|
|
252
|
-
}
|
|
253
|
-
if (env.FORCE_COLOR === "false") {
|
|
254
|
-
return 0;
|
|
255
|
-
}
|
|
256
|
-
return env.FORCE_COLOR.length === 0 ? 1 : Math.min(Number.parseInt(env.FORCE_COLOR, 10), 3);
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
function translateLevel(level) {
|
|
260
|
-
if (level === 0) {
|
|
261
|
-
return false;
|
|
262
|
-
}
|
|
263
|
-
return {
|
|
264
|
-
level,
|
|
265
|
-
hasBasic: true,
|
|
266
|
-
has256: level >= 2,
|
|
267
|
-
has16m: level >= 3
|
|
268
|
-
};
|
|
269
|
-
}
|
|
270
|
-
function _supportsColor(haveStream, { streamIsTTY, sniffFlags = true } = {}) {
|
|
271
|
-
const noFlagForceColor = envForceColor();
|
|
272
|
-
if (noFlagForceColor !== void 0) {
|
|
273
|
-
flagForceColor = noFlagForceColor;
|
|
274
|
-
}
|
|
275
|
-
const forceColor = sniffFlags ? flagForceColor : noFlagForceColor;
|
|
276
|
-
if (forceColor === 0) {
|
|
277
|
-
return 0;
|
|
278
|
-
}
|
|
279
|
-
if (sniffFlags) {
|
|
280
|
-
if (hasFlag("color=16m") || hasFlag("color=full") || hasFlag("color=truecolor")) {
|
|
281
|
-
return 3;
|
|
282
|
-
}
|
|
283
|
-
if (hasFlag("color=256")) {
|
|
284
|
-
return 2;
|
|
285
|
-
}
|
|
286
|
-
}
|
|
287
|
-
if ("TF_BUILD" in env && "AGENT_NAME" in env) {
|
|
288
|
-
return 1;
|
|
289
|
-
}
|
|
290
|
-
if (haveStream && !streamIsTTY && forceColor === void 0) {
|
|
291
|
-
return 0;
|
|
292
|
-
}
|
|
293
|
-
const min = forceColor || 0;
|
|
294
|
-
if (env.TERM === "dumb") {
|
|
295
|
-
return min;
|
|
296
|
-
}
|
|
297
|
-
if (import_node_process.default.platform === "win32") {
|
|
298
|
-
const osRelease = import_node_os.default.release().split(".");
|
|
299
|
-
if (Number(osRelease[0]) >= 10 && Number(osRelease[2]) >= 10586) {
|
|
300
|
-
return Number(osRelease[2]) >= 14931 ? 3 : 2;
|
|
301
|
-
}
|
|
302
|
-
return 1;
|
|
303
|
-
}
|
|
304
|
-
if ("CI" in env) {
|
|
305
|
-
if (["GITHUB_ACTIONS", "GITEA_ACTIONS", "CIRCLECI"].some((key) => key in env)) {
|
|
306
|
-
return 3;
|
|
307
|
-
}
|
|
308
|
-
if (["TRAVIS", "APPVEYOR", "GITLAB_CI", "BUILDKITE", "DRONE"].some((sign) => sign in env) || env.CI_NAME === "codeship") {
|
|
309
|
-
return 1;
|
|
310
|
-
}
|
|
311
|
-
return min;
|
|
312
|
-
}
|
|
313
|
-
if ("TEAMCITY_VERSION" in env) {
|
|
314
|
-
return /^(9\.(0*[1-9]\d*)\.|\d{2,}\.)/.test(env.TEAMCITY_VERSION) ? 1 : 0;
|
|
315
|
-
}
|
|
316
|
-
if (env.COLORTERM === "truecolor") {
|
|
317
|
-
return 3;
|
|
318
|
-
}
|
|
319
|
-
if (env.TERM === "xterm-kitty") {
|
|
320
|
-
return 3;
|
|
321
|
-
}
|
|
322
|
-
if ("TERM_PROGRAM" in env) {
|
|
323
|
-
const version = Number.parseInt((env.TERM_PROGRAM_VERSION || "").split(".")[0], 10);
|
|
324
|
-
switch (env.TERM_PROGRAM) {
|
|
325
|
-
case "iTerm.app": {
|
|
326
|
-
return version >= 3 ? 3 : 2;
|
|
327
|
-
}
|
|
328
|
-
case "Apple_Terminal": {
|
|
329
|
-
return 2;
|
|
330
|
-
}
|
|
331
|
-
}
|
|
332
|
-
}
|
|
333
|
-
if (/-256(color)?$/i.test(env.TERM)) {
|
|
334
|
-
return 2;
|
|
335
|
-
}
|
|
336
|
-
if (/^screen|^xterm|^vt100|^vt220|^rxvt|color|ansi|cygwin|linux/i.test(env.TERM)) {
|
|
337
|
-
return 1;
|
|
338
|
-
}
|
|
339
|
-
if ("COLORTERM" in env) {
|
|
340
|
-
return 1;
|
|
341
|
-
}
|
|
342
|
-
return min;
|
|
343
|
-
}
|
|
344
|
-
function createSupportsColor(stream, options = {}) {
|
|
345
|
-
const level = _supportsColor(stream, __spreadValues({
|
|
346
|
-
streamIsTTY: stream && stream.isTTY
|
|
347
|
-
}, options));
|
|
348
|
-
return translateLevel(level);
|
|
349
|
-
}
|
|
350
|
-
var supportsColor = {
|
|
351
|
-
stdout: createSupportsColor({ isTTY: import_node_tty.default.isatty(1) }),
|
|
352
|
-
stderr: createSupportsColor({ isTTY: import_node_tty.default.isatty(2) })
|
|
353
|
-
};
|
|
354
|
-
var supports_color_default = supportsColor;
|
|
355
|
-
|
|
356
|
-
// ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/utilities.js
|
|
357
|
-
function stringReplaceAll(string, substring, replacer) {
|
|
358
|
-
let index = string.indexOf(substring);
|
|
359
|
-
if (index === -1) {
|
|
360
|
-
return string;
|
|
361
|
-
}
|
|
362
|
-
const substringLength = substring.length;
|
|
363
|
-
let endIndex = 0;
|
|
364
|
-
let returnValue = "";
|
|
365
|
-
do {
|
|
366
|
-
returnValue += string.slice(endIndex, index) + substring + replacer;
|
|
367
|
-
endIndex = index + substringLength;
|
|
368
|
-
index = string.indexOf(substring, endIndex);
|
|
369
|
-
} while (index !== -1);
|
|
370
|
-
returnValue += string.slice(endIndex);
|
|
371
|
-
return returnValue;
|
|
372
|
-
}
|
|
373
|
-
function stringEncaseCRLFWithFirstIndex(string, prefix, postfix, index) {
|
|
374
|
-
let endIndex = 0;
|
|
375
|
-
let returnValue = "";
|
|
376
|
-
do {
|
|
377
|
-
const gotCR = string[index - 1] === "\r";
|
|
378
|
-
returnValue += string.slice(endIndex, gotCR ? index - 1 : index) + prefix + (gotCR ? "\r\n" : "\n") + postfix;
|
|
379
|
-
endIndex = index + 1;
|
|
380
|
-
index = string.indexOf("\n", endIndex);
|
|
381
|
-
} while (index !== -1);
|
|
382
|
-
returnValue += string.slice(endIndex);
|
|
383
|
-
return returnValue;
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
// ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/index.js
|
|
387
|
-
var { stdout: stdoutColor, stderr: stderrColor } = supports_color_default;
|
|
388
|
-
var GENERATOR = Symbol("GENERATOR");
|
|
389
|
-
var STYLER = Symbol("STYLER");
|
|
390
|
-
var IS_EMPTY = Symbol("IS_EMPTY");
|
|
391
|
-
var levelMapping = [
|
|
392
|
-
"ansi",
|
|
393
|
-
"ansi",
|
|
394
|
-
"ansi256",
|
|
395
|
-
"ansi16m"
|
|
396
|
-
];
|
|
397
|
-
var styles2 = /* @__PURE__ */ Object.create(null);
|
|
398
|
-
var applyOptions = (object, options = {}) => {
|
|
399
|
-
if (options.level && !(Number.isInteger(options.level) && options.level >= 0 && options.level <= 3)) {
|
|
400
|
-
throw new Error("The `level` option should be an integer from 0 to 3");
|
|
401
|
-
}
|
|
402
|
-
const colorLevel = stdoutColor ? stdoutColor.level : 0;
|
|
403
|
-
object.level = options.level === void 0 ? colorLevel : options.level;
|
|
404
|
-
};
|
|
405
|
-
var chalkFactory = (options) => {
|
|
406
|
-
const chalk2 = (...strings) => strings.join(" ");
|
|
407
|
-
applyOptions(chalk2, options);
|
|
408
|
-
Object.setPrototypeOf(chalk2, createChalk.prototype);
|
|
409
|
-
return chalk2;
|
|
410
|
-
};
|
|
411
|
-
function createChalk(options) {
|
|
412
|
-
return chalkFactory(options);
|
|
413
|
-
}
|
|
414
|
-
Object.setPrototypeOf(createChalk.prototype, Function.prototype);
|
|
415
|
-
for (const [styleName, style] of Object.entries(ansi_styles_default)) {
|
|
416
|
-
styles2[styleName] = {
|
|
417
|
-
get() {
|
|
418
|
-
const builder = createBuilder(this, createStyler(style.open, style.close, this[STYLER]), this[IS_EMPTY]);
|
|
419
|
-
Object.defineProperty(this, styleName, { value: builder });
|
|
420
|
-
return builder;
|
|
421
|
-
}
|
|
422
|
-
};
|
|
423
|
-
}
|
|
424
|
-
styles2.visible = {
|
|
425
|
-
get() {
|
|
426
|
-
const builder = createBuilder(this, this[STYLER], true);
|
|
427
|
-
Object.defineProperty(this, "visible", { value: builder });
|
|
428
|
-
return builder;
|
|
429
|
-
}
|
|
430
|
-
};
|
|
431
|
-
var getModelAnsi = (model, level, type, ...arguments_) => {
|
|
432
|
-
if (model === "rgb") {
|
|
433
|
-
if (level === "ansi16m") {
|
|
434
|
-
return ansi_styles_default[type].ansi16m(...arguments_);
|
|
435
|
-
}
|
|
436
|
-
if (level === "ansi256") {
|
|
437
|
-
return ansi_styles_default[type].ansi256(ansi_styles_default.rgbToAnsi256(...arguments_));
|
|
438
|
-
}
|
|
439
|
-
return ansi_styles_default[type].ansi(ansi_styles_default.rgbToAnsi(...arguments_));
|
|
440
|
-
}
|
|
441
|
-
if (model === "hex") {
|
|
442
|
-
return getModelAnsi("rgb", level, type, ...ansi_styles_default.hexToRgb(...arguments_));
|
|
443
|
-
}
|
|
444
|
-
return ansi_styles_default[type][model](...arguments_);
|
|
445
|
-
};
|
|
446
|
-
var usedModels = ["rgb", "hex", "ansi256"];
|
|
447
|
-
for (const model of usedModels) {
|
|
448
|
-
styles2[model] = {
|
|
449
|
-
get() {
|
|
450
|
-
const { level } = this;
|
|
451
|
-
return function(...arguments_) {
|
|
452
|
-
const styler = createStyler(getModelAnsi(model, levelMapping[level], "color", ...arguments_), ansi_styles_default.color.close, this[STYLER]);
|
|
453
|
-
return createBuilder(this, styler, this[IS_EMPTY]);
|
|
454
|
-
};
|
|
455
|
-
}
|
|
456
|
-
};
|
|
457
|
-
const bgModel = "bg" + model[0].toUpperCase() + model.slice(1);
|
|
458
|
-
styles2[bgModel] = {
|
|
459
|
-
get() {
|
|
460
|
-
const { level } = this;
|
|
461
|
-
return function(...arguments_) {
|
|
462
|
-
const styler = createStyler(getModelAnsi(model, levelMapping[level], "bgColor", ...arguments_), ansi_styles_default.bgColor.close, this[STYLER]);
|
|
463
|
-
return createBuilder(this, styler, this[IS_EMPTY]);
|
|
464
|
-
};
|
|
465
|
-
}
|
|
466
|
-
};
|
|
467
|
-
}
|
|
468
|
-
var proto = Object.defineProperties(() => {
|
|
469
|
-
}, __spreadProps(__spreadValues({}, styles2), {
|
|
470
|
-
level: {
|
|
471
|
-
enumerable: true,
|
|
472
|
-
get() {
|
|
473
|
-
return this[GENERATOR].level;
|
|
474
|
-
},
|
|
475
|
-
set(level) {
|
|
476
|
-
this[GENERATOR].level = level;
|
|
477
|
-
}
|
|
478
|
-
}
|
|
479
|
-
}));
|
|
480
|
-
var createStyler = (open, close, parent) => {
|
|
481
|
-
let openAll;
|
|
482
|
-
let closeAll;
|
|
483
|
-
if (parent === void 0) {
|
|
484
|
-
openAll = open;
|
|
485
|
-
closeAll = close;
|
|
486
|
-
} else {
|
|
487
|
-
openAll = parent.openAll + open;
|
|
488
|
-
closeAll = close + parent.closeAll;
|
|
489
|
-
}
|
|
490
|
-
return {
|
|
491
|
-
open,
|
|
492
|
-
close,
|
|
493
|
-
openAll,
|
|
494
|
-
closeAll,
|
|
495
|
-
parent
|
|
496
|
-
};
|
|
497
|
-
};
|
|
498
|
-
var createBuilder = (self, _styler, _isEmpty) => {
|
|
499
|
-
const builder = (...arguments_) => applyStyle(builder, arguments_.length === 1 ? "" + arguments_[0] : arguments_.join(" "));
|
|
500
|
-
Object.setPrototypeOf(builder, proto);
|
|
501
|
-
builder[GENERATOR] = self;
|
|
502
|
-
builder[STYLER] = _styler;
|
|
503
|
-
builder[IS_EMPTY] = _isEmpty;
|
|
504
|
-
return builder;
|
|
505
|
-
};
|
|
506
|
-
var applyStyle = (self, string) => {
|
|
507
|
-
if (self.level <= 0 || !string) {
|
|
508
|
-
return self[IS_EMPTY] ? "" : string;
|
|
509
|
-
}
|
|
510
|
-
let styler = self[STYLER];
|
|
511
|
-
if (styler === void 0) {
|
|
512
|
-
return string;
|
|
513
|
-
}
|
|
514
|
-
const { openAll, closeAll } = styler;
|
|
515
|
-
if (string.includes("\x1B")) {
|
|
516
|
-
while (styler !== void 0) {
|
|
517
|
-
string = stringReplaceAll(string, styler.close, styler.open);
|
|
518
|
-
styler = styler.parent;
|
|
519
|
-
}
|
|
520
|
-
}
|
|
521
|
-
const lfIndex = string.indexOf("\n");
|
|
522
|
-
if (lfIndex !== -1) {
|
|
523
|
-
string = stringEncaseCRLFWithFirstIndex(string, closeAll, openAll, lfIndex);
|
|
524
|
-
}
|
|
525
|
-
return openAll + string + closeAll;
|
|
526
|
-
};
|
|
527
|
-
Object.defineProperties(createChalk.prototype, styles2);
|
|
528
|
-
var chalk = createChalk();
|
|
529
|
-
var chalkStderr = createChalk({ level: stderrColor ? stderrColor.level : 0 });
|
|
530
|
-
var source_default = chalk;
|
|
531
|
-
|
|
532
|
-
// cli.ts
|
|
533
|
-
var import_fs = __toESM(require("fs"));
|
|
534
|
-
var import_path = __toESM(require("path"));
|
|
535
|
-
var import_child_process = require("child_process");
|
|
536
|
-
var CONFIG_PATH = import_path.default.join(__dirname, "evals.config.json");
|
|
537
|
-
function loadConfig() {
|
|
538
|
-
return JSON.parse(import_fs.default.readFileSync(CONFIG_PATH, "utf-8"));
|
|
539
|
-
}
|
|
540
|
-
function saveConfig(config) {
|
|
541
|
-
import_fs.default.writeFileSync(CONFIG_PATH, JSON.stringify(config, null, 2));
|
|
542
|
-
}
|
|
543
|
-
function printHelp() {
|
|
544
|
-
console.log(
|
|
545
|
-
source_default.yellow(`\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
546
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28A0\u287E\u283B\u28F6\u2840\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
547
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28A0\u2876\u281B\u28B3\u2846\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
548
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u28B8\u28F7\u2836\u28E6\u28F4\u2836\u28FE\u2847\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
549
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
550
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u2818\u2837\u28E4\u28BE\u284F\u2809\u2809\u2809\u2819\u28FE\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
551
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2808\u28FB\u287F\u281F\u2802\u2800\u28FF\u2803\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
552
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2808\u28F7\u2800\u2800\u2800\u2800\u28B0\u284F\u2800\u2800\u2800\u2880\u28FF\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
553
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2819\u28F7\u2840\u2800\u2800\u2800\u2800\u2800\u2800\u2880\u287E\u2801\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
554
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2808\u2819\u2837\u28E6\u28E4\u28E4\u28F4\u283E\u280B\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
|
|
555
|
-
\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800`)
|
|
556
|
-
);
|
|
557
|
-
console.log(source_default.yellow.bold("\nStagehand Evals CLI"));
|
|
558
|
-
console.log(source_default.cyan("\nevals <command> <target> [options]\n"));
|
|
559
|
-
console.log(source_default.magenta.underline("Commands"));
|
|
560
|
-
console.log(" run Execute evals or benchmarks");
|
|
561
|
-
console.log(" list List available evals/benchmarks");
|
|
562
|
-
console.log(" config Get/set default configuration");
|
|
563
|
-
console.log(" help Show this help message\n");
|
|
564
|
-
console.log(source_default.magenta.underline("Examples"));
|
|
565
|
-
console.log(source_default.dim(" # Run all custom evals"));
|
|
566
|
-
console.log(source_default.green(" evals run all\n"));
|
|
567
|
-
console.log(source_default.dim(" # Run specific category"));
|
|
568
|
-
console.log(
|
|
569
|
-
source_default.green(" evals run act") + source_default.cyan(" -e browserbase -t 5\n")
|
|
570
|
-
);
|
|
571
|
-
console.log(source_default.dim(" # Run specific eval"));
|
|
572
|
-
console.log(source_default.green(" evals run login\n"));
|
|
573
|
-
console.log(source_default.dim(" # Run benchmark"));
|
|
574
|
-
console.log(
|
|
575
|
-
source_default.green(" evals run benchmark:onlineMind2Web") + source_default.cyan(" -l 10 -f difficulty=easy\n")
|
|
576
|
-
);
|
|
577
|
-
console.log(source_default.dim(" # Configure defaults"));
|
|
578
|
-
console.log(source_default.green(" evals config set env browserbase"));
|
|
579
|
-
console.log(source_default.green(" evals config set trials 5\n"));
|
|
580
|
-
console.log(source_default.magenta.underline("Options"));
|
|
581
|
-
console.log(
|
|
582
|
-
source_default.cyan(" -e, --env".padEnd(20)) + "Environment: local|browserbase"
|
|
583
|
-
);
|
|
584
|
-
console.log(
|
|
585
|
-
source_default.cyan(" -t, --trials".padEnd(20)) + "Number of trials per eval"
|
|
586
|
-
);
|
|
587
|
-
console.log(
|
|
588
|
-
source_default.cyan(" -c, --concurrency".padEnd(20)) + "Max parallel sessions"
|
|
589
|
-
);
|
|
590
|
-
console.log(source_default.cyan(" -m, --model".padEnd(20)) + "Model override");
|
|
591
|
-
console.log(source_default.cyan(" -p, --provider".padEnd(20)) + "Provider override");
|
|
592
|
-
console.log(source_default.cyan(" --api".padEnd(20)) + "Use Stagehand API\n");
|
|
593
|
-
console.log(source_default.dim(" Benchmark-specific:"));
|
|
594
|
-
console.log(source_default.cyan(" -l, --limit".padEnd(20)) + "Max tasks to run");
|
|
595
|
-
console.log(
|
|
596
|
-
source_default.cyan(" -s, --sample".padEnd(20)) + "Random sample before limit"
|
|
597
|
-
);
|
|
598
|
-
console.log(
|
|
599
|
-
source_default.cyan(" -f, --filter".padEnd(20)) + "Benchmark filters (key=value)\n"
|
|
600
|
-
);
|
|
601
|
-
}
|
|
602
|
-
function handleConfig(args) {
|
|
603
|
-
const config = loadConfig();
|
|
604
|
-
if (args.length === 0) {
|
|
605
|
-
console.log(source_default.blue.bold("\nCurrent Configuration"));
|
|
606
|
-
console.log(source_default.cyan("\nDefaults:"));
|
|
607
|
-
Object.entries(config.defaults).forEach(([key, value]) => {
|
|
608
|
-
console.log(` ${key}: ${source_default.yellow(value != null ? value : "not set")}`);
|
|
609
|
-
});
|
|
610
|
-
return;
|
|
611
|
-
}
|
|
612
|
-
if (args[0] === "set" && args.length >= 3) {
|
|
613
|
-
const [, key, ...valueParts] = args;
|
|
614
|
-
const value = valueParts.join(" ");
|
|
615
|
-
if (!(key in config.defaults)) {
|
|
616
|
-
console.error(source_default.red(`Error: Unknown config key "${key}"`));
|
|
617
|
-
console.log(
|
|
618
|
-
source_default.dim(`Valid keys: ${Object.keys(config.defaults).join(", ")}`)
|
|
619
|
-
);
|
|
620
|
-
import_process.default.exit(1);
|
|
621
|
-
}
|
|
622
|
-
let parsedValue = value;
|
|
623
|
-
if (key === "trials" || key === "concurrency") {
|
|
624
|
-
parsedValue = parseInt(value, 10);
|
|
625
|
-
if (isNaN(parsedValue)) {
|
|
626
|
-
console.error(source_default.red(`Error: ${key} must be a number`));
|
|
627
|
-
import_process.default.exit(1);
|
|
628
|
-
}
|
|
629
|
-
} else if (key === "api") {
|
|
630
|
-
parsedValue = value === "true";
|
|
631
|
-
} else if (value === "null" || value === "none") {
|
|
632
|
-
parsedValue = null;
|
|
633
|
-
}
|
|
634
|
-
if (key === "env") {
|
|
635
|
-
config.defaults.env = parsedValue;
|
|
636
|
-
} else if (key === "trials") {
|
|
637
|
-
config.defaults.trials = parsedValue;
|
|
638
|
-
} else if (key === "concurrency") {
|
|
639
|
-
config.defaults.concurrency = parsedValue;
|
|
640
|
-
} else if (key === "provider") {
|
|
641
|
-
config.defaults.provider = parsedValue;
|
|
642
|
-
} else if (key === "model") {
|
|
643
|
-
config.defaults.model = parsedValue;
|
|
644
|
-
} else if (key === "api") {
|
|
645
|
-
config.defaults.api = parsedValue;
|
|
646
|
-
}
|
|
647
|
-
saveConfig(config);
|
|
648
|
-
console.log(source_default.green(`\u2713 Set ${key} to ${parsedValue}`));
|
|
649
|
-
} else if (args[0] === "reset") {
|
|
650
|
-
const defaultConfig = {
|
|
651
|
-
env: "local",
|
|
652
|
-
trials: 3,
|
|
653
|
-
concurrency: 3,
|
|
654
|
-
provider: null,
|
|
655
|
-
model: null,
|
|
656
|
-
api: false
|
|
657
|
-
};
|
|
658
|
-
if (args[1] && args[1] in config.defaults) {
|
|
659
|
-
const key = args[1];
|
|
660
|
-
if (key === "env") {
|
|
661
|
-
config.defaults.env = defaultConfig.env;
|
|
662
|
-
} else if (key === "trials") {
|
|
663
|
-
config.defaults.trials = defaultConfig.trials;
|
|
664
|
-
} else if (key === "concurrency") {
|
|
665
|
-
config.defaults.concurrency = defaultConfig.concurrency;
|
|
666
|
-
} else if (key === "provider") {
|
|
667
|
-
config.defaults.provider = defaultConfig.provider;
|
|
668
|
-
} else if (key === "model") {
|
|
669
|
-
config.defaults.model = defaultConfig.model;
|
|
670
|
-
} else if (key === "api") {
|
|
671
|
-
config.defaults.api = defaultConfig.api;
|
|
672
|
-
}
|
|
673
|
-
saveConfig(config);
|
|
674
|
-
console.log(source_default.green(`\u2713 Reset ${args[1]} to default`));
|
|
675
|
-
} else if (!args[1]) {
|
|
676
|
-
config.defaults = defaultConfig;
|
|
677
|
-
saveConfig(config);
|
|
678
|
-
console.log(source_default.green("\u2713 Reset all settings to defaults"));
|
|
679
|
-
} else {
|
|
680
|
-
console.error(source_default.red(`Error: Unknown config key "${args[1]}"`));
|
|
681
|
-
import_process.default.exit(1);
|
|
682
|
-
}
|
|
683
|
-
} else if (args[0] === "path") {
|
|
684
|
-
console.log(CONFIG_PATH);
|
|
685
|
-
} else {
|
|
686
|
-
console.error(source_default.red("Error: Invalid config command"));
|
|
687
|
-
console.log(
|
|
688
|
-
source_default.dim("Usage: evals config [set <key> <value> | reset [key] | path]")
|
|
689
|
-
);
|
|
690
|
-
import_process.default.exit(1);
|
|
691
|
-
}
|
|
692
|
-
}
|
|
693
|
-
/**
 * Prints an overview of the evals described by the loaded config.
 *
 * Output consists of the custom eval categories with their task counts,
 * followed by the configured benchmarks and their `b:<name>` shorthands.
 * When `args` contains `--detailed` or `-d`, every task name is listed per
 * category; otherwise a tip pointing at the detailed mode is printed.
 *
 * @param {string[]} args - CLI arguments that followed the `list` command.
 */
function handleList(args) {
  const config = loadConfig();
  console.log(source_default.blue.bold("\nAvailable Evals\n"));

  // Group task names by category, keeping first-seen category order.
  const tasksByCategory = /* @__PURE__ */ new Map();
  for (const task of config.tasks) {
    for (const cat of task.categories) {
      const names = tasksByCategory.get(cat);
      if (names) {
        names.push(task.name);
      } else {
        tasksByCategory.set(cat, [task.name]);
      }
    }
  }

  // Categories mentioning external agent benchmarks are hidden from the listing.
  const isCustomCategory = (name) => !name.includes("external_agent_benchmarks");

  console.log(source_default.magenta.underline("Custom Eval Categories"));
  for (const [category, tasks] of tasksByCategory) {
    if (!isCustomCategory(category)) {
      continue;
    }
    const label = source_default.cyan(category);
    const count = source_default.dim(`(${tasks.length} evals)`);
    console.log(` ${label} ${count}`);
  }

  console.log(source_default.magenta.underline("\nBenchmarks"));
  for (const name of Object.keys(config.benchmarks)) {
    const shorthand = `b:${name}`;
    const label = source_default.cyan(shorthand.padEnd(20));
    const longForm = source_default.dim(`benchmark:${name}`);
    console.log(` ${label} ${longForm}`);
  }

  const wantsDetails = args.includes("--detailed") || args.includes("-d");
  if (!wantsDetails) {
    console.log(
      source_default.yellow(
        "\n\u{1F4A1} Tip: Use 'evals list --detailed' to see all individual tasks"
      )
    );
    return;
  }

  console.log(source_default.magenta.underline("\n\nDetailed Task List"));
  for (const [category, tasks] of tasksByCategory) {
    if (!isCustomCategory(category)) {
      continue;
    }
    console.log(source_default.cyan(`\n${category}:`));
    for (const task of tasks) {
      console.log(` - ${task}`);
    }
  }
}
|
|
737
|
-
/**
 * Parses the raw CLI arguments of the `run` command.
 *
 * Recognised forms:
 *   - `--api`                      boolean switch, sets `options.api = true`
 *   - `-f|--filter key=value`      collected into `filters` as `[key, value]`
 *   - `-x|--name value`            stored in `options[name]`; `trials`,
 *                                  `concurrency`, `limit` and `sample` are
 *                                  parsed as base-10 integers
 *   - first non-flag argument      becomes `target`
 *
 * A flag whose value is missing (the next token is another flag or the end
 * of the list) is ignored WITHOUT consuming the following token, so
 * `-m --api` still enables `api`. Filter values are split on the first `=`
 * only, so values may themselves contain `=` (e.g. URLs with query strings).
 *
 * @param {string[]} rawArgs - Arguments following the command name.
 * @returns {{options: Object, target: (string|undefined), filters: Array<[string, string]>}}
 */
function parseArgs(rawArgs) {
  // Short-flag aliases; built once instead of on every loop iteration.
  const flagAliases = {
    e: "env",
    t: "trials",
    c: "concurrency",
    m: "model",
    p: "provider",
    l: "limit",
    s: "sample",
    f: "filter"
  };
  const integerOptions = ["trials", "concurrency", "limit", "sample"];
  const hasOwn = Object.prototype.hasOwnProperty;

  const options = {};
  const filters = [];
  let target;

  for (let i = 0; i < rawArgs.length; i++) {
    const arg = rawArgs[i];

    if (!arg.startsWith("-")) {
      // Only the first positional argument is used as the target.
      if (!target) {
        target = arg;
      }
      continue;
    }

    const flagName = arg.replace(/^--?/, "");
    // Own-property check so names like "constructor" do not resolve to
    // inherited Object.prototype members.
    const optionName = hasOwn.call(flagAliases, flagName) ? flagAliases[flagName] : flagName;

    if (optionName === "api") {
      options.api = true;
      continue;
    }

    const next = rawArgs[i + 1];

    if (optionName === "filter") {
      if (next !== void 0 && next.includes("=")) {
        i++;
        // Split on the first "=" only so the value keeps any further "=".
        const separator = next.indexOf("=");
        filters.push([next.slice(0, separator), next.slice(separator + 1)]);
      } else if (next !== void 0 && !next.startsWith("-")) {
        // Malformed filter (no "="): consume and discard it, as before.
        i++;
      }
      continue;
    }

    // Consume the next token only when it is a value, never another flag.
    if (next !== void 0 && !next.startsWith("-")) {
      i++;
      if (next) {
        options[optionName] = integerOptions.includes(optionName) ? parseInt(next, 10) : next;
      }
    }
  }

  return { options, target, filters };
}
|
|
780
|
-
/**
 * Runs evals according to the stored config defaults and the CLI arguments.
 *
 * Steps:
 *   1. Merge `config.defaults` with the parsed CLI options (CLI wins) and
 *      translate them into `EVAL_*` / `USE_API` environment variables.
 *   2. Resolve the target into either an eval name (`name=<...>`), a
 *      category filter (`category <...>`) or nothing (run everything).
 *      `b:<name>` / `benchmark:<name>` targets also set the benchmark
 *      specific limit, sample and filter variables.
 *   3. Run `pnpm run build`, then execute `index.eval.js` located next to
 *      this file if it exists, otherwise the TypeScript source through tsx.
 *
 * The process exits with the child's exit code. If the child was terminated
 * by a signal the exit status is non-zero (130 after SIGINT, 143 after
 * SIGTERM, 1 otherwise) instead of being reported as success.
 *
 * @param {string[]} args - CLI arguments following the `run` command.
 */
function handleRun(args) {
  const config = loadConfig();
  const { options, target, filters } = parseArgs(args);
  const finalOptions = __spreadValues(__spreadValues({}, config.defaults), options);
  const env2 = __spreadValues({}, import_process.default.env);
  if (finalOptions.env === "browserbase") {
    env2.EVAL_ENV = "BROWSERBASE";
  } else {
    env2.EVAL_ENV = "LOCAL";
  }
  if (finalOptions.api) {
    env2.USE_API = "true";
  }
  if (finalOptions.trials) {
    env2.EVAL_TRIAL_COUNT = String(finalOptions.trials);
  }
  if (finalOptions.concurrency) {
    env2.EVAL_MAX_CONCURRENCY = String(finalOptions.concurrency);
  }
  if (finalOptions.provider) {
    env2.EVAL_PROVIDER = finalOptions.provider;
  }
  if (finalOptions.model) {
    env2.EVAL_MODEL_OVERRIDE = finalOptions.model;
  }
  let evalName;
  let categoryFilter;
  if (target) {
    if (target.startsWith("b:") || target.startsWith("benchmark:")) {
      const benchmarkName = target.replace(/^(b:|benchmark:)/, "");
      if (!config.benchmarks[benchmarkName]) {
        console.error(source_default.red(`Error: Unknown benchmark "${benchmarkName}"`));
        console.log(
          source_default.dim(
            `Available benchmarks: ${Object.keys(config.benchmarks).join(", ")}`
          )
        );
        import_process.default.exit(1);
      }
      const benchmarkMap = {
        webbench: "agent/webbench",
        gaia: "agent/gaia",
        webvoyager: "agent/webvoyager",
        osworld: "agent/osworld",
        onlineMind2Web: "agent/onlineMind2Web"
      };
      evalName = benchmarkMap[benchmarkName];
      env2.EVAL_DATASET = benchmarkName;
      if (options.limit) {
        env2.EVAL_MAX_K = String(options.limit);
        env2[`EVAL_${benchmarkName.toUpperCase()}_LIMIT`] = String(
          options.limit
        );
      }
      if (options.sample) {
        env2[`EVAL_${benchmarkName.toUpperCase()}_SAMPLE`] = String(
          options.sample
        );
      }
      filters.forEach(([key, value]) => {
        const envKey = `EVAL_${benchmarkName.toUpperCase()}_${key.toUpperCase()}`;
        env2[envKey] = value;
      });
    } else if (target === "all") {
      // "all" means no name/category restriction: nothing to add.
    } else if (target.includes("/") || target.includes("*")) {
      evalName = target;
    } else {
      // A bare word is a category if any task declares it, otherwise an eval name.
      const categories = /* @__PURE__ */ new Set();
      config.tasks.forEach((task) => {
        task.categories.forEach((cat) => categories.add(cat));
      });
      if (categories.has(target)) {
        categoryFilter = target;
      } else {
        evalName = target;
      }
    }
  }
  const legacyArgs = [];
  if (evalName) {
    legacyArgs.push(`name=${evalName}`);
  } else if (categoryFilter) {
    legacyArgs.push("category", categoryFilter);
  }
  console.log(source_default.blue.bold("\nRunning evals...\n"));
  // Constant command line; the shell lets Windows resolve the pnpm shim.
  const buildChild = (0, import_child_process.spawn)("pnpm", ["run", "build"], {
    stdio: "inherit",
    shell: true
  });
  buildChild.on("exit", (buildCode) => {
    if (buildCode !== 0) {
      import_process.default.exit(buildCode || 1);
    }
    const compiledEvalPath = import_path.default.join(__dirname, "index.eval.js");
    const sourceEvalPath = import_path.default.resolve(
      __dirname,
      "..",
      "..",
      "packages",
      "evals",
      "index.eval.ts"
    );
    let child;
    if (import_fs.default.existsSync(compiledEvalPath)) {
      // Spawn node directly (no shell): the arguments contain user input and
      // the exec path may contain spaces, neither of which a shell-joined
      // command line would preserve.
      child = (0, import_child_process.spawn)(import_process.default.execPath, [compiledEvalPath, ...legacyArgs], {
        env: env2,
        stdio: "inherit"
      });
    } else {
      let tsxCliPath;
      try {
        tsxCliPath = require.resolve("tsx/dist/cli.js");
      } catch (e) {
        // Best effort: tsx is not resolvable as a module, fall back to PATH below.
      }
      const tsxArgs = [sourceEvalPath, ...legacyArgs];
      if (tsxCliPath) {
        // Same reasoning as above: run node directly, without a shell.
        child = (0, import_child_process.spawn)(import_process.default.execPath, [tsxCliPath, ...tsxArgs], {
          env: env2,
          stdio: "inherit"
        });
      } else {
        // NOTE(review): this fallback still goes through a shell so that a
        // `tsx` shim on PATH can be resolved; arguments are not escaped here.
        child = (0, import_child_process.spawn)("tsx", tsxArgs, {
          env: env2,
          stdio: "inherit",
          shell: true
        });
      }
    }
    // Status reported when the child dies from a signal (its exit code is null).
    let signalExitCode = 1;
    child.on("error", (err) => {
      console.error(source_default.red(`Error: Failed to start eval process: ${err.message}`));
      import_process.default.exit(1);
    });
    child.on("exit", (code) => {
      import_process.default.exit(code === null ? signalExitCode : code);
    });
    import_process.default.on("SIGINT", () => {
      console.log("\n\nReceived SIGINT, killing child process...");
      signalExitCode = 130;
      child.kill("SIGINT");
      // Escalate if the child has not exited within one second.
      setTimeout(() => {
        child.kill("SIGKILL");
        import_process.default.exit(130);
      }, 1e3);
    });
    import_process.default.on("SIGTERM", () => {
      console.log("\n\nReceived SIGTERM, killing child process...");
      signalExitCode = 143;
      child.kill("SIGTERM");
      // Escalate if the child has not exited within one second.
      setTimeout(() => {
        child.kill("SIGKILL");
        import_process.default.exit(143);
      }, 1e3);
    });
  });
}
|
|
931
|
-
/**
 * CLI entry point: reads the command from `process.argv` and dispatches it.
 *
 * `run`, `list` and `config` receive the arguments after the command name.
 * `help`, `--help` and `-h` print the help text. With no command, or with an
 * unknown command that starts with `-`, an error and the help text are
 * printed and the process exits with status 1. Any other first argument is
 * treated as a run target, so the full argument list goes to `handleRun`.
 */
function main() {
  const argv = import_process.default.argv.slice(2);
  const [command, ...commandArgs] = argv;

  if (command === "run") {
    handleRun(commandArgs);
    return;
  }
  if (command === "list") {
    handleList(commandArgs);
    return;
  }
  if (command === "config") {
    handleConfig(commandArgs);
    return;
  }
  if (command === "help" || command === "--help" || command === "-h") {
    printHelp();
    return;
  }
  if (command === void 0) {
    console.error(source_default.red("Error: No command specified"));
    printHelp();
    import_process.default.exit(1);
    return;
  }
  if (command.startsWith("-")) {
    console.error(source_default.red(`Error: Unknown command "${command}"`));
    printHelp();
    import_process.default.exit(1);
    return;
  }
  // Implicit "run": the first argument itself is the target.
  handleRun(argv);
}
|
|
965
|
-
// Script entry: dispatch the command given on the command line.
main();
|