@browserbasehq/stagehand 2.5.3 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/evals/cli.js DELETED
@@ -1,965 +0,0 @@
1
- #!/usr/bin/env node
2
- var __create = Object.create;
3
- var __defProp = Object.defineProperty;
4
- var __defProps = Object.defineProperties;
5
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
6
- var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
7
- var __getOwnPropNames = Object.getOwnPropertyNames;
8
- var __getOwnPropSymbols = Object.getOwnPropertySymbols;
9
- var __getProtoOf = Object.getPrototypeOf;
10
- var __hasOwnProp = Object.prototype.hasOwnProperty;
11
- var __propIsEnum = Object.prototype.propertyIsEnumerable;
12
- var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
13
- var __spreadValues = (a, b) => {
14
- for (var prop in b || (b = {}))
15
- if (__hasOwnProp.call(b, prop))
16
- __defNormalProp(a, prop, b[prop]);
17
- if (__getOwnPropSymbols)
18
- for (var prop of __getOwnPropSymbols(b)) {
19
- if (__propIsEnum.call(b, prop))
20
- __defNormalProp(a, prop, b[prop]);
21
- }
22
- return a;
23
- };
24
- var __spreadProps = (a, b) => __defProps(a, __getOwnPropDescs(b));
25
- var __copyProps = (to, from, except, desc) => {
26
- if (from && typeof from === "object" || typeof from === "function") {
27
- for (let key of __getOwnPropNames(from))
28
- if (!__hasOwnProp.call(to, key) && key !== except)
29
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
30
- }
31
- return to;
32
- };
33
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
34
- // If the importer is in node compatibility mode or this is not an ESM
35
- // file that has been converted to a CommonJS file using a Babel-
36
- // compatible transform (i.e. "__esModule" has not been set), then set
37
- // "default" to the CommonJS "module.exports" for node compatibility.
38
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
39
- mod
40
- ));
41
-
42
- // cli.ts
43
- var import_process = __toESM(require("process"));
44
-
45
- // ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/vendor/ansi-styles/index.js
46
- var ANSI_BACKGROUND_OFFSET = 10;
47
- var wrapAnsi16 = (offset = 0) => (code) => `\x1B[${code + offset}m`;
48
- var wrapAnsi256 = (offset = 0) => (code) => `\x1B[${38 + offset};5;${code}m`;
49
- var wrapAnsi16m = (offset = 0) => (red, green, blue) => `\x1B[${38 + offset};2;${red};${green};${blue}m`;
50
- var styles = {
51
- modifier: {
52
- reset: [0, 0],
53
- // 21 isn't widely supported and 22 does the same thing
54
- bold: [1, 22],
55
- dim: [2, 22],
56
- italic: [3, 23],
57
- underline: [4, 24],
58
- overline: [53, 55],
59
- inverse: [7, 27],
60
- hidden: [8, 28],
61
- strikethrough: [9, 29]
62
- },
63
- color: {
64
- black: [30, 39],
65
- red: [31, 39],
66
- green: [32, 39],
67
- yellow: [33, 39],
68
- blue: [34, 39],
69
- magenta: [35, 39],
70
- cyan: [36, 39],
71
- white: [37, 39],
72
- // Bright color
73
- blackBright: [90, 39],
74
- gray: [90, 39],
75
- // Alias of `blackBright`
76
- grey: [90, 39],
77
- // Alias of `blackBright`
78
- redBright: [91, 39],
79
- greenBright: [92, 39],
80
- yellowBright: [93, 39],
81
- blueBright: [94, 39],
82
- magentaBright: [95, 39],
83
- cyanBright: [96, 39],
84
- whiteBright: [97, 39]
85
- },
86
- bgColor: {
87
- bgBlack: [40, 49],
88
- bgRed: [41, 49],
89
- bgGreen: [42, 49],
90
- bgYellow: [43, 49],
91
- bgBlue: [44, 49],
92
- bgMagenta: [45, 49],
93
- bgCyan: [46, 49],
94
- bgWhite: [47, 49],
95
- // Bright color
96
- bgBlackBright: [100, 49],
97
- bgGray: [100, 49],
98
- // Alias of `bgBlackBright`
99
- bgGrey: [100, 49],
100
- // Alias of `bgBlackBright`
101
- bgRedBright: [101, 49],
102
- bgGreenBright: [102, 49],
103
- bgYellowBright: [103, 49],
104
- bgBlueBright: [104, 49],
105
- bgMagentaBright: [105, 49],
106
- bgCyanBright: [106, 49],
107
- bgWhiteBright: [107, 49]
108
- }
109
- };
110
- var modifierNames = Object.keys(styles.modifier);
111
- var foregroundColorNames = Object.keys(styles.color);
112
- var backgroundColorNames = Object.keys(styles.bgColor);
113
- var colorNames = [...foregroundColorNames, ...backgroundColorNames];
114
- function assembleStyles() {
115
- const codes = /* @__PURE__ */ new Map();
116
- for (const [groupName, group] of Object.entries(styles)) {
117
- for (const [styleName, style] of Object.entries(group)) {
118
- styles[styleName] = {
119
- open: `\x1B[${style[0]}m`,
120
- close: `\x1B[${style[1]}m`
121
- };
122
- group[styleName] = styles[styleName];
123
- codes.set(style[0], style[1]);
124
- }
125
- Object.defineProperty(styles, groupName, {
126
- value: group,
127
- enumerable: false
128
- });
129
- }
130
- Object.defineProperty(styles, "codes", {
131
- value: codes,
132
- enumerable: false
133
- });
134
- styles.color.close = "\x1B[39m";
135
- styles.bgColor.close = "\x1B[49m";
136
- styles.color.ansi = wrapAnsi16();
137
- styles.color.ansi256 = wrapAnsi256();
138
- styles.color.ansi16m = wrapAnsi16m();
139
- styles.bgColor.ansi = wrapAnsi16(ANSI_BACKGROUND_OFFSET);
140
- styles.bgColor.ansi256 = wrapAnsi256(ANSI_BACKGROUND_OFFSET);
141
- styles.bgColor.ansi16m = wrapAnsi16m(ANSI_BACKGROUND_OFFSET);
142
- Object.defineProperties(styles, {
143
- rgbToAnsi256: {
144
- value(red, green, blue) {
145
- if (red === green && green === blue) {
146
- if (red < 8) {
147
- return 16;
148
- }
149
- if (red > 248) {
150
- return 231;
151
- }
152
- return Math.round((red - 8) / 247 * 24) + 232;
153
- }
154
- return 16 + 36 * Math.round(red / 255 * 5) + 6 * Math.round(green / 255 * 5) + Math.round(blue / 255 * 5);
155
- },
156
- enumerable: false
157
- },
158
- hexToRgb: {
159
- value(hex) {
160
- const matches = /[a-f\d]{6}|[a-f\d]{3}/i.exec(hex.toString(16));
161
- if (!matches) {
162
- return [0, 0, 0];
163
- }
164
- let [colorString] = matches;
165
- if (colorString.length === 3) {
166
- colorString = [...colorString].map((character) => character + character).join("");
167
- }
168
- const integer = Number.parseInt(colorString, 16);
169
- return [
170
- /* eslint-disable no-bitwise */
171
- integer >> 16 & 255,
172
- integer >> 8 & 255,
173
- integer & 255
174
- /* eslint-enable no-bitwise */
175
- ];
176
- },
177
- enumerable: false
178
- },
179
- hexToAnsi256: {
180
- value: (hex) => styles.rgbToAnsi256(...styles.hexToRgb(hex)),
181
- enumerable: false
182
- },
183
- ansi256ToAnsi: {
184
- value(code) {
185
- if (code < 8) {
186
- return 30 + code;
187
- }
188
- if (code < 16) {
189
- return 90 + (code - 8);
190
- }
191
- let red;
192
- let green;
193
- let blue;
194
- if (code >= 232) {
195
- red = ((code - 232) * 10 + 8) / 255;
196
- green = red;
197
- blue = red;
198
- } else {
199
- code -= 16;
200
- const remainder = code % 36;
201
- red = Math.floor(code / 36) / 5;
202
- green = Math.floor(remainder / 6) / 5;
203
- blue = remainder % 6 / 5;
204
- }
205
- const value = Math.max(red, green, blue) * 2;
206
- if (value === 0) {
207
- return 30;
208
- }
209
- let result = 30 + (Math.round(blue) << 2 | Math.round(green) << 1 | Math.round(red));
210
- if (value === 2) {
211
- result += 60;
212
- }
213
- return result;
214
- },
215
- enumerable: false
216
- },
217
- rgbToAnsi: {
218
- value: (red, green, blue) => styles.ansi256ToAnsi(styles.rgbToAnsi256(red, green, blue)),
219
- enumerable: false
220
- },
221
- hexToAnsi: {
222
- value: (hex) => styles.ansi256ToAnsi(styles.hexToAnsi256(hex)),
223
- enumerable: false
224
- }
225
- });
226
- return styles;
227
- }
228
- var ansiStyles = assembleStyles();
229
- var ansi_styles_default = ansiStyles;
230
-
231
- // ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/vendor/supports-color/index.js
232
- var import_node_process = __toESM(require("process"), 1);
233
- var import_node_os = __toESM(require("os"), 1);
234
- var import_node_tty = __toESM(require("tty"), 1);
235
/**
 * Reports whether a command-line flag is present before any `--` terminator.
 *
 * A flag that already starts with `-` is matched as given; a single-character
 * flag is matched as `-x`; anything else is matched as `--name`.
 *
 * @param {string} flag - Flag name, with or without leading dashes.
 * @param {string[]} [argv] - Argument list to search (defaults to the
 *   runtime's own arguments).
 * @returns {boolean} True when the flag occurs and is not after `--`.
 */
function hasFlag(flag, argv = globalThis.Deno ? globalThis.Deno.args : import_node_process.default.argv) {
  let dashes = "--";
  if (flag.startsWith("-")) {
    dashes = "";
  } else if (flag.length === 1) {
    dashes = "-";
  }
  const flagIndex = argv.indexOf(dashes + flag);
  if (flagIndex === -1) {
    return false;
  }
  // Everything after a bare `--` is positional, so a flag there does not count.
  const terminatorIndex = argv.indexOf("--");
  return terminatorIndex === -1 || flagIndex < terminatorIndex;
}
241
- var { env } = import_node_process.default;
242
- var flagForceColor;
243
- if (hasFlag("no-color") || hasFlag("no-colors") || hasFlag("color=false") || hasFlag("color=never")) {
244
- flagForceColor = 0;
245
- } else if (hasFlag("color") || hasFlag("colors") || hasFlag("color=true") || hasFlag("color=always")) {
246
- flagForceColor = 1;
247
- }
248
- function envForceColor() {
249
- if ("FORCE_COLOR" in env) {
250
- if (env.FORCE_COLOR === "true") {
251
- return 1;
252
- }
253
- if (env.FORCE_COLOR === "false") {
254
- return 0;
255
- }
256
- return env.FORCE_COLOR.length === 0 ? 1 : Math.min(Number.parseInt(env.FORCE_COLOR, 10), 3);
257
- }
258
- }
259
/**
 * Converts a numeric color level into a capability descriptor.
 *
 * @param {number} level - Color support level (0 means no color).
 * @returns {false | {level: number, hasBasic: boolean, has256: boolean, has16m: boolean}}
 *   `false` for level 0, otherwise an object describing which color depths
 *   the level includes.
 */
function translateLevel(level) {
  return level === 0 ? false : {
    level,
    hasBasic: true,
    has256: level >= 2,
    has16m: level >= 3
  };
}
270
- function _supportsColor(haveStream, { streamIsTTY, sniffFlags = true } = {}) {
271
- const noFlagForceColor = envForceColor();
272
- if (noFlagForceColor !== void 0) {
273
- flagForceColor = noFlagForceColor;
274
- }
275
- const forceColor = sniffFlags ? flagForceColor : noFlagForceColor;
276
- if (forceColor === 0) {
277
- return 0;
278
- }
279
- if (sniffFlags) {
280
- if (hasFlag("color=16m") || hasFlag("color=full") || hasFlag("color=truecolor")) {
281
- return 3;
282
- }
283
- if (hasFlag("color=256")) {
284
- return 2;
285
- }
286
- }
287
- if ("TF_BUILD" in env && "AGENT_NAME" in env) {
288
- return 1;
289
- }
290
- if (haveStream && !streamIsTTY && forceColor === void 0) {
291
- return 0;
292
- }
293
- const min = forceColor || 0;
294
- if (env.TERM === "dumb") {
295
- return min;
296
- }
297
- if (import_node_process.default.platform === "win32") {
298
- const osRelease = import_node_os.default.release().split(".");
299
- if (Number(osRelease[0]) >= 10 && Number(osRelease[2]) >= 10586) {
300
- return Number(osRelease[2]) >= 14931 ? 3 : 2;
301
- }
302
- return 1;
303
- }
304
- if ("CI" in env) {
305
- if (["GITHUB_ACTIONS", "GITEA_ACTIONS", "CIRCLECI"].some((key) => key in env)) {
306
- return 3;
307
- }
308
- if (["TRAVIS", "APPVEYOR", "GITLAB_CI", "BUILDKITE", "DRONE"].some((sign) => sign in env) || env.CI_NAME === "codeship") {
309
- return 1;
310
- }
311
- return min;
312
- }
313
- if ("TEAMCITY_VERSION" in env) {
314
- return /^(9\.(0*[1-9]\d*)\.|\d{2,}\.)/.test(env.TEAMCITY_VERSION) ? 1 : 0;
315
- }
316
- if (env.COLORTERM === "truecolor") {
317
- return 3;
318
- }
319
- if (env.TERM === "xterm-kitty") {
320
- return 3;
321
- }
322
- if ("TERM_PROGRAM" in env) {
323
- const version = Number.parseInt((env.TERM_PROGRAM_VERSION || "").split(".")[0], 10);
324
- switch (env.TERM_PROGRAM) {
325
- case "iTerm.app": {
326
- return version >= 3 ? 3 : 2;
327
- }
328
- case "Apple_Terminal": {
329
- return 2;
330
- }
331
- }
332
- }
333
- if (/-256(color)?$/i.test(env.TERM)) {
334
- return 2;
335
- }
336
- if (/^screen|^xterm|^vt100|^vt220|^rxvt|color|ansi|cygwin|linux/i.test(env.TERM)) {
337
- return 1;
338
- }
339
- if ("COLORTERM" in env) {
340
- return 1;
341
- }
342
- return min;
343
- }
344
- function createSupportsColor(stream, options = {}) {
345
- const level = _supportsColor(stream, __spreadValues({
346
- streamIsTTY: stream && stream.isTTY
347
- }, options));
348
- return translateLevel(level);
349
- }
350
- var supportsColor = {
351
- stdout: createSupportsColor({ isTTY: import_node_tty.default.isatty(1) }),
352
- stderr: createSupportsColor({ isTTY: import_node_tty.default.isatty(2) })
353
- };
354
- var supports_color_default = supportsColor;
355
-
356
- // ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/utilities.js
357
/**
 * Appends `replacer` after every occurrence of `substring` in `string`.
 *
 * Note that the matched substring itself is kept; the result is
 * `...substring + replacer...` for each match.
 *
 * @param {string} string - Text to scan.
 * @param {string} substring - Text to look for.
 * @param {string} replacer - Text inserted directly after each match.
 * @returns {string} The original string when there is no match, otherwise the
 *   rebuilt string.
 */
function stringReplaceAll(string, substring, replacer) {
  let matchAt = string.indexOf(substring);
  if (matchAt === -1) {
    return string;
  }
  const step = substring.length;
  let cursor = 0;
  let result = "";
  for (; matchAt !== -1; matchAt = string.indexOf(substring, cursor)) {
    result += string.slice(cursor, matchAt) + substring + replacer;
    cursor = matchAt + step;
  }
  return result + string.slice(cursor);
}
373
/**
 * Wraps every line break in `string` with `prefix` (before) and `postfix`
 * (after), treating `\r\n` as a single break.
 *
 * @param {string} string - Text containing at least one `\n`.
 * @param {string} prefix - Inserted immediately before each line break.
 * @param {string} postfix - Inserted immediately after each line break.
 * @param {number} index - Position of the first `\n` in `string`; the first
 *   break is processed at this position without re-checking it.
 * @returns {string} The rebuilt string.
 */
function stringEncaseCRLFWithFirstIndex(string, prefix, postfix, index) {
  let lineFeedAt = index;
  let consumed = 0;
  let output = "";
  for (;;) {
    const hasCarriageReturn = string[lineFeedAt - 1] === "\r";
    const lineEnd = hasCarriageReturn ? lineFeedAt - 1 : lineFeedAt;
    const lineBreak = hasCarriageReturn ? "\r\n" : "\n";
    output += string.slice(consumed, lineEnd) + prefix + lineBreak + postfix;
    consumed = lineFeedAt + 1;
    lineFeedAt = string.indexOf("\n", consumed);
    if (lineFeedAt === -1) {
      break;
    }
  }
  return output + string.slice(consumed);
}
385
-
386
- // ../../node_modules/.pnpm/chalk@5.4.1/node_modules/chalk/source/index.js
387
- var { stdout: stdoutColor, stderr: stderrColor } = supports_color_default;
388
- var GENERATOR = Symbol("GENERATOR");
389
- var STYLER = Symbol("STYLER");
390
- var IS_EMPTY = Symbol("IS_EMPTY");
391
- var levelMapping = [
392
- "ansi",
393
- "ansi",
394
- "ansi256",
395
- "ansi16m"
396
- ];
397
- var styles2 = /* @__PURE__ */ Object.create(null);
398
- var applyOptions = (object, options = {}) => {
399
- if (options.level && !(Number.isInteger(options.level) && options.level >= 0 && options.level <= 3)) {
400
- throw new Error("The `level` option should be an integer from 0 to 3");
401
- }
402
- const colorLevel = stdoutColor ? stdoutColor.level : 0;
403
- object.level = options.level === void 0 ? colorLevel : options.level;
404
- };
405
- var chalkFactory = (options) => {
406
- const chalk2 = (...strings) => strings.join(" ");
407
- applyOptions(chalk2, options);
408
- Object.setPrototypeOf(chalk2, createChalk.prototype);
409
- return chalk2;
410
- };
411
- function createChalk(options) {
412
- return chalkFactory(options);
413
- }
414
- Object.setPrototypeOf(createChalk.prototype, Function.prototype);
415
- for (const [styleName, style] of Object.entries(ansi_styles_default)) {
416
- styles2[styleName] = {
417
- get() {
418
- const builder = createBuilder(this, createStyler(style.open, style.close, this[STYLER]), this[IS_EMPTY]);
419
- Object.defineProperty(this, styleName, { value: builder });
420
- return builder;
421
- }
422
- };
423
- }
424
- styles2.visible = {
425
- get() {
426
- const builder = createBuilder(this, this[STYLER], true);
427
- Object.defineProperty(this, "visible", { value: builder });
428
- return builder;
429
- }
430
- };
431
- var getModelAnsi = (model, level, type, ...arguments_) => {
432
- if (model === "rgb") {
433
- if (level === "ansi16m") {
434
- return ansi_styles_default[type].ansi16m(...arguments_);
435
- }
436
- if (level === "ansi256") {
437
- return ansi_styles_default[type].ansi256(ansi_styles_default.rgbToAnsi256(...arguments_));
438
- }
439
- return ansi_styles_default[type].ansi(ansi_styles_default.rgbToAnsi(...arguments_));
440
- }
441
- if (model === "hex") {
442
- return getModelAnsi("rgb", level, type, ...ansi_styles_default.hexToRgb(...arguments_));
443
- }
444
- return ansi_styles_default[type][model](...arguments_);
445
- };
446
- var usedModels = ["rgb", "hex", "ansi256"];
447
- for (const model of usedModels) {
448
- styles2[model] = {
449
- get() {
450
- const { level } = this;
451
- return function(...arguments_) {
452
- const styler = createStyler(getModelAnsi(model, levelMapping[level], "color", ...arguments_), ansi_styles_default.color.close, this[STYLER]);
453
- return createBuilder(this, styler, this[IS_EMPTY]);
454
- };
455
- }
456
- };
457
- const bgModel = "bg" + model[0].toUpperCase() + model.slice(1);
458
- styles2[bgModel] = {
459
- get() {
460
- const { level } = this;
461
- return function(...arguments_) {
462
- const styler = createStyler(getModelAnsi(model, levelMapping[level], "bgColor", ...arguments_), ansi_styles_default.bgColor.close, this[STYLER]);
463
- return createBuilder(this, styler, this[IS_EMPTY]);
464
- };
465
- }
466
- };
467
- }
468
- var proto = Object.defineProperties(() => {
469
- }, __spreadProps(__spreadValues({}, styles2), {
470
- level: {
471
- enumerable: true,
472
- get() {
473
- return this[GENERATOR].level;
474
- },
475
- set(level) {
476
- this[GENERATOR].level = level;
477
- }
478
- }
479
- }));
480
/**
 * Builds a styler record that links an open/close escape pair to its parent.
 *
 * `openAll` accumulates opening codes from the outermost parent inward and
 * `closeAll` accumulates closing codes in the reverse order.
 *
 * @param {string} open - Opening escape sequence for this style.
 * @param {string} close - Closing escape sequence for this style.
 * @param {object} [parent] - Styler this one is nested inside, if any.
 * @returns {{open: string, close: string, openAll: string, closeAll: string, parent: object|undefined}}
 */
var createStyler = (open, close, parent) => {
  const hasParent = parent !== void 0;
  return {
    open,
    close,
    openAll: hasParent ? parent.openAll + open : open,
    closeAll: hasParent ? close + parent.closeAll : close,
    parent
  };
};
498
- var createBuilder = (self, _styler, _isEmpty) => {
499
- const builder = (...arguments_) => applyStyle(builder, arguments_.length === 1 ? "" + arguments_[0] : arguments_.join(" "));
500
- Object.setPrototypeOf(builder, proto);
501
- builder[GENERATOR] = self;
502
- builder[STYLER] = _styler;
503
- builder[IS_EMPTY] = _isEmpty;
504
- return builder;
505
- };
506
- var applyStyle = (self, string) => {
507
- if (self.level <= 0 || !string) {
508
- return self[IS_EMPTY] ? "" : string;
509
- }
510
- let styler = self[STYLER];
511
- if (styler === void 0) {
512
- return string;
513
- }
514
- const { openAll, closeAll } = styler;
515
- if (string.includes("\x1B")) {
516
- while (styler !== void 0) {
517
- string = stringReplaceAll(string, styler.close, styler.open);
518
- styler = styler.parent;
519
- }
520
- }
521
- const lfIndex = string.indexOf("\n");
522
- if (lfIndex !== -1) {
523
- string = stringEncaseCRLFWithFirstIndex(string, closeAll, openAll, lfIndex);
524
- }
525
- return openAll + string + closeAll;
526
- };
527
- Object.defineProperties(createChalk.prototype, styles2);
528
- var chalk = createChalk();
529
- var chalkStderr = createChalk({ level: stderrColor ? stderrColor.level : 0 });
530
- var source_default = chalk;
531
-
532
- // cli.ts
533
- var import_fs = __toESM(require("fs"));
534
- var import_path = __toESM(require("path"));
535
- var import_child_process = require("child_process");
536
- var CONFIG_PATH = import_path.default.join(__dirname, "evals.config.json");
537
/**
 * Reads and parses the evals configuration file at CONFIG_PATH.
 *
 * @returns {object} The parsed JSON document.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function loadConfig() {
  const rawJson = import_fs.default.readFileSync(CONFIG_PATH, "utf-8");
  return JSON.parse(rawJson);
}
540
/**
 * Serializes the configuration as 2-space-indented JSON and writes it to
 * CONFIG_PATH, replacing the previous contents.
 *
 * @param {object} config - Configuration object to persist.
 */
function saveConfig(config) {
  const serialized = JSON.stringify(config, null, 2);
  import_fs.default.writeFileSync(CONFIG_PATH, serialized);
}
543
- function printHelp() { // Prints the banner and the CLI usage text to stdout via console.log; takes no arguments and returns nothing.
544
- console.log( // Banner: a yellow multi-line template literal made of Braille-pattern characters (U+28xx escapes).
545
- source_default.yellow(`\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
546
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28A0\u287E\u283B\u28F6\u2840\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
547
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28A0\u2876\u281B\u28B3\u2846\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
548
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u28B8\u28F7\u2836\u28E6\u28F4\u2836\u28FE\u2847\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
549
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
550
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u2818\u2837\u28E4\u28BE\u284F\u2809\u2809\u2809\u2819\u28FE\u2847\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
551
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u28B8\u2847\u2800\u2800\u2800\u2800\u2808\u28FB\u287F\u281F\u2802\u2800\u28FF\u2803\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
552
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2808\u28F7\u2800\u2800\u2800\u2800\u28B0\u284F\u2800\u2800\u2800\u2880\u28FF\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
553
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2819\u28F7\u2840\u2800\u2800\u2800\u2800\u2800\u2800\u2880\u287E\u2801\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
554
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2808\u2819\u2837\u28E6\u28E4\u28E4\u28F4\u283E\u280B\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800
555
- \u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800\u2800`)
556
- );
557
- console.log(source_default.yellow.bold("\nStagehand Evals CLI")); // Title line.
558
- console.log(source_default.cyan("\nevals <command> <target> [options]\n")); // Usage synopsis.
559
- console.log(source_default.magenta.underline("Commands")); // Section: the four subcommands dispatched by main().
560
- console.log(" run Execute evals or benchmarks");
561
- console.log(" list List available evals/benchmarks");
562
- console.log(" config Get/set default configuration");
563
- console.log(" help Show this help message\n");
564
- console.log(source_default.magenta.underline("Examples")); // Section: sample invocations, each preceded by a dimmed "#" caption.
565
- console.log(source_default.dim(" # Run all custom evals"));
566
- console.log(source_default.green(" evals run all\n"));
567
- console.log(source_default.dim(" # Run specific category"));
568
- console.log(
569
- source_default.green(" evals run act") + source_default.cyan(" -e browserbase -t 5\n")
570
- );
571
- console.log(source_default.dim(" # Run specific eval"));
572
- console.log(source_default.green(" evals run login\n"));
573
- console.log(source_default.dim(" # Run benchmark"));
574
- console.log(
575
- source_default.green(" evals run benchmark:onlineMind2Web") + source_default.cyan(" -l 10 -f difficulty=easy\n")
576
- );
577
- console.log(source_default.dim(" # Configure defaults"));
578
- console.log(source_default.green(" evals config set env browserbase"));
579
- console.log(source_default.green(" evals config set trials 5\n"));
580
- console.log(source_default.magenta.underline("Options")); // Section: flags understood by parseArgs; padEnd(20) pads each flag label to a fixed width before its description.
581
- console.log(
582
- source_default.cyan(" -e, --env".padEnd(20)) + "Environment: local|browserbase"
583
- );
584
- console.log(
585
- source_default.cyan(" -t, --trials".padEnd(20)) + "Number of trials per eval"
586
- );
587
- console.log(
588
- source_default.cyan(" -c, --concurrency".padEnd(20)) + "Max parallel sessions"
589
- );
590
- console.log(source_default.cyan(" -m, --model".padEnd(20)) + "Model override");
591
- console.log(source_default.cyan(" -p, --provider".padEnd(20)) + "Provider override");
592
- console.log(source_default.cyan(" --api".padEnd(20)) + "Use Stagehand API\n");
593
- console.log(source_default.dim(" Benchmark-specific:")); // Sub-section: flags that handleRun only applies to b:/benchmark: targets.
594
- console.log(source_default.cyan(" -l, --limit".padEnd(20)) + "Max tasks to run");
595
- console.log(
596
- source_default.cyan(" -s, --sample".padEnd(20)) + "Random sample before limit"
597
- );
598
- console.log(
599
- source_default.cyan(" -f, --filter".padEnd(20)) + "Benchmark filters (key=value)\n"
600
- );
601
- }
602
/**
 * Implements the `evals config` subcommand.
 *
 * Supported forms:
 *   - no arguments:       print the current defaults
 *   - `set <key> <value>`: update one default and persist it
 *   - `reset [key]`:       restore one default, or all of them when no key is given
 *   - `path`:              print the location of the config file
 *
 * Any other form prints a usage hint and exits the process with status 1.
 *
 * Keys are validated with an own-property check rather than `in`, so inherited
 * names such as "toString" or "constructor" are rejected instead of being
 * reported as successfully set.
 *
 * @param {string[]} args - Arguments following the `config` command.
 */
function handleConfig(args) {
  const config = loadConfig();
  const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key);
  if (args.length === 0) {
    console.log(source_default.blue.bold("\nCurrent Configuration"));
    console.log(source_default.cyan("\nDefaults:"));
    Object.entries(config.defaults).forEach(([key, value]) => {
      console.log(` ${key}: ${source_default.yellow(value != null ? value : "not set")}`);
    });
    return;
  }
  const subcommand = args[0];
  if (subcommand === "set" && args.length >= 3) {
    const [, key, ...valueParts] = args;
    // A value may contain spaces; everything after the key belongs to it.
    const value = valueParts.join(" ");
    if (!hasOwn(config.defaults, key)) {
      console.error(source_default.red(`Error: Unknown config key "${key}"`));
      console.log(
        source_default.dim(`Valid keys: ${Object.keys(config.defaults).join(", ")}`)
      );
      import_process.default.exit(1);
      return;
    }
    let parsedValue = value;
    if (key === "trials" || key === "concurrency") {
      parsedValue = Number.parseInt(value, 10);
      if (Number.isNaN(parsedValue)) {
        console.error(source_default.red(`Error: ${key} must be a number`));
        import_process.default.exit(1);
        return;
      }
    } else if (key === "api") {
      parsedValue = value === "true";
    } else if (value === "null" || value === "none") {
      // "null"/"none" clear an optional setting such as provider or model.
      parsedValue = null;
    }
    config.defaults[key] = parsedValue;
    saveConfig(config);
    console.log(source_default.green(`\u2713 Set ${key} to ${parsedValue}`));
  } else if (subcommand === "reset") {
    const defaultConfig = {
      env: "local",
      trials: 3,
      concurrency: 3,
      provider: null,
      model: null,
      api: false
    };
    const key = args[1];
    if (!key) {
      config.defaults = defaultConfig;
      saveConfig(config);
      console.log(source_default.green("\u2713 Reset all settings to defaults"));
    } else if (hasOwn(config.defaults, key) && hasOwn(defaultConfig, key)) {
      config.defaults[key] = defaultConfig[key];
      saveConfig(config);
      console.log(source_default.green(`\u2713 Reset ${key} to default`));
    } else {
      console.error(source_default.red(`Error: Unknown config key "${key}"`));
      import_process.default.exit(1);
    }
  } else if (subcommand === "path") {
    console.log(CONFIG_PATH);
  } else {
    console.error(source_default.red("Error: Invalid config command"));
    console.log(
      source_default.dim("Usage: evals config [set <key> <value> | reset [key] | path]")
    );
    import_process.default.exit(1);
  }
}
693
/**
 * Implements the `evals list` subcommand.
 *
 * Prints every custom eval category with its task count, then every
 * configured benchmark with its `b:` shorthand. Categories whose name contains
 * "external_agent_benchmarks" are hidden. With `--detailed` / `-d` the
 * individual task names are printed per category; otherwise a tip about that
 * flag is shown.
 *
 * @param {string[]} args - Arguments following the `list` command.
 */
function handleList(args) {
  const config = loadConfig();
  console.log(source_default.blue.bold("\nAvailable Evals\n"));

  // Group task names by category, keeping categories in first-seen order.
  const tasksByCategory = /* @__PURE__ */ new Map();
  for (const task of config.tasks) {
    for (const cat of task.categories) {
      if (!tasksByCategory.has(cat)) {
        tasksByCategory.set(cat, []);
      }
      tasksByCategory.get(cat).push(task.name);
    }
  }
  const isHidden = (categoryName) => categoryName.includes("external_agent_benchmarks");

  console.log(source_default.magenta.underline("Custom Eval Categories"));
  for (const [category, tasks] of tasksByCategory) {
    if (isHidden(category)) {
      continue;
    }
    console.log(
      ` ${source_default.cyan(category)} ${source_default.dim(`(${tasks.length} evals)`)}`
    );
  }

  console.log(source_default.magenta.underline("\nBenchmarks"));
  for (const name of Object.keys(config.benchmarks)) {
    const shorthand = `b:${name}`;
    console.log(
      ` ${source_default.cyan(shorthand.padEnd(20))} ${source_default.dim(`benchmark:${name}`)}`
    );
  }

  const wantsDetail = args.includes("--detailed") || args.includes("-d");
  if (!wantsDetail) {
    console.log(
      source_default.yellow(
        "\n\u{1F4A1} Tip: Use 'evals list --detailed' to see all individual tasks"
      )
    );
    return;
  }
  console.log(source_default.magenta.underline("\n\nDetailed Task List"));
  for (const [category, tasks] of tasksByCategory) {
    if (isHidden(category)) {
      continue;
    }
    console.log(source_default.cyan(`\n${category}:`));
    for (const task of tasks) {
      console.log(` - ${task}`);
    }
  }
}
737
/**
 * Parses the arguments of the `run` command.
 *
 * Recognized short flags are expanded (`-e` -> env, `-t` -> trials,
 * `-c` -> concurrency, `-m` -> model, `-p` -> provider, `-l` -> limit,
 * `-s` -> sample, `-f` -> filter); any other flag name is used verbatim as the
 * option key. `--api` is a boolean switch. `trials`, `concurrency`, `limit`
 * and `sample` are parsed as base-10 integers. The first non-flag argument is
 * the target; later positional arguments are ignored.
 *
 * Fixes over the previous implementation:
 *   - A flag whose value is missing no longer swallows the following flag
 *     (`-m -t 5` used to drop `-t 5`).
 *   - Filter values are split on the first `=` only, so `url=a=b` keeps `a=b`.
 *   - Short-flag lookup ignores inherited object keys such as "constructor".
 *
 * @param {string[]} rawArgs - Arguments following the command name.
 * @returns {{options: object, target: (string|undefined), filters: Array<[string, string]>}}
 */
function parseArgs(rawArgs) {
  const flagMap = {
    e: "env",
    t: "trials",
    c: "concurrency",
    m: "model",
    p: "provider",
    l: "limit",
    s: "sample",
    f: "filter"
  };
  const integerOptions = ["trials", "concurrency", "limit", "sample"];
  const options = {};
  const filters = [];
  let target;
  for (let i = 0; i < rawArgs.length; i++) {
    const arg = rawArgs[i];
    if (!arg.startsWith("-")) {
      if (!target) {
        target = arg;
      }
      continue;
    }
    const flagName = arg.replace(/^--?/, "");
    const optionName = Object.prototype.hasOwnProperty.call(flagMap, flagName) ? flagMap[flagName] : flagName;
    if (optionName === "api") {
      options.api = true;
      continue;
    }
    // Peek at the next token; only consume it once we know it is this flag's value.
    const next = rawArgs[i + 1];
    if (optionName === "filter") {
      const separatorAt = next === void 0 ? -1 : next.indexOf("=");
      if (separatorAt !== -1) {
        filters.push([next.slice(0, separatorAt), next.slice(separatorAt + 1)]);
        i++;
      } else if (next !== void 0 && !next.startsWith("-")) {
        // Malformed filter (no "="): drop it so it is not mistaken for the target.
        i++;
      }
      continue;
    }
    if (next !== void 0 && !next.startsWith("-")) {
      i++;
      if (next) {
        options[optionName] = integerOptions.includes(optionName) ? Number.parseInt(next, 10) : next;
      }
    }
  }
  return { options, target, filters };
}
780
/**
 * Implements the `evals run` subcommand.
 *
 * Steps:
 *   1. Merge persisted defaults with CLI options (CLI wins) and translate them
 *      into EVAL_* / USE_API environment variables for the eval runner.
 *   2. Resolve the target into either an eval name (`name=<eval>`), a category
 *      (`category <name>`), or nothing (run everything). Targets prefixed with
 *      `b:` / `benchmark:` select a benchmark and additionally export
 *      EVAL_DATASET plus per-benchmark LIMIT / SAMPLE / filter variables.
 *   3. Run `pnpm run build`; on success start the compiled `index.eval.js`
 *      next to this file, or fall back to running `index.eval.ts` through tsx.
 *   4. Exit with the runner's exit status and forward SIGINT / SIGTERM to it.
 *
 * Fixes over the previous implementation:
 *   - A runner that dies from a signal (exit code `null`) no longer makes this
 *     process exit 0; it exits 130 / 143 when the signal was forwarded from
 *     here and 1 otherwise.
 *   - `process.execPath` is spawned without `shell: true`. With a shell, Node
 *     joins the arguments unescaped, which breaks on paths containing spaces
 *     and lets the shell reinterpret targets such as `agent/*`.
 *
 * @param {string[]} args - Arguments following the `run` command.
 */
function handleRun(args) {
  const config = loadConfig();
  const { options, target, filters } = parseArgs(args);
  // CLI flags override the persisted defaults.
  const finalOptions = __spreadValues(__spreadValues({}, config.defaults), options);
  const env2 = __spreadValues({}, import_process.default.env);
  env2.EVAL_ENV = finalOptions.env === "browserbase" ? "BROWSERBASE" : "LOCAL";
  if (finalOptions.api) {
    env2.USE_API = "true";
  }
  if (finalOptions.trials) {
    env2.EVAL_TRIAL_COUNT = String(finalOptions.trials);
  }
  if (finalOptions.concurrency) {
    env2.EVAL_MAX_CONCURRENCY = String(finalOptions.concurrency);
  }
  if (finalOptions.provider) {
    env2.EVAL_PROVIDER = finalOptions.provider;
  }
  if (finalOptions.model) {
    env2.EVAL_MODEL_OVERRIDE = finalOptions.model;
  }
  let evalName;
  let categoryFilter;
  if (target) {
    if (target.startsWith("b:") || target.startsWith("benchmark:")) {
      const benchmarkName = target.replace(/^(b:|benchmark:)/, "");
      if (!config.benchmarks[benchmarkName]) {
        console.error(source_default.red(`Error: Unknown benchmark "${benchmarkName}"`));
        console.log(
          source_default.dim(
            `Available benchmarks: ${Object.keys(config.benchmarks).join(", ")}`
          )
        );
        import_process.default.exit(1);
        return;
      }
      const benchmarkMap = {
        webbench: "agent/webbench",
        gaia: "agent/gaia",
        webvoyager: "agent/webvoyager",
        osworld: "agent/osworld",
        onlineMind2Web: "agent/onlineMind2Web"
      };
      // NOTE(review): a benchmark present in the config but absent from this
      // map leaves evalName undefined, so no name filter is passed to the
      // runner — confirm whether that is intended.
      evalName = benchmarkMap[benchmarkName];
      env2.EVAL_DATASET = benchmarkName;
      const envPrefix = `EVAL_${benchmarkName.toUpperCase()}`;
      if (options.limit) {
        env2.EVAL_MAX_K = String(options.limit);
        env2[`${envPrefix}_LIMIT`] = String(options.limit);
      }
      if (options.sample) {
        env2[`${envPrefix}_SAMPLE`] = String(options.sample);
      }
      filters.forEach(([key, value]) => {
        env2[`${envPrefix}_${key.toUpperCase()}`] = value;
      });
    } else if (target !== "all") {
      if (target.includes("/") || target.includes("*")) {
        // Paths and glob patterns are passed through as eval names.
        evalName = target;
      } else {
        const categories = /* @__PURE__ */ new Set();
        config.tasks.forEach((task) => {
          task.categories.forEach((cat) => categories.add(cat));
        });
        if (categories.has(target)) {
          categoryFilter = target;
        } else {
          evalName = target;
        }
      }
    }
  }
  // Argument format expected by the eval entry point.
  const legacyArgs = [];
  if (evalName) {
    legacyArgs.push(`name=${evalName}`);
  } else if (categoryFilter) {
    legacyArgs.push("category", categoryFilter);
  }
  console.log(source_default.blue.bold("\nRunning evals...\n"));
  // `pnpm` is resolved through the shell; its arguments are constants.
  const buildChild = (0, import_child_process.spawn)("pnpm", ["run", "build"], {
    stdio: "inherit",
    shell: true
  });
  buildChild.on("exit", (buildCode) => {
    if (buildCode !== 0) {
      import_process.default.exit(buildCode || 1);
      return;
    }
    const compiledEvalPath = import_path.default.join(__dirname, "index.eval.js");
    const sourceEvalPath = import_path.default.resolve(
      __dirname,
      "..",
      "..",
      "packages",
      "evals",
      "index.eval.ts"
    );
    let child;
    if (import_fs.default.existsSync(compiledEvalPath)) {
      child = (0, import_child_process.spawn)(import_process.default.execPath, [compiledEvalPath, ...legacyArgs], {
        env: env2,
        stdio: "inherit"
      });
    } else {
      let tsxCliPath;
      try {
        tsxCliPath = require.resolve("tsx/dist/cli.js");
      } catch (e) {
        // tsx is not resolvable from here; fall back to `tsx` on PATH below.
      }
      const tsxArgs = [sourceEvalPath, ...legacyArgs];
      if (tsxCliPath) {
        child = (0, import_child_process.spawn)(import_process.default.execPath, [tsxCliPath, ...tsxArgs], {
          env: env2,
          stdio: "inherit"
        });
      } else {
        // A bare command name is resolved through the shell, as before.
        child = (0, import_child_process.spawn)("tsx", tsxArgs, {
          env: env2,
          stdio: "inherit",
          shell: true
        });
      }
    }
    // Set when a signal is forwarded from this process, so the exit status
    // reflects the interruption even if the runner dies before the timeout.
    let signalExitCode;
    child.on("error", (error) => {
      console.error(source_default.red(`Error: Failed to start eval process: ${error.message}`));
      import_process.default.exit(1);
    });
    child.on("exit", (code) => {
      if (code != null) {
        import_process.default.exit(code);
        return;
      }
      // code is null when the runner was terminated by a signal.
      import_process.default.exit(signalExitCode !== void 0 ? signalExitCode : 1);
    });
    const forwardSignal = (signal, exitCode) => {
      import_process.default.on(signal, () => {
        console.log(`\n\nReceived ${signal}, killing child process...`);
        signalExitCode = exitCode;
        child.kill(signal);
        // Give the runner one second to shut down before forcing it.
        setTimeout(() => {
          child.kill("SIGKILL");
          import_process.default.exit(exitCode);
        }, 1e3);
      });
    };
    forwardSignal("SIGINT", 130);
    forwardSignal("SIGTERM", 143);
  });
}
931
/**
 * CLI entry point: dispatches on the first command-line argument.
 *
 * `run`, `list` and `config` receive the remaining arguments; `help`,
 * `--help` and `-h` print usage. With no command, or with an unrecognized
 * flag, an error and the usage text are printed and the process exits with
 * status 1. Any other word is treated as a target for `run`, so the full
 * argument list is passed to handleRun.
 */
function main() {
  const args = import_process.default.argv.slice(2);
  const [command, ...commandArgs] = args;

  if (command === void 0) {
    console.error(source_default.red("Error: No command specified"));
    printHelp();
    import_process.default.exit(1);
    return;
  }
  if (command === "run") {
    handleRun(commandArgs);
    return;
  }
  if (command === "list") {
    handleList(commandArgs);
    return;
  }
  if (command === "config") {
    handleConfig(commandArgs);
    return;
  }
  if (command === "help" || command === "--help" || command === "-h") {
    printHelp();
    return;
  }
  if (command.startsWith("-")) {
    console.error(source_default.red(`Error: Unknown command "${command}"`));
    printHelp();
    import_process.default.exit(1);
    return;
  }
  // Bare word that is not a known command: shorthand for `run <target> ...`.
  handleRun(args);
}
965
- main();