@wix/evalforge-evaluator 0.186.0 → 0.188.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -0
- package/build/index.js +346 -93
- package/build/index.js.map +4 -4
- package/build/index.mjs +345 -93
- package/build/index.mjs.map +4 -4
- package/build/types/ambassador-converters.d.ts +8 -1
- package/build/types/api-client.d.ts +3 -1
- package/build/types/run-scenario/environment.d.ts +6 -2
- package/build/types/run-scenario/index.d.ts +2 -1
- package/build/types/run-scenario/install-dependencies.d.ts +9 -1
- package/package.json +7 -6
package/README.md
CHANGED
|
@@ -50,6 +50,10 @@ Backend calls go through the evalforge ambassador packages (gRPC via
|
|
|
50
50
|
user-editable fields, not the system state transitions
|
|
51
51
|
(`status`/`completedAt`/`jobError`/`jobStatus`) the evaluator writes.
|
|
52
52
|
|
|
53
|
+
## Live trace during environment setup
|
|
54
|
+
|
|
55
|
+
For templated runs, the evaluator emits `PROGRESS` trace events during environment setup — "Setting up environment", "Fetching template files", "Installing dependencies", "Environment ready" — via the shared `emitTraceEvent` helper. Because `emitTraceEvent` writes to stdout (captured by the backend for local runs) and also calls the `pushEvent` callback (used for remote jobs via `tracePushUrl`), these events appear in the live trace in both local and remote runs. Without them, the trace panel stays blank during the often multi-minute setup phase before the agent starts.
|
|
56
|
+
|
|
53
57
|
## Scripts
|
|
54
58
|
|
|
55
59
|
```bash
|
package/build/index.js
CHANGED
|
@@ -1277,7 +1277,7 @@ var require_error = __commonJS({
|
|
|
1277
1277
|
"toJSON",
|
|
1278
1278
|
"__CANCEL__"
|
|
1279
1279
|
];
|
|
1280
|
-
var
|
|
1280
|
+
var HttpError2 = class _HttpError extends Error {
|
|
1281
1281
|
constructor(error) {
|
|
1282
1282
|
var _a;
|
|
1283
1283
|
super(error.message);
|
|
@@ -1294,9 +1294,9 @@ var require_error = __commonJS({
|
|
|
1294
1294
|
return (0, headers_1.requestIdOrEmptyString)(this.response);
|
|
1295
1295
|
}
|
|
1296
1296
|
};
|
|
1297
|
-
exports2.HttpError =
|
|
1297
|
+
exports2.HttpError = HttpError2;
|
|
1298
1298
|
function createHttpError(...args) {
|
|
1299
|
-
return new
|
|
1299
|
+
return new HttpError2(...args);
|
|
1300
1300
|
}
|
|
1301
1301
|
exports2.createHttpError = createHttpError;
|
|
1302
1302
|
}
|
|
@@ -6351,6 +6351,133 @@ function getLatestCapabilityVersion(payload) {
|
|
|
6351
6351
|
return __getLatestCapabilityVersion;
|
|
6352
6352
|
}
|
|
6353
6353
|
|
|
6354
|
+
// ../../node_modules/@wix/ambassador-evalforge-v1-site-provisioning/es/build/http.impl.js
|
|
6355
|
+
// Serializer schemas for the two site-provisioning RPCs. Each is an empty
// object that the request builders below hand to `serializer(...)`.
var _provisionScenarioSiteRequest = {};
var _provisionScenarioSiteResponse = {};
var _deleteProvisionedSiteRequest = {};
var _deleteProvisionedSiteResponse = {};
|
|
6359
|
+
/**
 * Resolves the URL for a SiteProvisioningService call.
 *
 * Adds the host-pattern -> path-mapping table for the evalforge backend to
 * `opts` (the object is mutated in place via `Object.assign`) and delegates
 * to `resolveUrl`.
 *
 * @param {object} opts - Resolver options; callers in this file pass
 *   `protoPath`, `data` and `host`.
 * @returns {*} Whatever `resolveUrl` returns for the augmented options.
 */
function resolveWixEvalforgeV1SiteProvisioningServiceUrl(opts) {
  // Every host pattern maps one source path prefix to an empty destination.
  // A fresh array/object is created per key so no entries are shared.
  const mappingFor = (srcPath) => [{ srcPath, destPath: "" }];
  const gatewayPath = "/_api/evalforge-backend";

  const domainToMappings = {
    "dev._base_domain_": mappingFor(gatewayPath),
    "api._api_base_domain_": mappingFor("/evalforge-backend"),
    "bo._base_domain_": mappingFor(gatewayPath),
    "wixbo.ai": mappingFor(gatewayPath),
    "wix-bo.com": mappingFor(gatewayPath),
    "manage._base_domain_": mappingFor(gatewayPath)
  };

  const augmentedOpts = Object.assign(opts, { domainToMappings });
  return resolveUrl(augmentedOpts);
}
|
|
6400
|
+
/**
 * Builds the request factory for
 * `wix.evalforge.v1.SiteProvisioningService.ProvisionScenarioSite`.
 *
 * @param {object} payload - Request payload; serialized with the request
 *   serializer each time the factory is invoked.
 * @returns {Function} A function taking `{ host }` and returning the request
 *   metadata (POST method, resolved URL, serialized data, response
 *   transformer). The function carries `fromReq` and `__isAmbassador = true`.
 */
function provisionScenarioSite(payload) {
  const { toJSON: toReq, fromJSON: fromReq } = serializer(_provisionScenarioSiteRequest, {});
  const { fromJSON: fromRes } = serializer(_provisionScenarioSiteResponse, {});

  function __provisionScenarioSite({ host }) {
    const serializedData = toReq(payload);
    const url = resolveWixEvalforgeV1SiteProvisioningServiceUrl({
      protoPath: "/v1/projects/{projectId}/site-provisioning/provision-site",
      data: serializedData,
      host
    });
    return {
      entityFqdn: "wix.evalforge.v1.site_provisioning",
      method: "POST",
      methodFqn: "wix.evalforge.v1.SiteProvisioningService.ProvisionScenarioSite",
      migrationOptions: { optInTransformResponse: true },
      url,
      data: serializedData,
      transformResponse: fromRes
    };
  }

  __provisionScenarioSite.fromReq = fromReq;
  __provisionScenarioSite.__isAmbassador = true;
  return __provisionScenarioSite;
}
|
|
6427
|
+
/**
 * Builds the request factory for
 * `wix.evalforge.v1.SiteProvisioningService.DeleteProvisionedSite`.
 *
 * @param {object} payload - Request payload; serialized with the request
 *   serializer each time the factory is invoked.
 * @returns {Function} A function taking `{ host }` and returning the request
 *   metadata (POST method, resolved URL, serialized data, response
 *   transformer). The function carries `fromReq` and `__isAmbassador = true`.
 */
function deleteProvisionedSite(payload) {
  const { toJSON: toReq, fromJSON: fromReq } = serializer(_deleteProvisionedSiteRequest, {});
  const { fromJSON: fromRes } = serializer(_deleteProvisionedSiteResponse, {});

  function __deleteProvisionedSite({ host }) {
    const serializedData = toReq(payload);
    const url = resolveWixEvalforgeV1SiteProvisioningServiceUrl({
      protoPath: "/v1/projects/{projectId}/site-provisioning/delete-site",
      data: serializedData,
      host
    });
    return {
      entityFqdn: "wix.evalforge.v1.site_provisioning",
      method: "POST",
      methodFqn: "wix.evalforge.v1.SiteProvisioningService.DeleteProvisionedSite",
      migrationOptions: { optInTransformResponse: true },
      url,
      data: serializedData,
      transformResponse: fromRes
    };
  }

  __deleteProvisionedSite.fromReq = fromReq;
  __deleteProvisionedSite.__isAmbassador = true;
  return __deleteProvisionedSite;
}
|
|
6454
|
+
|
|
6455
|
+
// ../../node_modules/@wix/ambassador-evalforge-v1-test-scenario/es/build/types.impl.js
|
|
6456
|
+
// String enum: every member's value is its own name.
var WebhookIdentityType = {
  UNKNOWN: "UNKNOWN",
  ANONYMOUS_VISITOR: "ANONYMOUS_VISITOR",
  MEMBER: "MEMBER",
  WIX_USER: "WIX_USER",
  APP: "APP"
};
|
|
6464
|
+
// String enum of HTTP methods for site bootstrap steps; every member's value
// is its own name, including the UNSPECIFIED placeholder.
var SiteBootstrapHttpMethod = {
  SITE_BOOTSTRAP_HTTP_METHOD_UNSPECIFIED: "SITE_BOOTSTRAP_HTTP_METHOD_UNSPECIFIED",
  GET: "GET",
  POST: "POST",
  PUT: "PUT",
  PATCH: "PATCH",
  DELETE: "DELETE"
};
|
|
6473
|
+
// String enum of site-setup modes; every member's value is its own name.
var Mode = {
  UNKNOWN_MODE: "UNKNOWN_MODE",
  NONE: "NONE",
  CLONE: "CLONE",
  TEMPLATE: "TEMPLATE"
};
|
|
6480
|
+
|
|
6354
6481
|
// src/ambassador-converters.ts
|
|
6355
6482
|
function toProtoEnum(prefix, value) {
|
|
6356
6483
|
return `${prefix}${value.toUpperCase()}`;
|
|
@@ -6657,9 +6784,37 @@ function testScenarioFromProto(wire) {
|
|
|
6657
6784
|
})
|
|
6658
6785
|
),
|
|
6659
6786
|
createdAt: fromProtoDate(wire.createdAt) ?? "",
|
|
6660
|
-
updatedAt: fromProtoDate(wire.updatedAt) ?? ""
|
|
6787
|
+
updatedAt: fromProtoDate(wire.updatedAt) ?? "",
|
|
6788
|
+
siteSetup: siteSetupFromAmbassador(wire.siteSetup)
|
|
6661
6789
|
};
|
|
6662
6790
|
}
|
|
6791
|
+
/**
 * Converts the wire-format site setup of a scenario into the evaluator's
 * internal shape.
 *
 * Bootstrap steps without a method, with the UNSPECIFIED method, or without a
 * URL are dropped; the remaining steps get a lower-cased method name.
 *
 * @param {object|null|undefined} wire - Wire-format site setup.
 * @returns {object|undefined} `{ mode: "clone", sourceSiteId, bootstrap }`,
 *   `{ mode: "template", templateId, bootstrap }`, or `undefined` when `wire`
 *   is missing or its mode is neither CLONE nor TEMPLATE. `bootstrap` is
 *   `undefined` when no usable step remains.
 */
function siteSetupFromAmbassador(wire) {
  if (!wire) {
    return undefined;
  }

  const isUsableStep = (step) => {
    if (!step.method) {
      return false;
    }
    if (step.method === SiteBootstrapHttpMethod.SITE_BOOTSTRAP_HTTP_METHOD_UNSPECIFIED) {
      return false;
    }
    return Boolean(step.url);
  };

  const rawSteps = wire.bootstrap?.steps ?? [];
  const steps = rawSteps.flatMap((step) => {
    if (!isUsableStep(step)) {
      return [];
    }
    return [
      {
        label: step.label ?? undefined,
        method: step.method.toLowerCase(),
        // Usable steps always carry a non-empty URL, so no fallback is needed.
        url: step.url,
        body: step.body ?? undefined
      }
    ];
  });
  const bootstrap = steps.length === 0 ? undefined : { steps };

  switch (wire.mode) {
    case Mode.CLONE:
      return {
        mode: "clone",
        sourceSiteId: wire.cloneOptions?.sourceSiteId ?? "",
        bootstrap
      };
    case Mode.TEMPLATE:
      return {
        mode: "template",
        templateId: wire.templateOptions?.templateId ?? "",
        bootstrap
      };
    default:
      return undefined;
  }
}
|
|
6663
6818
|
function templateFromProto(wire) {
|
|
6664
6819
|
return {
|
|
6665
6820
|
id: wire.id ?? "",
|
|
@@ -6713,8 +6868,36 @@ function capabilityVersionFromProto(wire, projectId2) {
|
|
|
6713
6868
|
createdAt: fromProtoDate(wire.createdAt) ?? ""
|
|
6714
6869
|
};
|
|
6715
6870
|
}
|
|
6871
|
+
/**
 * Converts a wire-format provisioned site into the internal shape.
 *
 * @param {object} proto - Wire-format site; must not be null/undefined.
 * @returns {{id: string, url: (string|undefined), editorUrl: (string|undefined)}}
 *   `id` falls back to `""`; nullish `url` / `editorUrl` become `undefined`.
 */
function provisionedSiteFromProto(proto) {
  const { id, url, editorUrl } = proto;
  return {
    id: id ?? "",
    url: url ?? undefined,
    editorUrl: editorUrl ?? undefined
  };
}
|
|
6878
|
+
/**
 * Converts a wire-format site bootstrap result into the internal shape.
 *
 * @param {object|null|undefined} proto - Wire-format bootstrap result.
 * @returns {{steps: Array<object>}|undefined} `undefined` when `proto` is
 *   falsy; otherwise one entry per wire step with `statusCode` defaulting to
 *   0, `ok` defaulting to false, and nullish `label` / `error` normalised to
 *   `undefined`.
 */
function siteBootstrapResultFromProto(proto) {
  if (!proto) {
    return undefined;
  }
  const toStepResult = ({ label, statusCode, ok, error }) => ({
    label: label ?? undefined,
    statusCode: statusCode ?? 0,
    ok: ok ?? false,
    error: error ?? undefined
  });
  const wireSteps = proto.steps ?? [];
  return { steps: wireSteps.map(toStepResult) };
}
|
|
6716
6889
|
|
|
6717
6890
|
// src/api-client.ts
|
|
6891
|
+
/**
 * Rethrows a failed request with a more descriptive message.
 *
 * When `err` is an `HttpError`, throws a new `Error` whose message names the
 * attempted action and, when available, the HTTP status and request id. The
 * original error is attached as `cause` so its stack and response stay
 * reachable for debugging. Any other error is rethrown unchanged.
 *
 * @param {unknown} err - The error raised by the HTTP client.
 * @param {string} action - Description of what was being attempted; inserted
 *   after "Failed to ".
 * @returns {never} This function always throws.
 * @throws {Error} The wrapped `HttpError`, or `err` itself for other errors.
 */
function rethrowWithRequestId(err, action) {
  if (!(err instanceof import_http_client.HttpError)) {
    throw err;
  }
  const status = err.response?.status;
  const requestId = err.requestId;
  const statusPart = status !== undefined ? ` (HTTP ${status})` : "";
  const requestIdPart = requestId ? ` [request id: ${requestId}]` : "";
  throw new Error(
    `Failed to ${action}${statusPart}${requestIdPart}: ${err.message}`,
    // Keep the underlying HttpError instead of discarding it.
    { cause: err }
  );
}
|
|
6718
6901
|
function resolveAmbassadorBaseUrl(serverUrl) {
|
|
6719
6902
|
try {
|
|
6720
6903
|
return new URL(serverUrl).origin;
|
|
@@ -6890,6 +7073,29 @@ function createApiClient(serverUrl, options = "") {
|
|
|
6890
7073
|
},
|
|
6891
7074
|
updateEvalRun(projectId2, evalRunId2, update) {
|
|
6892
7075
|
return putJson(`/projects/${projectId2}/eval-runs/${evalRunId2}`, update);
|
|
7076
|
+
},
|
|
7077
|
+
async provisionScenarioSite(projectId2, evalRunId2, scenarioId) {
|
|
7078
|
+
const res = await httpClient.request(provisionScenarioSite({ projectId: projectId2, evalRunId: evalRunId2, scenarioId })).catch(
|
|
7079
|
+
(err) => rethrowWithRequestId(err, `provision a site for scenario ${scenarioId}`)
|
|
7080
|
+
);
|
|
7081
|
+
const site = res.data.site;
|
|
7082
|
+
if (!site) {
|
|
7083
|
+
throw new Error(
|
|
7084
|
+
`Site provisioning for scenario ${scenarioId} returned no site.`
|
|
7085
|
+
);
|
|
7086
|
+
}
|
|
7087
|
+
return {
|
|
7088
|
+
...provisionedSiteFromProto(site),
|
|
7089
|
+
bootstrapResult: siteBootstrapResultFromProto(res.data.bootstrapResult)
|
|
7090
|
+
};
|
|
7091
|
+
},
|
|
7092
|
+
async deleteProvisionedSite(projectId2, siteId) {
|
|
7093
|
+
await httpClient.request(deleteProvisionedSite({ projectId: projectId2, siteId })).catch((err) => {
|
|
7094
|
+
console.warn(
|
|
7095
|
+
"[site-provisioning] deleteProvisionedSite failed \u2014 site may remain:",
|
|
7096
|
+
err
|
|
7097
|
+
);
|
|
7098
|
+
});
|
|
6893
7099
|
}
|
|
6894
7100
|
};
|
|
6895
7101
|
}
|
|
@@ -7256,10 +7462,13 @@ function installWithCache(workDir, exec, cacheBase, pm) {
|
|
|
7256
7462
|
);
|
|
7257
7463
|
}
|
|
7258
7464
|
}
|
|
7259
|
-
async function installDependencies(workDir,
|
|
7465
|
+
async function installDependencies(workDir, onProgress, options = {}) {
|
|
7260
7466
|
if (!(0, import_fs.existsSync)(import_path2.default.join(workDir, "package.json"))) {
|
|
7261
7467
|
return;
|
|
7262
7468
|
}
|
|
7469
|
+
const exec = options.exec ?? import_child_process.execFileSync;
|
|
7470
|
+
const cacheBase = options.cacheBase;
|
|
7471
|
+
onProgress("Installing dependencies...");
|
|
7263
7472
|
const pm = detectPackageManager(workDir);
|
|
7264
7473
|
if (cacheBase) {
|
|
7265
7474
|
installWithCache(workDir, exec, cacheBase, pm);
|
|
@@ -7335,7 +7544,8 @@ function writeWixEnvFile(workDir) {
|
|
|
7335
7544
|
console.warn("[environment] Failed to read wix.config.json");
|
|
7336
7545
|
}
|
|
7337
7546
|
}
|
|
7338
|
-
async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
7547
|
+
async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId, onProgress, options = {}) {
|
|
7548
|
+
const template = options.template;
|
|
7339
7549
|
const baseDir = config.evaluationsDir ?? import_path3.default.join((0, import_os.tmpdir)(), "evalforge-evaluations");
|
|
7340
7550
|
const nodeModulesCacheDir = import_path3.default.join(baseDir, "_node_modules_cache");
|
|
7341
7551
|
if (template) {
|
|
@@ -7349,10 +7559,14 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
7349
7559
|
(0, import_fs2.rmSync)(workDir2, { recursive: true });
|
|
7350
7560
|
}
|
|
7351
7561
|
(0, import_fs2.mkdirSync)(workDir2, { recursive: true });
|
|
7562
|
+
onProgress("Fetching template files...");
|
|
7352
7563
|
await fetchAndWriteTemplateFiles(template, workDir2);
|
|
7353
7564
|
console.log(`Template files written to ${workDir2}`);
|
|
7354
7565
|
writeWixEnvFile(workDir2);
|
|
7355
|
-
await installDependencies(workDir2,
|
|
7566
|
+
await installDependencies(workDir2, onProgress, {
|
|
7567
|
+
cacheBase: nodeModulesCacheDir
|
|
7568
|
+
});
|
|
7569
|
+
onProgress("Environment ready");
|
|
7356
7570
|
return workDir2;
|
|
7357
7571
|
}
|
|
7358
7572
|
const workDir = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
|
|
@@ -7364,6 +7578,13 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
7364
7578
|
return workDir;
|
|
7365
7579
|
}
|
|
7366
7580
|
|
|
7581
|
+
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7582
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
7583
|
+
/**
 * Emits a live-trace event through both delivery channels.
 *
 * Logs the event as `TRACE_EVENT_PREFIX` followed by its JSON form, then
 * hands the same event object to `pushEvent` when one is supplied.
 *
 * @param {object} event - Trace event; must be JSON-serializable.
 * @param {Function} [pushEvent] - Optional callback receiving the event.
 * @returns {void}
 */
function emitTraceEvent(event, pushEvent) {
  const prefix = import_evalforge_types2.TRACE_EVENT_PREFIX;
  const payload = JSON.stringify(event);
  console.log(`${prefix}${payload}`);
  if (pushEvent !== null && pushEvent !== undefined) {
    pushEvent(event);
  }
}
|
|
7587
|
+
|
|
7367
7588
|
// src/run-scenario/run-agent-with-context.ts
|
|
7368
7589
|
var import_crypto5 = require("crypto");
|
|
7369
7590
|
|
|
@@ -7528,7 +7749,7 @@ var import_crypto2 = require("crypto");
|
|
|
7528
7749
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7529
7750
|
var import_promises5 = require("fs/promises");
|
|
7530
7751
|
var import_path6 = require("path");
|
|
7531
|
-
var
|
|
7752
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
7532
7753
|
|
|
7533
7754
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
7534
7755
|
var import_promises4 = require("fs/promises");
|
|
@@ -7589,7 +7810,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
7589
7810
|
}
|
|
7590
7811
|
const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
|
|
7591
7812
|
const content = JSON.stringify(
|
|
7592
|
-
{ [
|
|
7813
|
+
{ [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
7593
7814
|
null,
|
|
7594
7815
|
2
|
|
7595
7816
|
);
|
|
@@ -7815,13 +8036,6 @@ function buildConversation(timestampedMessages) {
|
|
|
7815
8036
|
return messages;
|
|
7816
8037
|
}
|
|
7817
8038
|
|
|
7818
|
-
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7819
|
-
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
7820
|
-
function emitTraceEvent(event, pushEvent) {
|
|
7821
|
-
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
7822
|
-
pushEvent?.(event);
|
|
7823
|
-
}
|
|
7824
|
-
|
|
7825
8039
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7826
8040
|
var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
7827
8041
|
async function* buildPromptStream(triggerPrompt, images) {
|
|
@@ -11783,87 +11997,124 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
11783
11997
|
}
|
|
11784
11998
|
|
|
11785
11999
|
// src/run-scenario/index.ts
|
|
11786
|
-
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions, pushEvent) {
|
|
12000
|
+
/**
 * Runs a single scenario end to end and scores it.
 *
 * Steps, in order:
 *  1. When an API client and project id are given and the scenario has a
 *     `siteSetup` whose mode is not "none", provision a site for it.
 *  2. Report the first failed site bootstrap step, if any, as a warning and as
 *     a PROGRESS trace event (the run continues regardless).
 *  3. Prepare the working directory, emitting PROGRESS trace events.
 *  4. Run the agent; when a site was provisioned, its id is appended to the
 *     trigger prompt as "Site ID: <id>".
 *  5. Evaluate the inline plus resolved assertions and compute the pass rate.
 *  6. Always delete the provisioned site afterwards.
 *
 * @param {object} config - Evaluator configuration (`aiGatewayUrl` and
 *   `aiGatewayHeaders` are read here; the rest is passed through).
 * @param {string} evalRunId2 - Eval run identifier.
 * @param {object} scenario - Scenario to run (`id`, `name`, `triggerPrompt`,
 *   optional `assertions` and `siteSetup`).
 * @param {object} evalData - Eval run data used to derive target id and name.
 * @param {object} [template] - Template forwarded to working-directory setup.
 * @param {Array} [resolvedAssertions] - Extra assertions appended to the
 *   scenario's inline ones.
 * @param {Function} [pushEvent] - Optional live-trace push callback.
 * @param {object} [apiClient] - API client used for site provisioning/cleanup.
 * @param {string} [projectId2] - Project id required for site provisioning.
 * @returns {Promise<object>} The agent result extended with
 *   `assertionResults`, `passed`, `failed`, `passRate` and `provisionedSite`.
 */
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions, pushEvent, apiClient, projectId2) {
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
  const emitSetupProgress = (outputPreview) => emitTraceEvent(
    {
      evalRunId: evalRunId2,
      scenarioId: scenario.id,
      scenarioName: scenario.name,
      targetId,
      targetName,
      stepNumber: 0,
      type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
      outputPreview,
      elapsedMs: 0,
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      isComplete: false
    },
    pushEvent
  );
  let provisionedSite;
  if (apiClient && projectId2 && scenario.siteSetup && scenario.siteSetup.mode !== "none") {
    provisionedSite = await apiClient.provisionScenarioSite(
      projectId2,
      evalRunId2,
      scenario.id
    );
  }
  // Everything after provisioning runs inside try/finally so the site is
  // cleaned up even if reporting or setup throws.
  try {
    const failedStep = provisionedSite?.bootstrapResult?.steps.find((s) => !s.ok);
    if (failedStep) {
      const message = `Site bootstrap step ${failedStep.label ? `"${failedStep.label}" ` : ""}failed (HTTP ${failedStep.statusCode}): ${failedStep.error ?? "unknown error"}`;
      console.warn(`[run-scenario] ${message}`);
      // Go through the shared emitter so the event reaches both stdout and
      // pushEvent, like every other setup progress event.
      emitSetupProgress(message);
    }
    const effectiveTriggerPrompt = provisionedSite ? `${scenario.triggerPrompt}\n\nSite ID: ${provisionedSite.id}` : scenario.triggerPrompt;
    emitSetupProgress("Setting up environment...");
    const workDir = await prepareWorkingDirectory(
      config,
      evalRunId2,
      targetId,
      scenario.id,
      emitSetupProgress,
      { template }
    );
    const partialResult = await runAgentWithContext(
      config,
      evalRunId2,
      { ...scenario, triggerPrompt: effectiveTriggerPrompt },
      evalData,
      workDir,
      pushEvent
    );
    const inlineAssertions = scenario.assertions ?? [];
    const assertions = [
      ...inlineAssertions,
      ...resolvedAssertions ?? []
    ];
    // path -> status lookup used to annotate each file diff below.
    const templateFilesMap = new Map(
      (partialResult.templateFiles ?? []).map((f) => [f.path, f.status])
    );
    const evaluationInput = {
      outputText: partialResult.outputText,
      llmTrace: partialResult.llmTrace,
      fileDiffs: partialResult.fileDiffs?.map((d) => ({
        path: d.path,
        status: templateFilesMap.get(d.path)
      })),
      durationMs: partialResult.duration
    };
    const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
    const assertionContext = {
      workDir,
      defaultJudgeModel,
      llmConfig: {
        baseUrl: config.aiGatewayUrl,
        headers: config.aiGatewayHeaders
      }
    };
    const assertionResults = assertions.length > 0 ? await (0, import_eval_assertions.evaluateAssertions)(
      evaluationInput,
      assertions,
      assertionContext
    ) : [];
    const passed = assertionResults.filter(
      (r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
    ).length;
    const failed = assertionResults.filter(
      (r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
    ).length;
    const total = assertionResults.length;
    // With no assertions at all the scenario counts as fully passing.
    const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
    return {
      ...partialResult,
      assertionResults,
      passed,
      failed,
      passRate,
      provisionedSite
    };
  } finally {
    if (provisionedSite && apiClient && projectId2) {
      await apiClient.deleteProvisionedSite(projectId2, provisionedSite.id);
    }
  }
}
|
|
11868
12119
|
|
|
11869
12120
|
// src/evaluation-loop.ts
|
|
@@ -12143,7 +12394,9 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
12143
12394
|
evalData,
|
|
12144
12395
|
template,
|
|
12145
12396
|
resolvedAssertions,
|
|
12146
|
-
pushEvent
|
|
12397
|
+
pushEvent,
|
|
12398
|
+
api,
|
|
12399
|
+
projectId2
|
|
12147
12400
|
);
|
|
12148
12401
|
},
|
|
12149
12402
|
addResult: async (result) => {
|