@wix/evalforge-evaluator 0.186.0 → 0.188.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -0
- package/build/index.js +346 -93
- package/build/index.js.map +4 -4
- package/build/index.mjs +345 -93
- package/build/index.mjs.map +4 -4
- package/build/types/ambassador-converters.d.ts +8 -1
- package/build/types/api-client.d.ts +3 -1
- package/build/types/run-scenario/environment.d.ts +6 -2
- package/build/types/run-scenario/index.d.ts +2 -1
- package/build/types/run-scenario/install-dependencies.d.ts +9 -1
- package/package.json +7 -6
package/build/index.mjs
CHANGED
|
@@ -1282,7 +1282,7 @@ var require_error = __commonJS({
|
|
|
1282
1282
|
"toJSON",
|
|
1283
1283
|
"__CANCEL__"
|
|
1284
1284
|
];
|
|
1285
|
-
var
|
|
1285
|
+
var HttpError2 = class _HttpError extends Error {
|
|
1286
1286
|
constructor(error) {
|
|
1287
1287
|
var _a;
|
|
1288
1288
|
super(error.message);
|
|
@@ -1299,9 +1299,9 @@ var require_error = __commonJS({
|
|
|
1299
1299
|
return (0, headers_1.requestIdOrEmptyString)(this.response);
|
|
1300
1300
|
}
|
|
1301
1301
|
};
|
|
1302
|
-
exports.HttpError =
|
|
1302
|
+
exports.HttpError = HttpError2;
|
|
1303
1303
|
function createHttpError(...args) {
|
|
1304
|
-
return new
|
|
1304
|
+
return new HttpError2(...args);
|
|
1305
1305
|
}
|
|
1306
1306
|
exports.createHttpError = createHttpError;
|
|
1307
1307
|
}
|
|
@@ -6356,6 +6356,133 @@ function getLatestCapabilityVersion(payload) {
|
|
|
6356
6356
|
return __getLatestCapabilityVersion;
|
|
6357
6357
|
}
|
|
6358
6358
|
|
|
6359
|
+
// ../../node_modules/@wix/ambassador-evalforge-v1-site-provisioning/es/build/http.impl.js
|
|
6360
|
+
// Schema placeholders for the site-provisioning request/response payloads.
// All four are empty objects; the request builders in this module hand them
// to `serializer(...)` as the first argument.
var _deleteProvisionedSiteRequest = {};
|
|
6361
|
+
var _deleteProvisionedSiteResponse = {};
|
|
6362
|
+
var _provisionScenarioSiteRequest = {};
|
|
6363
|
+
var _provisionScenarioSiteResponse = {};
|
|
6364
|
+
/**
 * Resolves the request URL for the EvalForge site-provisioning service.
 *
 * Adds this service's domain-to-path mapping table to the given options and
 * delegates to `resolveUrl`.
 *
 * @param {object} opts - Options forwarded to `resolveUrl` (the builders in
 *   this module pass `protoPath`, `data` and `host`). The object is copied,
 *   never mutated.
 * @returns {*} Whatever `resolveUrl` returns for the merged options.
 */
function resolveWixEvalforgeV1SiteProvisioningServiceUrl(opts) {
  // A fresh array/object per domain, so no two entries share a reference.
  const mapFrom = (srcPath) => [{ srcPath, destPath: "" }];
  const domainToMappings = {
    "dev._base_domain_": mapFrom("/_api/evalforge-backend"),
    "api._api_base_domain_": mapFrom("/evalforge-backend"),
    "bo._base_domain_": mapFrom("/_api/evalforge-backend"),
    "wixbo.ai": mapFrom("/_api/evalforge-backend"),
    "wix-bo.com": mapFrom("/_api/evalforge-backend"),
    "manage._base_domain_": mapFrom("/_api/evalforge-backend")
  };
  // Spread instead of Object.assign(opts, ...) so the caller's object is left untouched.
  return resolveUrl({ ...opts, domainToMappings });
}
|
|
6405
|
+
/**
 * Builds the request factory for
 * `wix.evalforge.v1.SiteProvisioningService.ProvisionScenarioSite`.
 *
 * @param {object} payload - Request payload; it is serialized with the
 *   request serializer each time the returned factory is invoked.
 * @returns {Function} A function taking `{ host }` and returning the request
 *   metadata (entity FQDN, HTTP method, method FQN, resolved URL, serialized
 *   data and response transformer). The function also carries `fromReq` and
 *   `__isAmbassador = true`.
 */
function provisionScenarioSite(payload) {
  const { toJSON: toReq, fromJSON: fromReq } = serializer(_provisionScenarioSiteRequest, {});
  const { fromJSON: fromRes } = serializer(_provisionScenarioSiteResponse, {});
  function __provisionScenarioSite({ host }) {
    const serializedData = toReq(payload);
    return {
      entityFqdn: "wix.evalforge.v1.site_provisioning",
      method: "POST",
      methodFqn: "wix.evalforge.v1.SiteProvisioningService.ProvisionScenarioSite",
      migrationOptions: {
        optInTransformResponse: true
      },
      url: resolveWixEvalforgeV1SiteProvisioningServiceUrl({
        protoPath: "/v1/projects/{projectId}/site-provisioning/provision-site",
        data: serializedData,
        host
      }),
      data: serializedData,
      transformResponse: fromRes
    };
  }
  __provisionScenarioSite.fromReq = fromReq;
  __provisionScenarioSite.__isAmbassador = true;
  return __provisionScenarioSite;
}
|
|
6432
|
+
/**
 * Builds the request factory for
 * `wix.evalforge.v1.SiteProvisioningService.DeleteProvisionedSite`.
 *
 * @param {object} payload - Request payload; it is serialized with the
 *   request serializer each time the returned factory is invoked.
 * @returns {Function} A function taking `{ host }` and returning the request
 *   metadata (entity FQDN, HTTP method, method FQN, resolved URL, serialized
 *   data and response transformer). The function also carries `fromReq` and
 *   `__isAmbassador = true`.
 */
function deleteProvisionedSite(payload) {
  const { toJSON: toReq, fromJSON: fromReq } = serializer(_deleteProvisionedSiteRequest, {});
  const { fromJSON: fromRes } = serializer(_deleteProvisionedSiteResponse, {});
  function __deleteProvisionedSite({ host }) {
    const serializedData = toReq(payload);
    return {
      entityFqdn: "wix.evalforge.v1.site_provisioning",
      method: "POST",
      methodFqn: "wix.evalforge.v1.SiteProvisioningService.DeleteProvisionedSite",
      migrationOptions: {
        optInTransformResponse: true
      },
      url: resolveWixEvalforgeV1SiteProvisioningServiceUrl({
        protoPath: "/v1/projects/{projectId}/site-provisioning/delete-site",
        data: serializedData,
        host
      }),
      data: serializedData,
      transformResponse: fromRes
    };
  }
  __deleteProvisionedSite.fromReq = fromReq;
  __deleteProvisionedSite.__isAmbassador = true;
  return __deleteProvisionedSite;
}
|
|
6459
|
+
|
|
6460
|
+
// ../../node_modules/@wix/ambassador-evalforge-v1-test-scenario/es/build/types.impl.js
|
|
6461
|
+
// String enum of webhook identity types; every member's value equals its name.
var WebhookIdentityType = {
  UNKNOWN: "UNKNOWN",
  ANONYMOUS_VISITOR: "ANONYMOUS_VISITOR",
  MEMBER: "MEMBER",
  WIX_USER: "WIX_USER",
  APP: "APP"
};
|
|
6469
|
+
// String enum of HTTP methods allowed for site bootstrap steps; every
// member's value equals its name.
var SiteBootstrapHttpMethod = {
  SITE_BOOTSTRAP_HTTP_METHOD_UNSPECIFIED: "SITE_BOOTSTRAP_HTTP_METHOD_UNSPECIFIED",
  GET: "GET",
  POST: "POST",
  PUT: "PUT",
  PATCH: "PATCH",
  DELETE: "DELETE"
};
|
|
6478
|
+
// String enum of site-setup modes; every member's value equals its name.
var Mode = {
  UNKNOWN_MODE: "UNKNOWN_MODE",
  NONE: "NONE",
  CLONE: "CLONE",
  TEMPLATE: "TEMPLATE"
};
|
|
6485
|
+
|
|
6359
6486
|
// src/ambassador-converters.ts
|
|
6360
6487
|
function toProtoEnum(prefix, value) {
|
|
6361
6488
|
return `${prefix}${value.toUpperCase()}`;
|
|
@@ -6662,9 +6789,37 @@ function testScenarioFromProto(wire) {
|
|
|
6662
6789
|
})
|
|
6663
6790
|
),
|
|
6664
6791
|
createdAt: fromProtoDate(wire.createdAt) ?? "",
|
|
6665
|
-
updatedAt: fromProtoDate(wire.updatedAt) ?? ""
|
|
6792
|
+
updatedAt: fromProtoDate(wire.updatedAt) ?? "",
|
|
6793
|
+
siteSetup: siteSetupFromAmbassador(wire.siteSetup)
|
|
6666
6794
|
};
|
|
6667
6795
|
}
|
|
6796
|
+
/**
 * Converts a wire-format `siteSetup` object into the evaluator's shape.
 *
 * Only the CLONE and TEMPLATE modes produce a result; a missing `wire` or any
 * other mode yields `undefined`. Bootstrap steps are kept only when they have
 * a URL and a method other than the UNSPECIFIED placeholder; the method is
 * lower-cased. When no step survives, `bootstrap` is `undefined`.
 *
 * @param {object|null|undefined} wire - Wire-format site setup.
 * @returns {{mode: "clone", sourceSiteId: string, bootstrap: (object|undefined)}
 *   | {mode: "template", templateId: string, bootstrap: (object|undefined)}
 *   | undefined}
 */
function siteSetupFromAmbassador(wire) {
  if (!wire) return undefined;
  const isClone = wire.mode === Mode.CLONE;
  const isTemplate = wire.mode === Mode.TEMPLATE;
  // Nothing to convert for other modes, so skip the step processing entirely.
  if (!isClone && !isTemplate) return undefined;

  const steps = (wire.bootstrap?.steps ?? [])
    .filter(
      // `step?.` tolerates null/undefined entries instead of throwing on them.
      (step) => step?.method && step.method !== SiteBootstrapHttpMethod.SITE_BOOTSTRAP_HTTP_METHOD_UNSPECIFIED && step.url
    )
    .map((step) => ({
      label: step.label ?? undefined,
      method: step.method.toLowerCase(),
      url: step.url, // guaranteed truthy by the filter above
      body: step.body ?? undefined
    }));
  const bootstrap = steps.length > 0 ? { steps } : undefined;

  if (isClone) {
    return {
      mode: "clone",
      sourceSiteId: wire.cloneOptions?.sourceSiteId ?? "",
      bootstrap
    };
  }
  return {
    mode: "template",
    templateId: wire.templateOptions?.templateId ?? "",
    bootstrap
  };
}
|
|
6668
6823
|
function templateFromProto(wire) {
|
|
6669
6824
|
return {
|
|
6670
6825
|
id: wire.id ?? "",
|
|
@@ -6718,8 +6873,36 @@ function capabilityVersionFromProto(wire, projectId2) {
|
|
|
6718
6873
|
createdAt: fromProtoDate(wire.createdAt) ?? ""
|
|
6719
6874
|
};
|
|
6720
6875
|
}
|
|
6876
|
+
/**
 * Maps a wire-format provisioned site to the evaluator's shape.
 *
 * @param {object} proto - Wire-format site.
 * @returns {{id: string, url: (string|undefined), editorUrl: (string|undefined)}}
 *   `id` falls back to an empty string; a nullish `url` / `editorUrl`
 *   becomes `undefined`.
 */
function provisionedSiteFromProto(proto) {
  const { id, url, editorUrl } = proto;
  return {
    id: id ?? "",
    url: url ?? undefined,
    editorUrl: editorUrl ?? undefined
  };
}
|
|
6883
|
+
/**
 * Maps a wire-format site bootstrap result to the evaluator's shape.
 *
 * @param {object|null|undefined} proto - Wire-format bootstrap result.
 * @returns {{steps: Array<{label: (string|undefined), statusCode: number,
 *   ok: boolean, error: (string|undefined)}>}|undefined} `undefined` when
 *   `proto` is falsy; otherwise one entry per wire step, with `statusCode`
 *   defaulting to 0 and `ok` to false.
 */
function siteBootstrapResultFromProto(proto) {
  if (!proto) {
    return undefined;
  }
  const toStepResult = ({ label, statusCode, ok, error }) => ({
    label: label ?? undefined,
    statusCode: statusCode ?? 0,
    ok: ok ?? false,
    error: error ?? undefined
  });
  const wireSteps = proto.steps ?? [];
  return { steps: wireSteps.map((step) => toStepResult(step)) };
}
|
|
6721
6894
|
|
|
6722
6895
|
// src/api-client.ts
|
|
6896
|
+
/**
 * Re-throws a failed request with a more descriptive message. Never returns.
 *
 * An `HttpError` is wrapped in a new `Error` whose message names the action
 * and, when available, the HTTP status and request id. The original error is
 * attached as `cause` so its stack and response stay reachable. Any other
 * error is re-thrown unchanged.
 *
 * @param {unknown} err - The error caught from the request.
 * @param {string} action - Description inserted after "Failed to ".
 * @throws {Error} Always.
 */
function rethrowWithRequestId(err, action) {
  if (!(err instanceof import_http_client.HttpError)) {
    throw err;
  }
  const status = err.response?.status;
  const requestId = err.requestId;
  const statusPart = status !== undefined ? ` (HTTP ${status})` : "";
  const requestIdPart = requestId ? ` [request id: ${requestId}]` : "";
  throw new Error(`Failed to ${action}${statusPart}${requestIdPart}: ${err.message}`, { cause: err });
}
|
|
6723
6906
|
function resolveAmbassadorBaseUrl(serverUrl) {
|
|
6724
6907
|
try {
|
|
6725
6908
|
return new URL(serverUrl).origin;
|
|
@@ -6895,6 +7078,29 @@ function createApiClient(serverUrl, options = "") {
|
|
|
6895
7078
|
},
|
|
6896
7079
|
updateEvalRun(projectId2, evalRunId2, update) {
|
|
6897
7080
|
return putJson(`/projects/${projectId2}/eval-runs/${evalRunId2}`, update);
|
|
7081
|
+
},
|
|
7082
|
+
async provisionScenarioSite(projectId2, evalRunId2, scenarioId) {
|
|
7083
|
+
const res = await httpClient.request(provisionScenarioSite({ projectId: projectId2, evalRunId: evalRunId2, scenarioId })).catch(
|
|
7084
|
+
(err) => rethrowWithRequestId(err, `provision a site for scenario ${scenarioId}`)
|
|
7085
|
+
);
|
|
7086
|
+
const site = res.data.site;
|
|
7087
|
+
if (!site) {
|
|
7088
|
+
throw new Error(
|
|
7089
|
+
`Site provisioning for scenario ${scenarioId} returned no site.`
|
|
7090
|
+
);
|
|
7091
|
+
}
|
|
7092
|
+
return {
|
|
7093
|
+
...provisionedSiteFromProto(site),
|
|
7094
|
+
bootstrapResult: siteBootstrapResultFromProto(res.data.bootstrapResult)
|
|
7095
|
+
};
|
|
7096
|
+
},
|
|
7097
|
+
async deleteProvisionedSite(projectId2, siteId) {
|
|
7098
|
+
await httpClient.request(deleteProvisionedSite({ projectId: projectId2, siteId })).catch((err) => {
|
|
7099
|
+
console.warn(
|
|
7100
|
+
"[site-provisioning] deleteProvisionedSite failed \u2014 site may remain:",
|
|
7101
|
+
err
|
|
7102
|
+
);
|
|
7103
|
+
});
|
|
6898
7104
|
}
|
|
6899
7105
|
};
|
|
6900
7106
|
}
|
|
@@ -7147,8 +7353,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
7147
7353
|
import {
|
|
7148
7354
|
AssertionResultStatus,
|
|
7149
7355
|
DEFAULT_JUDGE_MODEL,
|
|
7150
|
-
LiveTraceEventType as LiveTraceEventType4
|
|
7151
|
-
formatTraceEventLine
|
|
7356
|
+
LiveTraceEventType as LiveTraceEventType4
|
|
7152
7357
|
} from "@wix/evalforge-types";
|
|
7153
7358
|
import {
|
|
7154
7359
|
evaluateAssertions as evaluateAssertionsBase
|
|
@@ -7279,10 +7484,13 @@ function installWithCache(workDir, exec, cacheBase, pm) {
|
|
|
7279
7484
|
);
|
|
7280
7485
|
}
|
|
7281
7486
|
}
|
|
7282
|
-
async function installDependencies(workDir,
|
|
7487
|
+
async function installDependencies(workDir, onProgress, options = {}) {
|
|
7283
7488
|
if (!existsSync(path.join(workDir, "package.json"))) {
|
|
7284
7489
|
return;
|
|
7285
7490
|
}
|
|
7491
|
+
const exec = options.exec ?? execFileSync;
|
|
7492
|
+
const cacheBase = options.cacheBase;
|
|
7493
|
+
onProgress("Installing dependencies...");
|
|
7286
7494
|
const pm = detectPackageManager(workDir);
|
|
7287
7495
|
if (cacheBase) {
|
|
7288
7496
|
installWithCache(workDir, exec, cacheBase, pm);
|
|
@@ -7358,7 +7566,8 @@ function writeWixEnvFile(workDir) {
|
|
|
7358
7566
|
console.warn("[environment] Failed to read wix.config.json");
|
|
7359
7567
|
}
|
|
7360
7568
|
}
|
|
7361
|
-
async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
7569
|
+
async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId, onProgress, options = {}) {
|
|
7570
|
+
const template = options.template;
|
|
7362
7571
|
const baseDir = config.evaluationsDir ?? path2.join(tmpdir(), "evalforge-evaluations");
|
|
7363
7572
|
const nodeModulesCacheDir = path2.join(baseDir, "_node_modules_cache");
|
|
7364
7573
|
if (template) {
|
|
@@ -7372,10 +7581,14 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
7372
7581
|
rmSync(workDir2, { recursive: true });
|
|
7373
7582
|
}
|
|
7374
7583
|
mkdirSync2(workDir2, { recursive: true });
|
|
7584
|
+
onProgress("Fetching template files...");
|
|
7375
7585
|
await fetchAndWriteTemplateFiles(template, workDir2);
|
|
7376
7586
|
console.log(`Template files written to ${workDir2}`);
|
|
7377
7587
|
writeWixEnvFile(workDir2);
|
|
7378
|
-
await installDependencies(workDir2,
|
|
7588
|
+
await installDependencies(workDir2, onProgress, {
|
|
7589
|
+
cacheBase: nodeModulesCacheDir
|
|
7590
|
+
});
|
|
7591
|
+
onProgress("Environment ready");
|
|
7379
7592
|
return workDir2;
|
|
7380
7593
|
}
|
|
7381
7594
|
const workDir = path2.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
|
|
@@ -7387,6 +7600,13 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
|
|
|
7387
7600
|
return workDir;
|
|
7388
7601
|
}
|
|
7389
7602
|
|
|
7603
|
+
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7604
|
+
import { TRACE_EVENT_PREFIX } from "@wix/evalforge-types";
|
|
7605
|
+
/**
 * Writes a trace event to stdout as `TRACE_EVENT_PREFIX` followed by the
 * event's JSON, then hands the same event to `pushEvent` when one is given.
 *
 * @param {object} event - Trace event to emit.
 * @param {Function} [pushEvent] - Optional sink called with the event.
 */
function emitTraceEvent(event, pushEvent) {
  const serialized = JSON.stringify(event);
  console.log(`${TRACE_EVENT_PREFIX}${serialized}`);
  if (pushEvent != null) {
    pushEvent(event);
  }
}
|
|
7609
|
+
|
|
7390
7610
|
// src/run-scenario/run-agent-with-context.ts
|
|
7391
7611
|
import { randomUUID as randomUUID4 } from "crypto";
|
|
7392
7612
|
|
|
@@ -7845,13 +8065,6 @@ function buildConversation(timestampedMessages) {
|
|
|
7845
8065
|
return messages;
|
|
7846
8066
|
}
|
|
7847
8067
|
|
|
7848
|
-
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7849
|
-
import { TRACE_EVENT_PREFIX } from "@wix/evalforge-types";
|
|
7850
|
-
function emitTraceEvent(event, pushEvent) {
|
|
7851
|
-
console.log(`${TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
7852
|
-
pushEvent?.(event);
|
|
7853
|
-
}
|
|
7854
|
-
|
|
7855
8068
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7856
8069
|
var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
7857
8070
|
async function* buildPromptStream(triggerPrompt, images) {
|
|
@@ -11836,87 +12049,124 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
11836
12049
|
}
|
|
11837
12050
|
|
|
11838
12051
|
// src/run-scenario/index.ts
|
|
11839
|
-
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions, pushEvent) {
|
|
12052
|
+
/**
 * Runs a single scenario end to end: optionally provisions a site, prepares
 * the working directory, runs the agent, evaluates assertions and finally
 * deletes the provisioned site.
 *
 * A site is provisioned only when `apiClient` and `projectId2` are given and
 * `scenario.siteSetup` exists with a mode other than "none". When a site was
 * provisioned, its id is appended to the trigger prompt handed to the agent.
 * A failed bootstrap step is reported (console warning plus a progress event
 * via `pushEvent`) but does not abort the run.
 *
 * Everything after provisioning runs inside `try`/`finally`, so the site is
 * deleted even if reporting, setup, the agent or the assertions throw.
 * Deletion is best-effort: a failure there is logged and never replaces the
 * scenario's own result or error.
 *
 * @param {object} config - Evaluator config; forwarded to the setup and agent
 *   steps, and read for `aiGatewayUrl` / `aiGatewayHeaders`.
 * @param {string} evalRunId2 - Id of the eval run.
 * @param {object} scenario - Scenario to run (`id`, `name`, `triggerPrompt`,
 *   optional `assertions` and `siteSetup`).
 * @param {object} evalData - Eval run data used to derive the target id/name.
 * @param {object} [template] - Template forwarded to `prepareWorkingDirectory`.
 * @param {Array} [resolvedAssertions] - Assertions appended to the inline ones.
 * @param {Function} [pushEvent] - Optional sink for live trace events.
 * @param {object} [apiClient] - Client exposing `provisionScenarioSite` and
 *   `deleteProvisionedSite`.
 * @param {string} [projectId2] - Project id used for site provisioning.
 * @returns {Promise<object>} The agent's partial result extended with
 *   `assertionResults`, `passed`, `failed`, `passRate` and `provisionedSite`.
 */
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions, pushEvent, apiClient, projectId2) {
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
  // Single source for the setup-phase progress event shape.
  const buildSetupEvent = (outputPreview) => ({
    evalRunId: evalRunId2,
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    targetId,
    targetName,
    stepNumber: 0,
    type: LiveTraceEventType4.PROGRESS,
    outputPreview,
    elapsedMs: 0,
    timestamp: new Date().toISOString(),
    isComplete: false
  });
  const emitSetupProgress = (outputPreview) => emitTraceEvent(buildSetupEvent(outputPreview), pushEvent);

  let provisionedSite;
  if (apiClient && projectId2 && scenario.siteSetup && scenario.siteSetup.mode !== "none") {
    provisionedSite = await apiClient.provisionScenarioSite(
      projectId2,
      evalRunId2,
      scenario.id
    );
  }
  try {
    const failedStep = provisionedSite?.bootstrapResult?.steps.find((s) => !s.ok);
    if (failedStep) {
      const message = `Site bootstrap step ${failedStep.label ? `"${failedStep.label}" ` : ""}failed (HTTP ${failedStep.statusCode}): ${failedStep.error ?? "unknown error"}`;
      console.warn(`[run-scenario] ${message}`);
      pushEvent?.(buildSetupEvent(message));
    }
    const effectiveTriggerPrompt = provisionedSite
      ? `${scenario.triggerPrompt}\n\nSite ID: ${provisionedSite.id}`
      : scenario.triggerPrompt;

    emitSetupProgress("Setting up environment...");
    const workDir = await prepareWorkingDirectory(
      config,
      evalRunId2,
      targetId,
      scenario.id,
      emitSetupProgress,
      { template }
    );
    const partialResult = await runAgentWithContext(
      config,
      evalRunId2,
      { ...scenario, triggerPrompt: effectiveTriggerPrompt },
      evalData,
      workDir,
      pushEvent
    );

    const inlineAssertions = scenario.assertions ?? [];
    const assertions = [
      ...inlineAssertions,
      ...resolvedAssertions ?? []
    ];
    const templateFilesMap = new Map(
      (partialResult.templateFiles ?? []).map((f) => [f.path, f.status])
    );
    const evaluationInput = {
      outputText: partialResult.outputText,
      llmTrace: partialResult.llmTrace,
      fileDiffs: partialResult.fileDiffs?.map((d) => ({
        path: d.path,
        status: templateFilesMap.get(d.path)
      })),
      durationMs: partialResult.duration
    };
    const defaultJudgeModel = DEFAULT_JUDGE_MODEL;
    const assertionContext = {
      workDir,
      defaultJudgeModel,
      llmConfig: {
        baseUrl: config.aiGatewayUrl,
        headers: config.aiGatewayHeaders
      }
    };
    const assertionResults = assertions.length > 0 ? await evaluateAssertionsBase(
      evaluationInput,
      assertions,
      assertionContext
    ) : [];
    const passed = assertionResults.filter(
      (r) => r.status === AssertionResultStatus.PASSED
    ).length;
    const failed = assertionResults.filter(
      (r) => r.status === AssertionResultStatus.FAILED
    ).length;
    const total = assertionResults.length;
    // With no assertions at all the scenario counts as fully passing.
    const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
    return {
      ...partialResult,
      assertionResults,
      passed,
      failed,
      passRate,
      provisionedSite
    };
  } finally {
    if (provisionedSite && apiClient && projectId2) {
      try {
        await apiClient.deleteProvisionedSite(projectId2, provisionedSite.id);
      } catch (cleanupErr) {
        // Cleanup must not replace the scenario's own result or error.
        console.warn(
          `[run-scenario] Failed to delete provisioned site ${provisionedSite.id}:`,
          cleanupErr
        );
      }
    }
  }
}
|
|
11921
12171
|
|
|
11922
12172
|
// src/evaluation-loop.ts
|
|
@@ -12196,7 +12446,9 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
12196
12446
|
evalData,
|
|
12197
12447
|
template,
|
|
12198
12448
|
resolvedAssertions,
|
|
12199
|
-
pushEvent
|
|
12449
|
+
pushEvent,
|
|
12450
|
+
api,
|
|
12451
|
+
projectId2
|
|
12200
12452
|
);
|
|
12201
12453
|
},
|
|
12202
12454
|
addResult: async (result) => {
|