@wix/evalforge-evaluator 0.120.0 → 0.122.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +16 -93
- package/build/index.js.map +2 -2
- package/build/index.mjs +16 -93
- package/build/index.mjs.map +2 -2
- package/build/types/api-client.d.ts +1 -2
- package/build/types/fetch-evaluation-data.d.ts +1 -5
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -163,9 +163,6 @@ function createApiClient(serverUrl, options = "") {
|
|
|
163
163
|
getPreset(projectId2, id) {
|
|
164
164
|
return fetchJson(`/projects/${projectId2}/presets/${id}`);
|
|
165
165
|
},
|
|
166
|
-
getAssertion(projectId2, id) {
|
|
167
|
-
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
168
|
-
},
|
|
169
166
|
addResult(projectId2, evalRunId2, result) {
|
|
170
167
|
return postJson(
|
|
171
168
|
`/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
|
|
@@ -230,9 +227,6 @@ function resolveValue(value, placeholders) {
|
|
|
230
227
|
}
|
|
231
228
|
return value;
|
|
232
229
|
}
|
|
233
|
-
function resolvePlaceholdersInString(text, placeholders) {
|
|
234
|
-
return resolveValue(text, placeholders);
|
|
235
|
-
}
|
|
236
230
|
|
|
237
231
|
// src/fetch-evaluation-data.ts
|
|
238
232
|
function parseSkillNamesFromParams(value) {
|
|
@@ -245,59 +239,6 @@ function parseSkillNamesFromParams(value) {
|
|
|
245
239
|
}
|
|
246
240
|
return [];
|
|
247
241
|
}
|
|
248
|
-
function applyParamsToAssertion(assertion, params) {
|
|
249
|
-
if (!params || Object.keys(params).length === 0) {
|
|
250
|
-
return assertion;
|
|
251
|
-
}
|
|
252
|
-
if (assertion.type === "llm_judge") {
|
|
253
|
-
const stringParams = {};
|
|
254
|
-
for (const [key, value] of Object.entries(params)) {
|
|
255
|
-
stringParams[key] = String(value ?? "");
|
|
256
|
-
}
|
|
257
|
-
const prompt = resolvePlaceholdersInString(assertion.prompt, stringParams);
|
|
258
|
-
return {
|
|
259
|
-
...assertion,
|
|
260
|
-
prompt,
|
|
261
|
-
...params.model !== void 0 && { model: params.model },
|
|
262
|
-
...params.maxTokens !== void 0 && {
|
|
263
|
-
maxTokens: params.maxTokens
|
|
264
|
-
},
|
|
265
|
-
...params.temperature !== void 0 && {
|
|
266
|
-
temperature: params.temperature
|
|
267
|
-
},
|
|
268
|
-
...params.minScore !== void 0 && {
|
|
269
|
-
minScore: params.minScore
|
|
270
|
-
}
|
|
271
|
-
};
|
|
272
|
-
}
|
|
273
|
-
if (assertion.type === "time_limit" && params.maxDurationMs !== void 0) {
|
|
274
|
-
return {
|
|
275
|
-
...assertion,
|
|
276
|
-
maxDurationMs: params.maxDurationMs
|
|
277
|
-
};
|
|
278
|
-
}
|
|
279
|
-
if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
|
|
280
|
-
return {
|
|
281
|
-
...assertion,
|
|
282
|
-
skillNames: parseSkillNamesFromParams(params.skillNames)
|
|
283
|
-
};
|
|
284
|
-
}
|
|
285
|
-
if (assertion.type === "tool_called_with_param") {
|
|
286
|
-
return {
|
|
287
|
-
...assertion,
|
|
288
|
-
...params.toolName !== void 0 && {
|
|
289
|
-
toolName: params.toolName
|
|
290
|
-
},
|
|
291
|
-
...params.expectedParams !== void 0 && {
|
|
292
|
-
expectedParams: params.expectedParams
|
|
293
|
-
},
|
|
294
|
-
...params.requireSuccess !== void 0 && {
|
|
295
|
-
requireSuccess: params.requireSuccess
|
|
296
|
-
}
|
|
297
|
-
};
|
|
298
|
-
}
|
|
299
|
-
return { ...assertion, ...params };
|
|
300
|
-
}
|
|
301
242
|
function resolveSystemAssertion(assertionId, params) {
|
|
302
243
|
const systemAssertion = SYSTEM_ASSERTIONS[assertionId];
|
|
303
244
|
let baseAssertion;
|
|
@@ -352,18 +293,6 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
352
293
|
}
|
|
353
294
|
return baseAssertion;
|
|
354
295
|
}
|
|
355
|
-
function customAssertionToAssertion(ca, params) {
|
|
356
|
-
const config = ca.config;
|
|
357
|
-
const baseAssertion = {
|
|
358
|
-
type: "llm_judge",
|
|
359
|
-
prompt: config?.prompt ?? "",
|
|
360
|
-
minScore: config?.minScore,
|
|
361
|
-
model: config?.model,
|
|
362
|
-
maxTokens: config?.maxTokens,
|
|
363
|
-
temperature: config?.temperature
|
|
364
|
-
};
|
|
365
|
-
return applyParamsToAssertion(baseAssertion, params);
|
|
366
|
-
}
|
|
367
296
|
async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
368
297
|
const evalRun = await api.getEvalRun(projectId2, evalRunId2);
|
|
369
298
|
const scenarios = await Promise.all(
|
|
@@ -433,30 +362,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
433
362
|
templateIds.map((id) => api.getTemplate(projectId2, id))
|
|
434
363
|
) : [];
|
|
435
364
|
const templateMap = new Map(templates.map((t) => [t.id, t]));
|
|
436
|
-
const assertionIds = [
|
|
437
|
-
...new Set(
|
|
438
|
-
scenarios.flatMap((s) => s.assertionLinks ?? []).map((link) => link.assertionId).filter((id) => !isSystemAssertionId(id))
|
|
439
|
-
)
|
|
440
|
-
];
|
|
441
|
-
const assertions = assertionIds.length > 0 ? await Promise.all(
|
|
442
|
-
assertionIds.map((id) => api.getAssertion(projectId2, id))
|
|
443
|
-
) : [];
|
|
444
|
-
const assertionMap = new Map(assertions.map((a) => [a.id, a]));
|
|
445
365
|
const scenarioItems = scenarios.map((scenario) => {
|
|
446
366
|
const resolvedAssertions = (scenario.assertionLinks ?? []).map((link) => {
|
|
447
367
|
const { assertionId, params } = link;
|
|
448
|
-
if (isSystemAssertionId(assertionId)) {
|
|
449
|
-
return resolveSystemAssertion(
|
|
450
|
-
assertionId,
|
|
451
|
-
params
|
|
452
|
-
);
|
|
453
|
-
}
|
|
454
|
-
const customAssertion = assertionMap.get(assertionId);
|
|
455
|
-
if (!customAssertion) {
|
|
368
|
+
if (!isSystemAssertionId(assertionId)) {
|
|
456
369
|
return null;
|
|
457
370
|
}
|
|
458
|
-
return
|
|
459
|
-
|
|
371
|
+
return resolveSystemAssertion(
|
|
372
|
+
assertionId,
|
|
460
373
|
params
|
|
461
374
|
);
|
|
462
375
|
}).filter((a) => a !== null);
|
|
@@ -2693,6 +2606,7 @@ import { writeFile as writeFile6, mkdir as mkdir7 } from "fs/promises";
|
|
|
2693
2606
|
import { join as join8 } from "path";
|
|
2694
2607
|
var KILL_GRACE_PERIOD_MS = 5e3;
|
|
2695
2608
|
var IDLE_TIMEOUT_MS = 12e4;
|
|
2609
|
+
var TOOL_RUNNING_IDLE_TIMEOUT_MS = 36e4;
|
|
2696
2610
|
var IDLE_CHECK_INTERVAL_MS = 15e3;
|
|
2697
2611
|
function extractToolAction(toolName, args) {
|
|
2698
2612
|
if (!toolName) return "Using tool...";
|
|
@@ -2848,6 +2762,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2848
2762
|
let lastAction = "Starting...";
|
|
2849
2763
|
let lastToolName;
|
|
2850
2764
|
let lastFilePath;
|
|
2765
|
+
let isToolRunning = false;
|
|
2851
2766
|
if (traceContext) {
|
|
2852
2767
|
emitTraceEvent(
|
|
2853
2768
|
{
|
|
@@ -3048,15 +2963,16 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3048
2963
|
timers.idleCheck = setInterval(() => {
|
|
3049
2964
|
if (resolved) return;
|
|
3050
2965
|
const idleTime = Date.now() - lastOutputTime;
|
|
3051
|
-
|
|
2966
|
+
const effectiveTimeout = isToolRunning ? TOOL_RUNNING_IDLE_TIMEOUT_MS : IDLE_TIMEOUT_MS;
|
|
2967
|
+
if (idleTime >= effectiveTimeout) {
|
|
3052
2968
|
console.warn(
|
|
3053
|
-
`[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
|
|
2969
|
+
`[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s (tool running: ${isToolRunning}). Killing process.`
|
|
3054
2970
|
);
|
|
3055
2971
|
killProcess(child, resolved);
|
|
3056
2972
|
finalize(
|
|
3057
2973
|
false,
|
|
3058
2974
|
new Error(
|
|
3059
|
-
`OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenario.name}`
|
|
2975
|
+
`OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout, tool running: ${isToolRunning}). Skills: ${skillNames}, Scenario: ${scenario.name}`
|
|
3060
2976
|
)
|
|
3061
2977
|
);
|
|
3062
2978
|
}
|
|
@@ -3117,6 +3033,13 @@ Stderr: ${stderr.slice(0, 1e3)}`
|
|
|
3117
3033
|
const evt = tryParseJson(line);
|
|
3118
3034
|
if (!evt || !evt.type) continue;
|
|
3119
3035
|
allEvents.push({ event: evt, receivedAt: Date.now() });
|
|
3036
|
+
if (evt.type === "tool_use") {
|
|
3037
|
+
const tu = evt;
|
|
3038
|
+
const status = tu.part.state.status;
|
|
3039
|
+
isToolRunning = status !== "completed" && status !== "error";
|
|
3040
|
+
} else {
|
|
3041
|
+
isToolRunning = false;
|
|
3042
|
+
}
|
|
3120
3043
|
if (traceContext) {
|
|
3121
3044
|
traceStepNumber++;
|
|
3122
3045
|
const traceEvt = createTraceEventFromNdjson(
|