@wix/evalforge-evaluator 0.120.0 → 0.122.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -163,9 +163,6 @@ function createApiClient(serverUrl, options = "") {
163
163
  getPreset(projectId2, id) {
164
164
  return fetchJson(`/projects/${projectId2}/presets/${id}`);
165
165
  },
166
- getAssertion(projectId2, id) {
167
- return fetchJson(`/projects/${projectId2}/assertions/${id}`);
168
- },
169
166
  addResult(projectId2, evalRunId2, result) {
170
167
  return postJson(
171
168
  `/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
@@ -230,9 +227,6 @@ function resolveValue(value, placeholders) {
230
227
  }
231
228
  return value;
232
229
  }
233
- function resolvePlaceholdersInString(text, placeholders) {
234
- return resolveValue(text, placeholders);
235
- }
236
230
 
237
231
  // src/fetch-evaluation-data.ts
238
232
  function parseSkillNamesFromParams(value) {
@@ -245,59 +239,6 @@ function parseSkillNamesFromParams(value) {
245
239
  }
246
240
  return [];
247
241
  }
248
- function applyParamsToAssertion(assertion, params) {
249
- if (!params || Object.keys(params).length === 0) {
250
- return assertion;
251
- }
252
- if (assertion.type === "llm_judge") {
253
- const stringParams = {};
254
- for (const [key, value] of Object.entries(params)) {
255
- stringParams[key] = String(value ?? "");
256
- }
257
- const prompt = resolvePlaceholdersInString(assertion.prompt, stringParams);
258
- return {
259
- ...assertion,
260
- prompt,
261
- ...params.model !== void 0 && { model: params.model },
262
- ...params.maxTokens !== void 0 && {
263
- maxTokens: params.maxTokens
264
- },
265
- ...params.temperature !== void 0 && {
266
- temperature: params.temperature
267
- },
268
- ...params.minScore !== void 0 && {
269
- minScore: params.minScore
270
- }
271
- };
272
- }
273
- if (assertion.type === "time_limit" && params.maxDurationMs !== void 0) {
274
- return {
275
- ...assertion,
276
- maxDurationMs: params.maxDurationMs
277
- };
278
- }
279
- if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
280
- return {
281
- ...assertion,
282
- skillNames: parseSkillNamesFromParams(params.skillNames)
283
- };
284
- }
285
- if (assertion.type === "tool_called_with_param") {
286
- return {
287
- ...assertion,
288
- ...params.toolName !== void 0 && {
289
- toolName: params.toolName
290
- },
291
- ...params.expectedParams !== void 0 && {
292
- expectedParams: params.expectedParams
293
- },
294
- ...params.requireSuccess !== void 0 && {
295
- requireSuccess: params.requireSuccess
296
- }
297
- };
298
- }
299
- return { ...assertion, ...params };
300
- }
301
242
  function resolveSystemAssertion(assertionId, params) {
302
243
  const systemAssertion = SYSTEM_ASSERTIONS[assertionId];
303
244
  let baseAssertion;
@@ -352,18 +293,6 @@ function resolveSystemAssertion(assertionId, params) {
352
293
  }
353
294
  return baseAssertion;
354
295
  }
355
- function customAssertionToAssertion(ca, params) {
356
- const config = ca.config;
357
- const baseAssertion = {
358
- type: "llm_judge",
359
- prompt: config?.prompt ?? "",
360
- minScore: config?.minScore,
361
- model: config?.model,
362
- maxTokens: config?.maxTokens,
363
- temperature: config?.temperature
364
- };
365
- return applyParamsToAssertion(baseAssertion, params);
366
- }
367
296
  async function fetchEvaluationData(api, projectId2, evalRunId2) {
368
297
  const evalRun = await api.getEvalRun(projectId2, evalRunId2);
369
298
  const scenarios = await Promise.all(
@@ -433,30 +362,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
433
362
  templateIds.map((id) => api.getTemplate(projectId2, id))
434
363
  ) : [];
435
364
  const templateMap = new Map(templates.map((t) => [t.id, t]));
436
- const assertionIds = [
437
- ...new Set(
438
- scenarios.flatMap((s) => s.assertionLinks ?? []).map((link) => link.assertionId).filter((id) => !isSystemAssertionId(id))
439
- )
440
- ];
441
- const assertions = assertionIds.length > 0 ? await Promise.all(
442
- assertionIds.map((id) => api.getAssertion(projectId2, id))
443
- ) : [];
444
- const assertionMap = new Map(assertions.map((a) => [a.id, a]));
445
365
  const scenarioItems = scenarios.map((scenario) => {
446
366
  const resolvedAssertions = (scenario.assertionLinks ?? []).map((link) => {
447
367
  const { assertionId, params } = link;
448
- if (isSystemAssertionId(assertionId)) {
449
- return resolveSystemAssertion(
450
- assertionId,
451
- params
452
- );
453
- }
454
- const customAssertion = assertionMap.get(assertionId);
455
- if (!customAssertion) {
368
+ if (!isSystemAssertionId(assertionId)) {
456
369
  return null;
457
370
  }
458
- return customAssertionToAssertion(
459
- customAssertion,
371
+ return resolveSystemAssertion(
372
+ assertionId,
460
373
  params
461
374
  );
462
375
  }).filter((a) => a !== null);
@@ -2693,6 +2606,7 @@ import { writeFile as writeFile6, mkdir as mkdir7 } from "fs/promises";
2693
2606
  import { join as join8 } from "path";
2694
2607
  var KILL_GRACE_PERIOD_MS = 5e3;
2695
2608
  var IDLE_TIMEOUT_MS = 12e4;
2609
+ var TOOL_RUNNING_IDLE_TIMEOUT_MS = 36e4;
2696
2610
  var IDLE_CHECK_INTERVAL_MS = 15e3;
2697
2611
  function extractToolAction(toolName, args) {
2698
2612
  if (!toolName) return "Using tool...";
@@ -2848,6 +2762,7 @@ async function executeWithOpenCode(skills, scenario, options) {
2848
2762
  let lastAction = "Starting...";
2849
2763
  let lastToolName;
2850
2764
  let lastFilePath;
2765
+ let isToolRunning = false;
2851
2766
  if (traceContext) {
2852
2767
  emitTraceEvent(
2853
2768
  {
@@ -3048,15 +2963,16 @@ Stderr: ${stderr.slice(0, 1e3)}`
3048
2963
  timers.idleCheck = setInterval(() => {
3049
2964
  if (resolved) return;
3050
2965
  const idleTime = Date.now() - lastOutputTime;
3051
- if (idleTime >= IDLE_TIMEOUT_MS) {
2966
+ const effectiveTimeout = isToolRunning ? TOOL_RUNNING_IDLE_TIMEOUT_MS : IDLE_TIMEOUT_MS;
2967
+ if (idleTime >= effectiveTimeout) {
3052
2968
  console.warn(
3053
- `[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s. Killing process.`
2969
+ `[OpenCode] Process appears stuck - no output for ${Math.round(idleTime / 1e3)}s (tool running: ${isToolRunning}). Killing process.`
3054
2970
  );
3055
2971
  killProcess(child, resolved);
3056
2972
  finalize(
3057
2973
  false,
3058
2974
  new Error(
3059
- `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout). Skills: ${skillNames}, Scenario: ${scenario.name}`
2975
+ `OpenCode process stuck - no output for ${Math.round(idleTime / 1e3)} seconds (idle timeout, tool running: ${isToolRunning}). Skills: ${skillNames}, Scenario: ${scenario.name}`
3060
2976
  )
3061
2977
  );
3062
2978
  }
@@ -3117,6 +3033,13 @@ Stderr: ${stderr.slice(0, 1e3)}`
3117
3033
  const evt = tryParseJson(line);
3118
3034
  if (!evt || !evt.type) continue;
3119
3035
  allEvents.push({ event: evt, receivedAt: Date.now() });
3036
+ if (evt.type === "tool_use") {
3037
+ const tu = evt;
3038
+ const status = tu.part.state.status;
3039
+ isToolRunning = status !== "completed" && status !== "error";
3040
+ } else {
3041
+ isToolRunning = false;
3042
+ }
3120
3043
  if (traceContext) {
3121
3044
  traceStepNumber++;
3122
3045
  const traceEvt = createTraceEventFromNdjson(