veryfront 0.1.522 → 0.1.523
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/deno.d.ts +0 -5
- package/esm/deno.js +1 -13
- package/esm/src/agent/testing/index.d.ts +1 -1
- package/esm/src/agent/testing/index.d.ts.map +1 -1
- package/esm/src/agent/testing/index.js +1 -1
- package/esm/src/agent/testing/live-evals/index.d.ts +2 -1
- package/esm/src/agent/testing/live-evals/index.d.ts.map +1 -1
- package/esm/src/agent/testing/live-evals/index.js +2 -1
- package/esm/src/agent/testing/live-evals/request.d.ts +16 -17
- package/esm/src/agent/testing/live-evals/request.d.ts.map +1 -1
- package/esm/src/agent/testing/live-evals/runner.d.ts +124 -0
- package/esm/src/agent/testing/live-evals/runner.d.ts.map +1 -0
- package/esm/src/agent/testing/live-evals/runner.js +391 -0
- package/esm/src/utils/version-constant.d.ts +1 -1
- package/esm/src/utils/version-constant.js +1 -1
- package/package.json +1 -1
- package/src/deno.js +1 -13
- package/src/src/agent/testing/index.ts +12 -0
- package/src/src/agent/testing/live-evals/index.ts +18 -1
- package/src/src/agent/testing/live-evals/request.ts +19 -1
- package/src/src/agent/testing/live-evals/runner.ts +629 -0
- package/src/src/utils/version-constant.ts +1 -1
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
import * as dntShim from "../../../../_dnt.shims.js";
|
|
2
|
+
import { agUiSseEventTypes, buildAgUiSseTraceSignature as buildTraceSignature, getAgUiSseStringField as getStringField, parseAgUiSseResponse as parseSseResponse, } from "../../index.js";
|
|
3
|
+
import { buildFailureSuffix, buildProgressLine, containsOrderedSubsequence } from "./formatting.js";
|
|
4
|
+
import { buildLiveEvalRequestBody } from "./request.js";
|
|
5
|
+
import { createFailedEvalResult, createPassedEvalResult, createSkippedEvalResult, } from "./result.js";
|
|
6
|
+
/**
 * Picks the fetch implementation used for live eval requests.
 *
 * @param config Runner configuration; an optional `fetch` property overrides the global.
 * @returns `config.fetch` when it is neither null nor undefined, otherwise the global `fetch`.
 */
function resolveFetch(config) {
    const injectedFetch = config.fetch;
    if (injectedFetch !== undefined && injectedFetch !== null) {
        return injectedFetch;
    }
    return fetch;
}
|
|
9
|
+
/**
 * Builds the LLM-judge helpers used by live eval cases.
 *
 * @param config Runner configuration. This block reads `endpoint`, `authToken`
 *   and `enableLlmJudge`, and passes the config to `resolveFetch`.
 * @returns An object with `judgeLlm` and `withJudge`.
 */
function createLiveEvalJudgeSupport(config) {
    /**
     * Posts a grading prompt to `config.endpoint` and interprets the first
     * non-empty line of the streamed text as the verdict.
     *
     * Never throws: any error (network, timeout, parsing) is reported as a
     * failing judgment whose reason is the error message.
     *
     * @param input Object with `question`, `answer` and `criteria` strings.
     * @returns `{ pass, reason }`.
     */
    async function judgeLlm(input) {
        try {
            const body = buildLiveEvalRequestBody({
                testCaseId: "llm-judge",
                prompt: `You are an eval judge. Grade the following answer.

QUESTION: ${input.question}

ANSWER: ${input.answer}

CRITERIA: ${input.criteria}

Respond with exactly one line: PASS or FAIL followed by a brief reason.
Example: "PASS — correctly explains the pattern with accurate details"
Example: "FAIL — mentions the wrong file convention"`,
                projectId: null,
                allowedTools: [],
                forceRuntimeOverrides: true,
                maxSteps: 2,
            });
            const response = await resolveFetch(config)(config.endpoint, {
                method: "POST",
                headers: {
                    "Content-Type": "application/json",
                    Authorization: `Bearer ${config.authToken}`,
                },
                body: JSON.stringify(body),
                signal: AbortSignal.timeout(30_000),
            });
            const run = await parseSseResponse(response);
            if (run.responseStatus !== 200) {
                return { pass: false, reason: `judge returned HTTP ${run.responseStatus}` };
            }
            // The verdict is the first non-blank line of the judge's text output.
            const line = run.text
                .split("\n")
                .map((value) => value.trim())
                .find((value) => value.length > 0) ?? "";
            if (line.toUpperCase().startsWith("PASS")) {
                return { pass: true, reason: line };
            }
            return { pass: false, reason: line || "judge returned no decision" };
        }
        catch (error) {
            return {
                pass: false,
                reason: error instanceof Error ? error.message : String(error),
            };
        }
    }
    /**
     * Wraps a structural verifier so that, when it passes and
     * `config.enableLlmJudge` is truthy, the run text is also graded by `judgeLlm`.
     *
     * @param structuralVerify Verifier called with the run; returns (or resolves to)
     *   a failure message, or a falsy value on success.
     * @param judgeInput Object with the `question` and `criteria` given to the judge.
     * @returns An async verifier resolving to a failure message or null.
     */
    function withJudge(structuralVerify, judgeInput) {
        return async (run) => {
            // Await the verifier: an async verifier returns a Promise, which is always
            // truthy, so without the await a passing async check would skip the judge.
            // Awaiting a plain (non-Promise) value is harmless.
            const structuralFailure = await structuralVerify(run);
            if (structuralFailure) {
                return structuralFailure;
            }
            if (!config.enableLlmJudge) {
                return null;
            }
            const judgment = await judgeLlm({
                question: judgeInput.question,
                answer: run.text,
                criteria: judgeInput.criteria,
            });
            return judgment.pass ? null : `LLM judge: ${judgment.reason}`;
        };
    }
    return {
        judgeLlm,
        withJudge,
    };
}
|
|
81
|
+
/**
 * Creates the progress snapshot used before any stream progress has been reported.
 *
 * @returns A fresh object (with a fresh `toolStarts` array) on every call.
 */
function createInitialProgressSnapshot() {
    const emptySnapshot = {
        eventCount: 0,
        lastEventType: null,
        lastToolCallName: null,
        toolStarts: [],
        textLength: 0,
    };
    return emptySnapshot;
}
|
|
90
|
+
/**
 * Tells whether a timer handle is an object exposing a callable `unref`.
 *
 * @param value Timer handle of unknown shape (may be a number or an object).
 * @returns true only for non-null objects whose `unref` property is a function.
 */
function isUnrefableTimer(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    if (!("unref" in value)) {
        return false;
    }
    return typeof value.unref === "function";
}
|
|
94
|
+
/**
 * Calls `timer.unref()` when the handle supports it; otherwise does nothing.
 *
 * @param timer Timer handle as returned by the interval factory.
 */
function maybeUnrefTimer(timer) {
    if (!isUnrefableTimer(timer)) {
        return;
    }
    timer.unref();
}
|
|
99
|
+
/**
 * Starts a periodic progress logger for one eval case.
 *
 * Every `input.intervalMs` the latest snapshot is formatted with
 * `buildProgressLine` and passed to `input.log`.
 *
 * @param input Object with `caseId`, `startedAt`, `intervalMs` and `log`.
 * @returns `{ stop, update, getSnapshot }`: `stop` clears the interval,
 *   `update` replaces the stored snapshot, `getSnapshot` returns it.
 */
function createLiveEvalProgressReporter(input) {
    let currentSnapshot = createInitialProgressSnapshot();
    const emitProgressLine = () => {
        const line = buildProgressLine({
            caseId: input.caseId,
            startedAt: input.startedAt,
            progress: currentSnapshot,
        });
        input.log(line);
    };
    const timerHandle = dntShim.setInterval(emitProgressLine, input.intervalMs);
    maybeUnrefTimer(timerHandle);
    const stop = () => {
        clearInterval(timerHandle);
    };
    const update = (snapshot) => {
        currentSnapshot = snapshot;
    };
    const getSnapshot = () => currentSnapshot;
    return { stop, update, getSnapshot };
}
|
|
119
|
+
/**
 * Collects the artifact paths advertised in a prepared input's metadata.
 *
 * A metadata entry counts as a path when its key contains "path"
 * (case-insensitive) and its value is a non-empty string. Non-string values
 * (null, undefined, numbers, ...) are ignored instead of raising a TypeError
 * on `.length`.
 *
 * @param prepared Prepared eval input, or null/undefined.
 * @returns The distinct path values, sorted; empty when there is no metadata.
 */
function collectPreparedArtifactPaths(prepared) {
    const metadata = prepared?.metadata;
    if (!metadata) {
        return [];
    }
    // A Set de-duplicates values that are listed under several keys.
    const distinctPaths = new Set();
    for (const [key, value] of Object.entries(metadata)) {
        if (typeof value !== "string" || value.length === 0) {
            continue;
        }
        if (key.toLowerCase().includes("path")) {
            distinctPaths.add(value);
        }
    }
    return [...distinctPaths].sort();
}
|
|
129
|
+
/**
 * Reads the conversation id from a prepared input's metadata.
 *
 * @param prepared Prepared eval input, or null/undefined.
 * @returns `metadata.conversationId` when it is a non-empty string, otherwise null.
 */
function extractPreparedConversationId(prepared) {
    const candidate = prepared?.metadata?.conversationId;
    if (typeof candidate !== "string" || candidate.length === 0) {
        return null;
    }
    return candidate;
}
|
|
135
|
+
/**
 * Summarises a completed run into the artifact fields attached to eval results.
 *
 * @param input Object with `run` (reads `toolStarts`, `toolArgs`, `text`),
 *   `traceSignature` and an optional `runId`.
 * @returns Artifacts with `runId` (only when truthy), `traceSignature`,
 *   `toolStarts`, tool args joined with " | " and capped at 1000 characters,
 *   and the first 280 characters of the run text.
 */
function createLiveEvalRunArtifacts(input) {
    const { run, runId, traceSignature } = input;
    const runIdPart = runId ? { runId } : {};
    const joinedToolArgs = run.toolArgs.join(" | ");
    return {
        ...runIdPart,
        traceSignature,
        toolStarts: run.toolStarts,
        toolArgsPreview: joinedToolArgs.slice(0, 1000),
        textPreview: run.text.slice(0, 280),
    };
}
|
|
144
|
+
/**
 * Builds a failed eval result for a run that completed but did not verify.
 *
 * @param input Object with `context` (id, label, runtime, startedAt and optional
 *   conversationId / artifactPaths), `details` and `runArtifacts`.
 * @returns Whatever `createFailedEvalResult` returns for the assembled record.
 *   Optional fields are included only when they are truthy / non-empty.
 */
function createFailedRunEvalResult(input) {
    const { context, details, runArtifacts } = input;
    const conversationPart = context.conversationId
        ? { conversationId: context.conversationId }
        : {};
    const runIdPart = runArtifacts.runId ? { runId: runArtifacts.runId } : {};
    const artifactPathsPart = context.artifactPaths?.length
        ? { artifactPaths: context.artifactPaths }
        : {};
    return createFailedEvalResult({
        id: context.id,
        label: context.label,
        runtime: context.runtime,
        details,
        startedAt: context.startedAt,
        ...conversationPart,
        ...runIdPart,
        ...artifactPathsPart,
        traceSignature: runArtifacts.traceSignature,
        toolStarts: runArtifacts.toolStarts,
        toolArgsPreview: runArtifacts.toolArgsPreview,
        textPreview: runArtifacts.textPreview,
    });
}
|
|
160
|
+
/**
 * Builds a passed eval result for a run that completed and verified.
 *
 * @param input Object with `context` (id, label, runtime, startedAt and optional
 *   conversationId / artifactPaths), `details` and `runArtifacts`.
 * @returns Whatever `createPassedEvalResult` returns for the assembled record.
 *   Optional fields are included only when they are truthy / non-empty.
 */
function createPassedRunEvalResult(input) {
    const { context, details, runArtifacts } = input;
    const conversationPart = context.conversationId
        ? { conversationId: context.conversationId }
        : {};
    const runIdPart = runArtifacts.runId ? { runId: runArtifacts.runId } : {};
    const artifactPathsPart = context.artifactPaths?.length
        ? { artifactPaths: context.artifactPaths }
        : {};
    return createPassedEvalResult({
        id: context.id,
        label: context.label,
        runtime: context.runtime,
        details,
        startedAt: context.startedAt,
        ...conversationPart,
        ...runIdPart,
        ...artifactPathsPart,
        traceSignature: runArtifacts.traceSignature,
        toolStarts: runArtifacts.toolStarts,
        toolArgsPreview: runArtifacts.toolArgsPreview,
        textPreview: runArtifacts.textPreview,
    });
}
|
|
176
|
+
/**
 * Builds a failed eval result for a run that broke before completing
 * (request or stream error).
 *
 * @param input Object with `context`, the error `details` text and the last
 *   `progress` snapshot (reads `toolStarts` and `textLength`).
 * @returns Whatever `createFailedEvalResult` returns. `details` gets the suffix
 *   from `buildFailureSuffix(progress)`; `textPreview` is always present and is
 *   undefined when no text was streamed.
 */
function createStreamingFailureEvalResult(input) {
    const { context, progress } = input;
    const conversationPart = context.conversationId
        ? { conversationId: context.conversationId }
        : {};
    const artifactPathsPart = context.artifactPaths?.length
        ? { artifactPaths: context.artifactPaths }
        : {};
    const detailsWithSuffix = `${input.details}${buildFailureSuffix(progress)}`;
    let textPreview;
    if (progress.textLength > 0) {
        textPreview = `${progress.textLength} characters streamed`;
    }
    return createFailedEvalResult({
        id: context.id,
        label: context.label,
        runtime: context.runtime,
        details: detailsWithSuffix,
        startedAt: context.startedAt,
        ...conversationPart,
        ...artifactPathsPart,
        toolStarts: progress.toolStarts,
        textPreview,
    });
}
|
|
191
|
+
/**
 * Assembles the per-case context shared by all result builders.
 *
 * @param input Object with `testCase` (reads `id`, `label`), `runtime`,
 *   `startedAt`, `conversationId` and `artifactPaths` (an array).
 * @returns `{ id, label, runtime, startedAt }` plus `conversationId` when truthy
 *   and `artifactPaths` when non-empty.
 */
function createLiveEvalResultContext(input) {
    const { testCase, runtime, startedAt, conversationId, artifactPaths } = input;
    const hasArtifactPaths = artifactPaths.length > 0;
    return {
        id: testCase.id,
        label: testCase.label,
        runtime,
        startedAt,
        ...(conversationId ? { conversationId } : {}),
        ...(hasArtifactPaths ? { artifactPaths } : {}),
    };
}
|
|
201
|
+
/**
 * Produces the request body for one eval run.
 *
 * When the prepared metadata carries a non-empty string `customBody`, that
 * string is parsed as JSON and returned as-is (a malformed string makes
 * `JSON.parse` throw). Otherwise the body is built with
 * `buildLiveEvalRequestBody` from the test case, prepared input and config.
 *
 * @param input Object with `config`, `testCase`, `prepared` (may be null) and
 *   `conversationId`.
 * @returns The request body to serialise.
 */
function buildLiveEvalRunBody(input) {
    const { config, testCase, prepared, conversationId } = input;
    const rawCustomBody = prepared?.metadata?.customBody;
    if (typeof rawCustomBody === "string" && rawCustomBody) {
        return JSON.parse(rawCustomBody);
    }
    // The project id is only forwarded for cases that declare they need a project.
    const projectId = config.projectId && testCase.requireProject
        ? config.projectId
        : null;
    return buildLiveEvalRequestBody({
        testCaseId: testCase.id,
        prompt: prepared?.prompt ?? testCase.prompt ?? "",
        metadata: prepared?.metadata,
        projectId,
        ...(config.branchId ? { branchId: config.branchId } : {}),
        ...(config.model ? { model: config.model } : {}),
        ...(conversationId ? { conversationId } : {}),
        allowedTools: testCase.allowedTools,
        forceRuntimeOverrides: testCase.forceRuntimeOverrides,
        maxSteps: testCase.maxSteps,
    });
}
|
|
223
|
+
/**
 * Turns a fully streamed run into a passed or failed eval result.
 *
 * The case's own `verify` runs first; when it reports no failure and the case
 * declares `expectedEventSubsequence`, the run's event types must contain that
 * subsequence in order.
 *
 * @param input Object with `testCase`, `run`, `prepared`, `context` and an
 *   optional `runId`.
 * @returns The result from `createFailedRunEvalResult` or `createPassedRunEvalResult`.
 */
async function resolveCompletedLiveEvalRun(input) {
    const { testCase, run, context } = input;
    const traceSignature = buildTraceSignature(run.eventTypes);
    const runArtifacts = createLiveEvalRunArtifacts({
        run,
        runId: input.runId,
        traceSignature,
    });
    const failWith = (details) => createFailedRunEvalResult({ context, details, runArtifacts });
    const failure = await testCase.verify(run, input.prepared);
    if (failure) {
        return failWith(failure);
    }
    const expectedSubsequence = testCase.expectedEventSubsequence;
    if (expectedSubsequence && !containsOrderedSubsequence(run.eventTypes, expectedSubsequence)) {
        return failWith(`Expected AG-UI event subsequence ${expectedSubsequence.join(" -> ")}, got ${traceSignature}`);
    }
    const toolSummary = run.toolStarts.join(", ") || "no tools";
    const textSummary = run.text.slice(0, 140) || "no text";
    return createPassedRunEvalResult({
        context,
        details: `OK: ${toolSummary} | ${textSummary}`,
        runArtifacts,
    });
}
|
|
253
|
+
/**
 * Finds the run id reported by the stream.
 *
 * Events are scanned in order; for each one `runId` is preferred and `run_id`
 * is consulted only when the first lookup yields null/undefined.
 *
 * @param run Parsed run; reads `run.events`.
 * @returns The first truthy id found, or null when no event carries one.
 */
function extractRunId(run) {
    for (const event of run.events) {
        const camelCaseId = getStringField(event, "runId");
        const candidate = camelCaseId ?? getStringField(event, "run_id");
        if (!candidate) {
            continue;
        }
        return candidate;
    }
    return null;
}
|
|
262
|
+
/**
 * Tells whether a run finished cleanly.
 *
 * @param run Parsed run; reads `eventTypes` and `runError`.
 * @returns true when the run-finished event type was seen and `runError` is falsy.
 */
export function hasFinished(run) {
    const sawRunFinished = run.eventTypes.includes(agUiSseEventTypes.runFinished);
    if (!sawRunFinished) {
        return false;
    }
    return !run.runError;
}
|
|
265
|
+
/**
 * Tells whether the run called the `load_skill` tool and mentioned a skill id.
 *
 * The id is searched in the concatenation of all tool argument strings, not
 * per tool call.
 *
 * @param run Parsed run; reads `toolStarts` and `toolArgs`.
 * @param skillId Skill id to look for.
 * @returns true when `load_skill` was started and the joined args contain `skillId`.
 */
export function containsSkillLoad(run, skillId) {
    if (!run.toolStarts.includes("load_skill")) {
        return false;
    }
    const allToolArgs = run.toolArgs.join("");
    return allToolArgs.includes(skillId);
}
|
|
268
|
+
/**
 * Counts how many step-started events the run emitted.
 *
 * @param run Parsed run; reads `eventTypes`.
 * @returns Number of entries equal to `agUiSseEventTypes.stepStarted`.
 */
export function countStepStartedEvents(run) {
    return run.eventTypes.reduce((stepCount, eventType) => {
        return eventType === agUiSseEventTypes.stepStarted ? stepCount + 1 : stepCount;
    }, 0);
}
|
|
271
|
+
/**
 * Creates the helpers a live eval suite uses to run cases against the
 * configured endpoint.
 *
 * @param config Runner configuration. This block reads `log`, `projectId`,
 *   `readProjectFile`, `requestTimeoutMs`, `apiUrl`, `authToken`, `endpoint`
 *   and `progressLogIntervalMs`, and passes the whole config to `resolveFetch`,
 *   `createLiveEvalJudgeSupport` and `buildLiveEvalRunBody`.
 * @returns `{ judgeLlm, runEval, verifyFileExists, withJudge }`.
 */
export function createLiveEvalCaseSupport(config) {
    const fetchImpl = resolveFetch(config);
    const log = config.log ?? console.log;
    const { judgeLlm, withJudge } = createLiveEvalJudgeSupport(config);
    /**
     * Checks that a project file exists, is non-empty and (optionally) contains
     * every required keyword, compared case-insensitively.
     *
     * @param input Object with `filePath`, optional `description` and optional
     *   `requiredContent` (array of keywords).
     * @returns A failure message, or null when the check passes. Also null when
     *   `config.projectId` or `config.readProjectFile` is missing, i.e. the check
     *   is skipped rather than failed.
     */
    async function verifyFileExists(input) {
        if (!config.projectId || !config.readProjectFile) {
            return null;
        }
        const file = await config.readProjectFile({
            filePath: input.filePath,
            requestTimeoutMs: config.requestTimeoutMs,
        });
        if (!file) {
            return `${input.description ?? input.filePath}: file not found in project after task completed`;
        }
        if (!file.content || file.content.trim().length === 0) {
            return `${input.description ?? input.filePath}: file exists but is empty`;
        }
        if (input.requiredContent) {
            const missing = input.requiredContent.filter((keyword) => !file.content.toLowerCase().includes(keyword.toLowerCase()));
            if (missing.length > 0) {
                return `${input.description ?? input.filePath}: missing required content: ${missing.join(", ")}. Got: ${file.content.slice(0, 200)}`;
            }
        }
        return null;
    }
    /**
     * Runs one eval case: prepares it, posts the request, parses the SSE stream
     * and verifies the outcome.
     *
     * Request, stream and verification errors are converted into a failed result.
     * Errors from preparation, sidecar start-up or request-body construction still
     * reject, after the resources acquired so far have been released.
     *
     * @param testCase The eval case definition.
     * @param runtime Runtime label recorded on the result.
     * @returns The eval result record (skipped, passed or failed).
     */
    async function runEval(testCase, runtime) {
        const startedAt = Date.now();
        if (testCase.requireProject && !config.projectId) {
            return createSkippedEvalResult({
                id: testCase.id,
                label: testCase.label,
                runtime,
                details: "Skipped because AG_UI_EVAL_PROJECT_ID is not set.",
                startedAt,
            });
        }
        const prepared = testCase.prepare
            ? await testCase.prepare({
                apiUrl: config.apiUrl,
                authToken: config.authToken,
                projectId: config.projectId,
            })
            : null;
        const preparedConversationId = extractPreparedConversationId(prepared);
        const preparedArtifactPaths = collectPreparedArtifactPaths(prepared);
        const resultContext = createLiveEvalResultContext({
            testCase,
            runtime,
            startedAt,
            conversationId: preparedConversationId,
            artifactPaths: preparedArtifactPaths,
        });
        try {
            const sidecarCleanup = prepared?.startSidecar ? await prepared.startSidecar() : undefined;
            let progressReporter;
            // Everything after the sidecar start sits under one try/finally, so the
            // progress timer and sidecar are released even when building the request
            // body throws (e.g. a malformed `customBody`).
            try {
                progressReporter = createLiveEvalProgressReporter({
                    caseId: testCase.id,
                    startedAt,
                    intervalMs: config.progressLogIntervalMs,
                    log,
                });
                const body = buildLiveEvalRunBody({
                    config,
                    testCase,
                    prepared,
                    conversationId: preparedConversationId,
                });
                try {
                    const response = await fetchImpl(config.endpoint, {
                        method: "POST",
                        headers: {
                            "Content-Type": "application/json",
                            Authorization: `Bearer ${config.authToken}`,
                        },
                        body: JSON.stringify(body),
                        signal: AbortSignal.timeout(config.requestTimeoutMs),
                    });
                    log(`[stream] ${runtime}:${testCase.id} HTTP ${response.status}`);
                    const run = await parseSseResponse(response, {
                        onProgress: progressReporter.update,
                    });
                    // `return await` keeps verification inside this try: without it the
                    // finally blocks would tear down the sidecar and prepared input while
                    // `testCase.verify` is still pending, and a verification rejection
                    // would bypass the catch below.
                    return await resolveCompletedLiveEvalRun({
                        testCase,
                        run,
                        prepared,
                        context: resultContext,
                        runId: extractRunId(run) ?? undefined,
                    });
                }
                catch (error) {
                    const message = error instanceof Error ? error.message : String(error);
                    return createStreamingFailureEvalResult({
                        context: resultContext,
                        details: message,
                        progress: progressReporter.getSnapshot(),
                    });
                }
            }
            finally {
                progressReporter?.stop();
                await sidecarCleanup?.();
            }
        }
        finally {
            await prepared?.cleanup?.();
        }
    }
    return {
        judgeLlm,
        runEval,
        verifyFileExists,
        withJudge,
    };
}
|
|
384
|
+
/**
 * Module-private helpers gathered on one exported object so they can be
 * reached from outside this module without being individual exports.
 */
export const liveEvalRunnerInternals = {
    collectPreparedArtifactPaths: collectPreparedArtifactPaths,
    createFailedRunEvalResult: createFailedRunEvalResult,
    createLiveEvalRunArtifacts: createLiveEvalRunArtifacts,
    createPassedRunEvalResult: createPassedRunEvalResult,
    createStreamingFailureEvalResult: createStreamingFailureEvalResult,
    extractRunId: extractRunId,
};
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
export declare const VERSION = "0.1.522";
|
|
1
|
+
export declare const VERSION = "0.1.523";
|
|
2
2
|
//# sourceMappingURL=version-constant.d.ts.map
|
package/package.json
CHANGED
package/src/deno.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
export default {
|
|
2
2
|
"name": "veryfront",
|
|
3
|
-
"version": "0.1.522",
|
|
3
|
+
"version": "0.1.523",
|
|
4
4
|
"license": "Apache-2.0",
|
|
5
5
|
"nodeModulesDir": "auto",
|
|
6
6
|
"workspace": [
|
|
@@ -296,9 +296,6 @@ export default {
|
|
|
296
296
|
"strict": true,
|
|
297
297
|
"noImplicitAny": true,
|
|
298
298
|
"noUncheckedIndexedAccess": true,
|
|
299
|
-
"types": [
|
|
300
|
-
"npm:@types/react@19.2.14"
|
|
301
|
-
],
|
|
302
299
|
"lib": [
|
|
303
300
|
"deno.window",
|
|
304
301
|
"dom",
|
|
@@ -431,14 +428,5 @@ export default {
|
|
|
431
428
|
"semiColons": true,
|
|
432
429
|
"singleQuote": false,
|
|
433
430
|
"proseWrap": "preserve"
|
|
434
|
-
},
|
|
435
|
-
"allowScripts": {
|
|
436
|
-
"allow": [
|
|
437
|
-
"npm:sharp@0.33.5",
|
|
438
|
-
"npm:onnxruntime-node@1.21.0"
|
|
439
|
-
],
|
|
440
|
-
"deny": [
|
|
441
|
-
"npm:protobufjs@7.5.4"
|
|
442
|
-
]
|
|
443
431
|
}
|
|
444
432
|
};
|
|
@@ -27,17 +27,29 @@ export {
|
|
|
27
27
|
buildProgressLine,
|
|
28
28
|
buildRuntimePerformanceSummary,
|
|
29
29
|
containsOrderedSubsequence,
|
|
30
|
+
containsSkillLoad,
|
|
31
|
+
countStepStartedEvents,
|
|
30
32
|
createFailedEvalResult,
|
|
33
|
+
createLiveEvalCaseSupport,
|
|
31
34
|
createPassedEvalResult,
|
|
32
35
|
createPlainTextPdf,
|
|
33
36
|
createSkippedEvalResult,
|
|
34
37
|
hasEveryLiveEvalTag,
|
|
38
|
+
hasFinished,
|
|
39
|
+
type LiveEvalCase,
|
|
35
40
|
type LiveEvalCaseMetadata,
|
|
36
41
|
type LiveEvalCaseSelectionInput,
|
|
42
|
+
type LiveEvalContext,
|
|
43
|
+
type LiveEvalProjectFile,
|
|
44
|
+
type LiveEvalProjectFileReaderInput,
|
|
45
|
+
type LiveEvalRequestBody,
|
|
37
46
|
type LiveEvalResultForPerformance,
|
|
38
47
|
type LiveEvalResultForReport,
|
|
39
48
|
type LiveEvalResultRecord,
|
|
49
|
+
type LiveEvalRunnerConfig,
|
|
50
|
+
liveEvalRunnerInternals,
|
|
40
51
|
type LiveEvalRuntime,
|
|
52
|
+
type PreparedLiveEvalInput,
|
|
41
53
|
resolveLiveEvalRequestedCaseIds,
|
|
42
54
|
type RuntimePerformanceSummary,
|
|
43
55
|
selectLiveEvalCases,
|
|
@@ -10,7 +10,11 @@ export {
|
|
|
10
10
|
type LiveEvalRuntime,
|
|
11
11
|
type RuntimePerformanceSummary,
|
|
12
12
|
} from "./performance.js";
|
|
13
|
-
export {
|
|
13
|
+
export {
|
|
14
|
+
buildLiveEvalRequestBody,
|
|
15
|
+
type BuildLiveEvalRequestBodyInput,
|
|
16
|
+
type LiveEvalRequestBody,
|
|
17
|
+
} from "./request.js";
|
|
14
18
|
export {
|
|
15
19
|
buildLiveEvalCaseTagSummary,
|
|
16
20
|
buildLiveEvalRuntimeSummary,
|
|
@@ -28,3 +32,16 @@ export {
|
|
|
28
32
|
createSkippedEvalResult,
|
|
29
33
|
type LiveEvalResultRecord,
|
|
30
34
|
} from "./result.js";
|
|
35
|
+
export {
|
|
36
|
+
containsSkillLoad,
|
|
37
|
+
countStepStartedEvents,
|
|
38
|
+
createLiveEvalCaseSupport,
|
|
39
|
+
hasFinished,
|
|
40
|
+
type LiveEvalCase,
|
|
41
|
+
type LiveEvalContext,
|
|
42
|
+
type LiveEvalProjectFile,
|
|
43
|
+
type LiveEvalProjectFileReaderInput,
|
|
44
|
+
type LiveEvalRunnerConfig,
|
|
45
|
+
liveEvalRunnerInternals,
|
|
46
|
+
type PreparedLiveEvalInput,
|
|
47
|
+
} from "./runner.js";
|
|
@@ -1,4 +1,20 @@
|
|
|
1
1
|
import * as dntShim from "../../../../_dnt.shims.js";
|
|
2
|
+
export interface LiveEvalRequestBody {
|
|
3
|
+
threadId: string;
|
|
4
|
+
runId: string;
|
|
5
|
+
state: Record<string, string>;
|
|
6
|
+
tools: unknown[];
|
|
7
|
+
context: unknown[];
|
|
8
|
+
forwardedProps?: {
|
|
9
|
+
veryfront: Record<string, unknown>;
|
|
10
|
+
};
|
|
11
|
+
messages: Array<{
|
|
12
|
+
id: string;
|
|
13
|
+
role: "user";
|
|
14
|
+
content: string;
|
|
15
|
+
}>;
|
|
16
|
+
}
|
|
17
|
+
|
|
2
18
|
export interface BuildLiveEvalRequestBodyInput {
|
|
3
19
|
testCaseId: string;
|
|
4
20
|
prompt: string;
|
|
@@ -12,7 +28,9 @@ export interface BuildLiveEvalRequestBodyInput {
|
|
|
12
28
|
maxSteps?: number;
|
|
13
29
|
}
|
|
14
30
|
|
|
15
|
-
export function buildLiveEvalRequestBody(
|
|
31
|
+
export function buildLiveEvalRequestBody(
|
|
32
|
+
input: BuildLiveEvalRequestBodyInput,
|
|
33
|
+
): LiveEvalRequestBody {
|
|
16
34
|
const veryfront: Record<string, unknown> = {};
|
|
17
35
|
if (input.projectId) {
|
|
18
36
|
veryfront.projectId = input.projectId;
|