veryfront 0.1.521 → 0.1.523

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. package/esm/cli/templates/manifest.d.ts +405 -405
  2. package/esm/cli/templates/manifest.js +454 -454
  3. package/esm/deno.d.ts +2 -7
  4. package/esm/deno.js +4 -16
  5. package/esm/extensions/{ext-tracing-opentelemetry → ext-observability-opentelemetry}/src/index.d.ts +3 -3
  6. package/esm/extensions/ext-observability-opentelemetry/src/index.d.ts.map +1 -0
  7. package/esm/extensions/{ext-tracing-opentelemetry → ext-observability-opentelemetry}/src/index.js +10 -10
  8. package/esm/src/agent/service/config.d.ts.map +1 -1
  9. package/esm/src/agent/service/config.js +2 -0
  10. package/esm/src/agent/service/node-telemetry.d.ts +1 -1
  11. package/esm/src/agent/service/node-telemetry.d.ts.map +1 -1
  12. package/esm/src/agent/service/node-telemetry.js +1 -1
  13. package/esm/src/agent/testing/index.d.ts +1 -1
  14. package/esm/src/agent/testing/index.d.ts.map +1 -1
  15. package/esm/src/agent/testing/index.js +1 -1
  16. package/esm/src/agent/testing/live-evals/index.d.ts +2 -1
  17. package/esm/src/agent/testing/live-evals/index.d.ts.map +1 -1
  18. package/esm/src/agent/testing/live-evals/index.js +2 -1
  19. package/esm/src/agent/testing/live-evals/request.d.ts +16 -17
  20. package/esm/src/agent/testing/live-evals/request.d.ts.map +1 -1
  21. package/esm/src/agent/testing/live-evals/runner.d.ts +124 -0
  22. package/esm/src/agent/testing/live-evals/runner.d.ts.map +1 -0
  23. package/esm/src/agent/testing/live-evals/runner.js +391 -0
  24. package/esm/src/agent/veryfront-cloud-agent-service.js +2 -2
  25. package/esm/src/extensions/{tracing → observability}/index.d.ts +2 -2
  26. package/esm/src/extensions/observability/index.d.ts.map +1 -0
  27. package/esm/src/extensions/{tracing → observability}/index.js +2 -2
  28. package/esm/src/extensions/{tracing → observability}/node-telemetry-provider.d.ts +2 -2
  29. package/esm/src/extensions/observability/node-telemetry-provider.d.ts.map +1 -0
  30. package/esm/src/extensions/{tracing → observability}/node-telemetry-provider.js +2 -2
  31. package/esm/src/extensions/{tracing → observability}/tracing-exporter.d.ts +2 -2
  32. package/esm/src/extensions/observability/tracing-exporter.d.ts.map +1 -0
  33. package/esm/src/extensions/observability/tracing-exporter.js +8 -0
  34. package/esm/src/extensions/recommendations.js +2 -2
  35. package/esm/src/observability/metrics/manager.js +1 -1
  36. package/esm/src/observability/simple-metrics/otel-instruments.js +1 -1
  37. package/esm/src/observability/tracing/api-shim.d.ts +4 -4
  38. package/esm/src/observability/tracing/api-shim.js +7 -7
  39. package/esm/src/observability/tracing/manager.js +2 -2
  40. package/esm/src/observability/tracing/otlp-setup.d.ts +1 -1
  41. package/esm/src/observability/tracing/otlp-setup.js +4 -4
  42. package/esm/src/proxy/tracing.d.ts +1 -1
  43. package/esm/src/proxy/tracing.js +2 -2
  44. package/esm/src/react/components/chat/theme.d.ts.map +1 -1
  45. package/esm/src/react/components/chat/theme.js +4 -2
  46. package/esm/src/server/dev-ui/manifest.d.ts +17 -17
  47. package/esm/src/server/dev-ui/manifest.js +17 -17
  48. package/esm/src/server/handlers/dev/framework-candidates.generated.d.ts.map +1 -1
  49. package/esm/src/server/handlers/dev/framework-candidates.generated.js +4 -1
  50. package/esm/src/utils/version-constant.d.ts +1 -1
  51. package/esm/src/utils/version-constant.js +1 -1
  52. package/package.json +4 -4
  53. package/src/cli/templates/manifest.js +454 -454
  54. package/src/deno.js +4 -16
  55. package/src/extensions/{ext-tracing-opentelemetry → ext-observability-opentelemetry}/src/index.ts +12 -12
  56. package/src/src/agent/service/config.ts +2 -0
  57. package/src/src/agent/service/node-telemetry.ts +1 -1
  58. package/src/src/agent/testing/index.ts +12 -0
  59. package/src/src/agent/testing/live-evals/index.ts +18 -1
  60. package/src/src/agent/testing/live-evals/request.ts +19 -1
  61. package/src/src/agent/testing/live-evals/runner.ts +629 -0
  62. package/src/src/agent/veryfront-cloud-agent-service.ts +2 -2
  63. package/src/src/extensions/{tracing → observability}/index.ts +2 -2
  64. package/src/src/extensions/{tracing → observability}/node-telemetry-provider.ts +2 -2
  65. package/src/src/extensions/{tracing → observability}/tracing-exporter.ts +2 -2
  66. package/src/src/extensions/recommendations.ts +2 -2
  67. package/src/src/observability/metrics/manager.ts +1 -1
  68. package/src/src/observability/simple-metrics/otel-instruments.ts +1 -1
  69. package/src/src/observability/tracing/api-shim.ts +7 -7
  70. package/src/src/observability/tracing/manager.ts +2 -2
  71. package/src/src/observability/tracing/otlp-setup.ts +4 -4
  72. package/src/src/proxy/tracing.ts +2 -2
  73. package/src/src/react/components/chat/theme.ts +4 -2
  74. package/src/src/server/bootstrap.ts +1 -1
  75. package/src/src/server/dev-ui/manifest.js +17 -17
  76. package/src/src/server/handlers/dev/framework-candidates.generated.ts +4 -1
  77. package/src/src/utils/version-constant.ts +1 -1
  78. package/esm/extensions/ext-tracing-opentelemetry/src/index.d.ts.map +0 -1
  79. package/esm/src/extensions/tracing/index.d.ts.map +0 -1
  80. package/esm/src/extensions/tracing/node-telemetry-provider.d.ts.map +0 -1
  81. package/esm/src/extensions/tracing/tracing-exporter.d.ts.map +0 -1
  82. package/esm/src/extensions/tracing/tracing-exporter.js +0 -8
@@ -0,0 +1,629 @@
1
+ import * as dntShim from "../../../../_dnt.shims.js";
2
+ import {
3
+ agUiSseEventTypes,
4
+ type AgUiSseProgressSnapshot as EvalProgressSnapshot,
5
+ buildAgUiSseTraceSignature as buildTraceSignature,
6
+ getAgUiSseStringField as getStringField,
7
+ parseAgUiSseResponse as parseSseResponse,
8
+ type ParsedAgUiSseRun as ParsedRun,
9
+ } from "../../index.js";
10
+ import { buildFailureSuffix, buildProgressLine, containsOrderedSubsequence } from "./formatting.js";
11
+ import { type LiveEvalRuntime } from "./performance.js";
12
+ import { buildLiveEvalRequestBody } from "./request.js";
13
+ import { type LiveEvalCaseMetadata } from "./report.js";
14
+ import {
15
+ createFailedEvalResult,
16
+ createPassedEvalResult,
17
+ createSkippedEvalResult,
18
+ type LiveEvalResultRecord,
19
+ } from "./result.js";
20
+
21
/**
 * Per-case input produced by `LiveEvalCase.prepare` before the eval request is sent.
 */
export interface PreparedLiveEvalInput {
  /** When set, takes precedence over the case's own `prompt` in the request body. */
  prompt?: string;
  /**
   * String metadata forwarded with the request. The runner additionally reads
   * the `conversationId` and `customBody` keys, and collects the values of keys
   * whose name contains "path" as artifact paths.
   */
  metadata?: Record<string, string>;
  /** Optional context for verification; the runner in this module does not read it. */
  verificationContext?: LiveEvalContext;
  /** Awaited by the runner after the eval attempt, on success and on failure. */
  cleanup?: () => Promise<void>;
  /**
   * Awaited before the request is sent. If it resolves to a function, that
   * function is awaited once the run has finished.
   */
  startSidecar?: () => Promise<(() => Promise<void>) | void>;
}
28
+
29
/**
 * Connection details handed to `LiveEvalCase.prepare`; the runner fills them
 * from the matching `LiveEvalRunnerConfig` fields.
 */
export interface LiveEvalContext {
  /** Copied from `LiveEvalRunnerConfig.apiUrl`. */
  apiUrl: string;
  /** Copied from `LiveEvalRunnerConfig.authToken`. */
  authToken: string;
  /** Copied from `LiveEvalRunnerConfig.projectId`; `null` when none is configured. */
  projectId: string | null;
}
34
+
35
/**
 * Definition of a single live eval: what to send and how to judge the parsed run.
 */
export interface LiveEvalCase {
  /** Identifier used as the request's test case id, in log lines and in results. */
  readonly id: string;
  /** Label copied onto every result record for this case. */
  readonly label: string;
  /** Prompt sent when `prepare` does not supply one; an empty string is used if both are absent. */
  readonly prompt?: string;
  /** Forwarded unchanged to the request body. */
  allowedTools?: string[];
  /** Forwarded unchanged to the request body. */
  forceRuntimeOverrides?: boolean;
  /**
   * When true, the case is skipped if no project id is configured, and the
   * configured project id is included in the request body.
   */
  requireProject?: boolean;
  /** Forwarded unchanged to the request body. */
  maxSteps?: number;
  /**
   * Event types that must appear, in this order, among the run's event types.
   * Only checked when `verify` reports no failure.
   */
  expectedEventSubsequence?: string[];
  /** Descriptive case metadata; not read by the runner in this module. */
  metadata?: LiveEvalCaseMetadata;
  /** Optional setup step executed before the request is built. */
  prepare?: (context: LiveEvalContext) => Promise<PreparedLiveEvalInput>;
  /**
   * Inspects the parsed run. Returns a failure description, or `null` when the
   * run is acceptable. `prepared` is `null` when the case has no `prepare`.
   */
  verify: (
    run: ParsedRun,
    prepared: PreparedLiveEvalInput | null,
  ) => string | null | Promise<string | null>;
}
51
+
52
/** Arguments for `verifyFileExists`. */
interface FileCheckInput {
  /** Path handed to the configured project file reader. */
  filePath: string;
  /** Keywords that must all occur in the file content (compared case-insensitively). */
  requiredContent?: string[];
  /** Name used in failure messages instead of `filePath` when provided. */
  description?: string;
}
57
+
58
/** A project file as returned by `LiveEvalRunnerConfig.readProjectFile`. */
export interface LiveEvalProjectFile {
  /** Path of the file. */
  path: string;
  /** Text content of the file; blank content is reported as an empty file. */
  content: string;
}
62
+
63
/** Arguments passed to `LiveEvalRunnerConfig.readProjectFile`. */
export interface LiveEvalProjectFileReaderInput {
  /** Path of the file to read. */
  filePath: string;
  /** Populated from `LiveEvalRunnerConfig.requestTimeoutMs`. */
  requestTimeoutMs: number;
}
67
+
68
/** Settings shared by every helper created through `createLiveEvalCaseSupport`. */
export interface LiveEvalRunnerConfig {
  /** URL that eval requests and LLM-judge requests are POSTed to. */
  endpoint: string;
  /** Sent as a `Bearer` token in the `Authorization` header and passed to `prepare`. */
  authToken: string;
  /** Passed through to `prepare` via `LiveEvalContext`. */
  apiUrl: string;
  /**
   * Project to run against. Cases with `requireProject` are skipped when this
   * is `null`, and `verifyFileExists` performs no check without it.
   */
  projectId: string | null;
  /** Added to the request body when set. */
  branchId: string | null;
  /** Added to the request body when set. */
  model: string | null;
  /** Abort timeout for the eval request; also handed to `readProjectFile`. */
  requestTimeoutMs: number;
  /** Delay between periodic progress log lines while a run is streaming. */
  progressLogIntervalMs: number;
  /** When false, `withJudge` runs only the structural check and skips the LLM judge. */
  enableLlmJudge: boolean;
  /** Replacement for the global `fetch`. */
  fetch?: (input: string | URL | Request, init?: RequestInit) => Promise<Response>;
  /** Log sink; `console.log` is used when omitted. */
  log?: (message: string) => void;
  /** Reads a project file for `verifyFileExists`; `null` is treated as "file not found". */
  readProjectFile?: (input: LiveEvalProjectFileReaderInput) => Promise<LiveEvalProjectFile | null>;
}
82
+
83
/** Case-specific part of an LLM-judge request. */
interface LiveEvalJudgeInput {
  /** Inserted after `QUESTION:` in the judge prompt. */
  question: string;
  /** Inserted after `CRITERIA:` in the judge prompt. */
  criteria: string;
}
87
+
88
/** Complete LLM-judge request: the case-specific input plus the answer to grade. */
interface LiveEvalJudgeRequest extends LiveEvalJudgeInput {
  /** Inserted after `ANSWER:` in the judge prompt. */
  answer: string;
}
91
+
92
/** Outcome of an LLM-judge call. */
interface LiveEvalJudgeResult {
  /** True only when the judge's first non-empty line starts with "PASS" (any letter case). */
  pass: boolean;
  /** The judge's verdict line, or a description of why no verdict was obtained. */
  reason: string;
}
96
+
97
/** Returns the injected `fetch` from the config, or the global `fetch` when none was given. */
function resolveFetch(config: Pick<LiveEvalRunnerConfig, "fetch">) {
  const { fetch: injectedFetch } = config;
  return injectedFetch ?? fetch;
}
100
+
101
/**
 * Creates the LLM-judge helpers bound to the given endpoint settings.
 *
 * - `judgeLlm` sends the judge prompt to `config.endpoint` and turns the first
 *   non-empty line of the reply into a pass/fail result. It never rejects:
 *   any thrown error is reported as a failing result.
 * - `withJudge` wraps a structural check so that the LLM judge is consulted
 *   only when the structural check passes and `config.enableLlmJudge` is set.
 */
function createLiveEvalJudgeSupport(
  config: Pick<LiveEvalRunnerConfig, "endpoint" | "authToken" | "enableLlmJudge" | "fetch">,
): {
  judgeLlm: (input: LiveEvalJudgeRequest) => Promise<LiveEvalJudgeResult>;
  withJudge: (
    structuralVerify: (run: ParsedRun) => string | null,
    judgeInput: LiveEvalJudgeInput,
  ) => (run: ParsedRun) => Promise<string | null>;
} {
  const judgeLlm = async (input: LiveEvalJudgeRequest): Promise<LiveEvalJudgeResult> => {
    try {
      const prompt = `You are an eval judge. Grade the following answer.

QUESTION: ${input.question}

ANSWER: ${input.answer}

CRITERIA: ${input.criteria}

Respond with exactly one line: PASS or FAIL followed by a brief reason.
Example: "PASS — correctly explains the pattern with accurate details"
Example: "FAIL — mentions the wrong file convention"`;
      const requestBody = buildLiveEvalRequestBody({
        testCaseId: "llm-judge",
        prompt,
        projectId: null,
        allowedTools: [],
        forceRuntimeOverrides: true,
        maxSteps: 2,
      });

      const send = resolveFetch(config);
      const response = await send(config.endpoint, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${config.authToken}`,
        },
        body: JSON.stringify(requestBody),
        signal: AbortSignal.timeout(30_000),
      });

      const judgeRun = await parseSseResponse(response);
      if (judgeRun.responseStatus !== 200) {
        return { pass: false, reason: `judge returned HTTP ${judgeRun.responseStatus}` };
      }

      // The verdict is the first line that still has content after trimming.
      let verdictLine = "";
      for (const rawLine of judgeRun.text.split("\n")) {
        const trimmedLine = rawLine.trim();
        if (trimmedLine.length > 0) {
          verdictLine = trimmedLine;
          break;
        }
      }

      const passed = verdictLine.toUpperCase().startsWith("PASS");
      if (passed) {
        return { pass: true, reason: verdictLine };
      }
      return { pass: false, reason: verdictLine || "judge returned no decision" };
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      return { pass: false, reason };
    }
  };

  const withJudge = (
    structuralVerify: (run: ParsedRun) => string | null,
    judgeInput: LiveEvalJudgeInput,
  ): (run: ParsedRun) => Promise<string | null> =>
  async (run) => {
    const structuralFailure = structuralVerify(run);
    if (structuralFailure) {
      return structuralFailure;
    }
    if (!config.enableLlmJudge) {
      return null;
    }

    const { pass, reason } = await judgeLlm({
      question: judgeInput.question,
      answer: run.text,
      criteria: judgeInput.criteria,
    });
    if (pass) {
      return null;
    }
    return `LLM judge: ${reason}`;
  };

  return { judgeLlm, withJudge };
}
187
+
188
/** Handle for the periodic progress logger that runs while an eval is streaming. */
interface LiveEvalProgressReporter {
  /** Cancels the periodic logging. */
  stop: () => void;
  /** Replaces the snapshot used by subsequent progress lines. */
  update: (snapshot: EvalProgressSnapshot) => void;
  /** Returns the most recently stored snapshot. */
  getSnapshot: () => EvalProgressSnapshot;
}
193
+
194
/** Builds a fresh progress snapshot describing a run that has not emitted anything yet. */
function createInitialProgressSnapshot(): EvalProgressSnapshot {
  const emptySnapshot: EvalProgressSnapshot = {
    eventCount: 0,
    lastEventType: null,
    lastToolCallName: null,
    toolStarts: [],
    textLength: 0,
  };
  return emptySnapshot;
}
203
+
204
/** Minimal shape of a timer handle that exposes an `unref()` method. */
interface UnrefableTimer {
  /** Invoked by `maybeUnrefTimer` on handles that provide it. */
  unref: () => void;
}
207
+
208
/** Type guard: true when `value` is a non-null object whose `unref` property is a function. */
function isUnrefableTimer(value: unknown): value is UnrefableTimer {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  if (!("unref" in value)) {
    return false;
  }
  return typeof value.unref === "function";
}
212
+
213
/** Calls `unref()` on the timer handle when the handle provides one; otherwise does nothing. */
function maybeUnrefTimer(timer: ReturnType<typeof dntShim.setInterval>): void {
  if (!isUnrefableTimer(timer)) {
    return;
  }
  timer.unref();
}
218
+
219
/**
 * Starts a repeating timer that logs a progress line for the given case every
 * `intervalMs`, based on the most recently stored snapshot.
 *
 * The timer is unref'd when the handle supports it. Callers must invoke
 * `stop()` to cancel the timer.
 */
function createLiveEvalProgressReporter(input: {
  caseId: string;
  startedAt: number;
  intervalMs: number;
  log: (message: string) => void;
}): LiveEvalProgressReporter {
  let latestProgress = createInitialProgressSnapshot();

  const emitProgressLine = (): void => {
    const line = buildProgressLine({
      caseId: input.caseId,
      startedAt: input.startedAt,
      progress: latestProgress,
    });
    input.log(line);
  };

  const progressTimer = dntShim.setInterval(emitProgressLine, input.intervalMs);
  maybeUnrefTimer(progressTimer);

  // None of the methods rely on `this`, so they can be passed around unbound.
  return {
    stop() {
      clearInterval(progressTimer);
    },
    update(snapshot) {
      latestProgress = snapshot;
    },
    getSnapshot() {
      return latestProgress;
    },
  };
}
247
+
248
/**
 * Collects artifact paths from prepared metadata: the non-empty values of all
 * keys whose name contains "path" (case-insensitive), de-duplicated and sorted.
 * Returns an empty array when there is no metadata.
 */
function collectPreparedArtifactPaths(prepared: PreparedLiveEvalInput | null): string[] {
  const metadata = prepared?.metadata;
  if (!metadata) {
    return [];
  }

  const uniquePaths = new Set<string>();
  for (const [key, value] of Object.entries(metadata)) {
    const isPathKey = key.toLowerCase().includes("path");
    if (isPathKey && value.length > 0) {
      uniquePaths.add(value);
    }
  }

  return Array.from(uniquePaths).sort();
}
261
+
262
/** Returns the non-empty `conversationId` string from prepared metadata, or `null` if absent. */
function extractPreparedConversationId(prepared: PreparedLiveEvalInput | null): string | null {
  const conversationId = prepared?.metadata?.conversationId;
  if (typeof conversationId !== "string") {
    return null;
  }
  return conversationId.length > 0 ? conversationId : null;
}
268
+
269
/** Per-case fields that are copied onto every result record produced for that case. */
interface LiveEvalResultContext {
  /** Case id. */
  id: string;
  /** Case label. */
  label: string;
  /** Runtime the case was executed for. */
  runtime: LiveEvalRuntime;
  /** `Date.now()` timestamp captured when the case started. */
  startedAt: number;
  /** Conversation id taken from prepared metadata, when one was supplied. */
  conversationId?: string | null;
  /** Artifact paths taken from prepared metadata, when any were found. */
  artifactPaths?: string[];
}
277
+
278
/** Arguments for `createLiveEvalRunArtifacts`. */
interface LiveEvalRunArtifactsInput {
  /** Parsed run that the previews are derived from. */
  run: ParsedRun;
  /** Run id, when one was found in the run's events. */
  runId?: string;
  /** Trace signature built from the run's event types. */
  traceSignature: string;
}
283
+
284
/** Run-derived fields attached to pass/fail result records. */
interface LiveEvalRunArtifacts {
  /** Present only when a non-empty run id was supplied. */
  runId?: string;
  /** Trace signature of the run. */
  traceSignature: string;
  /** The run's tool-start names. */
  toolStarts: string[];
  /** Tool arguments joined with " | ", truncated to 1000 characters. */
  toolArgsPreview: string;
  /** First 280 characters of the run's text. */
  textPreview: string;
}
291
+
292
/**
 * Derives the result-record artifacts from a parsed run: the tool starts plus
 * truncated previews of the tool arguments (1000 chars) and text (280 chars).
 * `runId` is included only when it is a non-empty string.
 */
function createLiveEvalRunArtifacts(input: LiveEvalRunArtifactsInput): LiveEvalRunArtifacts {
  const { run, runId, traceSignature } = input;
  const runIdField = runId ? { runId } : {};
  const joinedToolArgs = run.toolArgs.join(" | ");

  return {
    ...runIdField,
    traceSignature,
    toolStarts: run.toolStarts,
    toolArgsPreview: joinedToolArgs.slice(0, 1000),
    textPreview: run.text.slice(0, 280),
  };
}
301
+
302
/**
 * Builds a failed result record for a run that completed but did not pass,
 * combining the case context with the run artifacts. Optional fields
 * (`conversationId`, `runId`, `artifactPaths`) are included only when non-empty.
 */
function createFailedRunEvalResult(input: {
  details: string;
  context: LiveEvalResultContext;
  runArtifacts: LiveEvalRunArtifacts;
}): LiveEvalResultRecord {
  const { context, details, runArtifacts } = input;
  const conversationIdField = context.conversationId
    ? { conversationId: context.conversationId }
    : {};
  const runIdField = runArtifacts.runId ? { runId: runArtifacts.runId } : {};
  const artifactPathsField = context.artifactPaths?.length
    ? { artifactPaths: context.artifactPaths }
    : {};

  return createFailedEvalResult({
    id: context.id,
    label: context.label,
    runtime: context.runtime,
    details,
    startedAt: context.startedAt,
    ...conversationIdField,
    ...runIdField,
    ...artifactPathsField,
    traceSignature: runArtifacts.traceSignature,
    toolStarts: runArtifacts.toolStarts,
    toolArgsPreview: runArtifacts.toolArgsPreview,
    textPreview: runArtifacts.textPreview,
  });
}
322
+
323
/**
 * Builds a passed result record for a completed run, combining the case
 * context with the run artifacts. Optional fields (`conversationId`, `runId`,
 * `artifactPaths`) are included only when non-empty.
 */
function createPassedRunEvalResult(input: {
  details: string;
  context: LiveEvalResultContext;
  runArtifacts: LiveEvalRunArtifacts;
}): LiveEvalResultRecord {
  const { context, details, runArtifacts } = input;
  const conversationIdField = context.conversationId
    ? { conversationId: context.conversationId }
    : {};
  const runIdField = runArtifacts.runId ? { runId: runArtifacts.runId } : {};
  const artifactPathsField = context.artifactPaths?.length
    ? { artifactPaths: context.artifactPaths }
    : {};

  return createPassedEvalResult({
    id: context.id,
    label: context.label,
    runtime: context.runtime,
    details,
    startedAt: context.startedAt,
    ...conversationIdField,
    ...runIdField,
    ...artifactPathsField,
    traceSignature: runArtifacts.traceSignature,
    toolStarts: runArtifacts.toolStarts,
    toolArgsPreview: runArtifacts.toolArgsPreview,
    textPreview: runArtifacts.textPreview,
  });
}
343
+
344
/**
 * Builds a failed result record for a run that threw before it could be
 * resolved. Only the progress snapshot is available, so the details get the
 * failure suffix derived from it and the text preview is a character count.
 */
function createStreamingFailureEvalResult(input: {
  details: string;
  context: LiveEvalResultContext;
  progress: EvalProgressSnapshot;
}): LiveEvalResultRecord {
  const { context, progress } = input;
  const conversationIdField = context.conversationId
    ? { conversationId: context.conversationId }
    : {};
  const artifactPathsField = context.artifactPaths?.length
    ? { artifactPaths: context.artifactPaths }
    : {};
  // The key stays present with an `undefined` value when nothing was streamed.
  const textPreview = progress.textLength > 0
    ? `${progress.textLength} characters streamed`
    : undefined;

  return createFailedEvalResult({
    id: context.id,
    label: context.label,
    runtime: context.runtime,
    details: `${input.details}${buildFailureSuffix(progress)}`,
    startedAt: context.startedAt,
    ...conversationIdField,
    ...artifactPathsField,
    toolStarts: progress.toolStarts,
    textPreview,
  });
}
363
+
364
/**
 * Assembles the per-case result context. `conversationId` and `artifactPaths`
 * are left off the returned object when they are empty.
 */
function createLiveEvalResultContext(input: {
  testCase: LiveEvalCase;
  runtime: LiveEvalRuntime;
  startedAt: number;
  conversationId: string | null;
  artifactPaths: string[];
}): LiveEvalResultContext {
  const { testCase, runtime, startedAt, conversationId, artifactPaths } = input;
  const conversationIdField = conversationId ? { conversationId } : {};
  const artifactPathsField = artifactPaths.length > 0 ? { artifactPaths } : {};

  return {
    id: testCase.id,
    label: testCase.label,
    runtime,
    startedAt,
    ...conversationIdField,
    ...artifactPathsField,
  };
}
380
+
381
/**
 * Produces the JSON body for an eval request.
 *
 * When prepared metadata carries a non-empty `customBody` string, that string
 * is parsed as JSON and returned as-is (a malformed string makes `JSON.parse`
 * throw). Otherwise the body is built from the case, prepared input and config.
 */
function buildLiveEvalRunBody(input: {
  config: LiveEvalRunnerConfig;
  testCase: LiveEvalCase;
  prepared: PreparedLiveEvalInput | null;
  conversationId: string | null;
}): unknown {
  const { config, testCase, prepared, conversationId } = input;

  const rawCustomBody = prepared?.metadata?.customBody;
  if (typeof rawCustomBody === "string" && rawCustomBody.length > 0) {
    return JSON.parse(rawCustomBody);
  }

  // The project id is only sent for cases that explicitly require a project.
  const projectId = config.projectId && testCase.requireProject ? config.projectId : null;
  const branchIdField = config.branchId ? { branchId: config.branchId } : {};
  const modelField = config.model ? { model: config.model } : {};
  const conversationIdField = conversationId ? { conversationId } : {};

  return buildLiveEvalRequestBody({
    testCaseId: testCase.id,
    prompt: prepared?.prompt ?? testCase.prompt ?? "",
    metadata: prepared?.metadata,
    projectId,
    ...branchIdField,
    ...modelField,
    ...conversationIdField,
    allowedTools: testCase.allowedTools,
    forceRuntimeOverrides: testCase.forceRuntimeOverrides,
    maxSteps: testCase.maxSteps,
  });
}
410
+
411
/**
 * Turns a fully parsed run into a result record.
 *
 * The case's `verify` runs first; a failure message from it yields a failed
 * result. If `verify` passes and the case declares an expected event
 * subsequence, that ordering is checked against the run's event types.
 * Otherwise the run passes.
 */
async function resolveCompletedLiveEvalRun(input: {
  testCase: LiveEvalCase;
  run: ParsedRun;
  prepared: PreparedLiveEvalInput | null;
  context: LiveEvalResultContext;
  runId?: string;
}): Promise<LiveEvalResultRecord> {
  const { testCase, run, prepared, context } = input;

  // Artifacts are captured before `verify` is invoked.
  const traceSignature = buildTraceSignature(run.eventTypes);
  const runArtifacts = createLiveEvalRunArtifacts({
    run,
    runId: input.runId,
    traceSignature,
  });

  const failure = await testCase.verify(run, prepared);
  if (failure) {
    return createFailedRunEvalResult({ context, details: failure, runArtifacts });
  }

  const expectedSubsequence = testCase.expectedEventSubsequence;
  if (expectedSubsequence && !containsOrderedSubsequence(run.eventTypes, expectedSubsequence)) {
    const expectedTrace = expectedSubsequence.join(" -> ");
    return createFailedRunEvalResult({
      context,
      details: `Expected AG-UI event subsequence ${expectedTrace}, got ${traceSignature}`,
      runArtifacts,
    });
  }

  const toolSummary = run.toolStarts.join(", ") || "no tools";
  const textSummary = run.text.slice(0, 140) || "no text";
  return createPassedRunEvalResult({
    context,
    details: `OK: ${toolSummary} | ${textSummary}`,
    runArtifacts,
  });
}
456
+
457
/**
 * Scans the run's events in order and returns the first non-empty run id,
 * reading the `runId` field and falling back to `run_id`. Returns `null` when
 * no event carries one.
 */
function extractRunId(run: ParsedRun): string | null {
  for (const event of run.events) {
    let candidate = getStringField(event, "runId");
    if (candidate === null || candidate === undefined) {
      candidate = getStringField(event, "run_id");
    }
    if (candidate) {
      return candidate;
    }
  }

  return null;
}
467
+
468
/** True when the run contains a run-finished event and has no `runError`. */
export function hasFinished(run: ParsedRun): boolean {
  if (!run.eventTypes.includes(agUiSseEventTypes.runFinished)) {
    return false;
  }
  return !run.runError;
}
471
+
472
/**
 * True when the run started a `load_skill` tool call and `skillId` occurs in
 * the run's tool arguments concatenated without a separator.
 */
export function containsSkillLoad(run: ParsedRun, skillId: string): boolean {
  if (!run.toolStarts.includes("load_skill")) {
    return false;
  }
  const concatenatedArgs = run.toolArgs.join("");
  return concatenatedArgs.includes(skillId);
}
475
+
476
/** Counts how many of the run's event types are the step-started event type. */
export function countStepStartedEvents(run: ParsedRun): number {
  let stepStartedCount = 0;
  for (const eventType of run.eventTypes) {
    if (eventType === agUiSseEventTypes.stepStarted) {
      stepStartedCount += 1;
    }
  }
  return stepStartedCount;
}
479
+
480
/**
 * Creates the helpers used to define and execute live eval cases against the
 * configured endpoint.
 *
 * @param config Runner settings (endpoint, auth, timeouts, optional overrides).
 * @returns
 * - `runEval`: runs one case end to end and always resolves to a result record
 *   for request, streaming and verification errors.
 * - `verifyFileExists`: checks a project file through `config.readProjectFile`.
 * - `withJudge` / `judgeLlm`: LLM-judge helpers bound to the same config.
 */
export function createLiveEvalCaseSupport(config: LiveEvalRunnerConfig): {
  runEval: (testCase: LiveEvalCase, runtime: LiveEvalRuntime) => Promise<LiveEvalResultRecord>;
  verifyFileExists: (input: FileCheckInput) => Promise<string | null>;
  withJudge: (
    structuralVerify: (run: ParsedRun) => string | null,
    judgeInput: LiveEvalJudgeInput,
  ) => (run: ParsedRun) => Promise<string | null>;
  judgeLlm: (input: LiveEvalJudgeRequest) => Promise<LiveEvalJudgeResult>;
} {
  const fetchImpl = resolveFetch(config);
  const log = config.log ?? console.log;
  const { judgeLlm, withJudge } = createLiveEvalJudgeSupport(config);

  /**
   * Checks that a project file exists, is not blank, and contains every
   * required keyword (case-insensitive).
   *
   * @returns A failure description, or `null` when the check passes. Also
   * returns `null` without checking when no project id or file reader is
   * configured.
   */
  async function verifyFileExists(input: FileCheckInput): Promise<string | null> {
    if (!config.projectId || !config.readProjectFile) {
      return null;
    }

    const file = await config.readProjectFile({
      filePath: input.filePath,
      requestTimeoutMs: config.requestTimeoutMs,
    });

    if (!file) {
      return `${
        input.description ?? input.filePath
      }: file not found in project after task completed`;
    }

    if (!file.content || file.content.trim().length === 0) {
      return `${input.description ?? input.filePath}: file exists but is empty`;
    }

    if (input.requiredContent) {
      const missing = input.requiredContent.filter((keyword) =>
        !file.content.toLowerCase().includes(keyword.toLowerCase())
      );
      if (missing.length > 0) {
        return `${input.description ?? input.filePath}: missing required content: ${
          missing.join(", ")
        }. Got: ${file.content.slice(0, 200)}`;
      }
    }

    return null;
  }

  /**
   * Runs a single eval case: prepare, start the optional sidecar, POST the
   * request, parse the SSE stream, verify, then clean up.
   *
   * Cases that require a project are skipped when no project id is configured.
   * Errors thrown while building the body, sending the request, streaming or
   * verifying are reported as a failed result rather than rejecting.
   */
  async function runEval(
    testCase: LiveEvalCase,
    runtime: LiveEvalRuntime,
  ): Promise<LiveEvalResultRecord> {
    const startedAt = Date.now();
    if (testCase.requireProject && !config.projectId) {
      return createSkippedEvalResult({
        id: testCase.id,
        label: testCase.label,
        runtime,
        details: "Skipped because AG_UI_EVAL_PROJECT_ID is not set.",
        startedAt,
      });
    }

    const prepared = testCase.prepare
      ? await testCase.prepare({
        apiUrl: config.apiUrl,
        authToken: config.authToken,
        projectId: config.projectId,
      })
      : null;
    const preparedConversationId = extractPreparedConversationId(prepared);
    const preparedArtifactPaths = collectPreparedArtifactPaths(prepared);
    const resultContext = createLiveEvalResultContext({
      testCase,
      runtime,
      startedAt,
      conversationId: preparedConversationId,
      artifactPaths: preparedArtifactPaths,
    });

    try {
      const sidecarCleanup = prepared?.startSidecar ? await prepared.startSidecar() : undefined;
      const progressReporter = createLiveEvalProgressReporter({
        caseId: testCase.id,
        startedAt,
        intervalMs: config.progressLogIntervalMs,
        log,
      });

      try {
        // Built inside the guarded region: a malformed `customBody` makes
        // JSON.parse throw, and the timer and sidecar are already running.
        const body = buildLiveEvalRunBody({
          config,
          testCase,
          prepared,
          conversationId: preparedConversationId,
        });

        const response = await fetchImpl(config.endpoint, {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer ${config.authToken}`,
          },
          body: JSON.stringify(body),
          signal: AbortSignal.timeout(config.requestTimeoutMs),
        });

        log(`[stream] ${runtime}:${testCase.id} HTTP ${response.status}`);

        const run = await parseSseResponse(response, {
          onProgress: progressReporter.update,
        });
        // `await` is required here: returning the bare promise would let the
        // `finally` blocks (sidecar and prepared cleanup) run while `verify`
        // is still pending, and a rejection from `verify` would skip `catch`.
        return await resolveCompletedLiveEvalRun({
          testCase,
          run,
          prepared,
          context: resultContext,
          runId: extractRunId(run) ?? undefined,
        });
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return createStreamingFailureEvalResult({
          context: resultContext,
          details: message,
          progress: progressReporter.getSnapshot(),
        });
      } finally {
        progressReporter.stop();
        await sidecarCleanup?.();
      }
    } finally {
      await prepared?.cleanup?.();
    }
  }

  return {
    judgeLlm,
    runEval,
    verifyFileExists,
    withJudge,
  };
}
621
+
622
/**
 * Module-private helpers gathered into one exported object so they can be
 * reached from outside this module.
 */
export const liveEvalRunnerInternals = {
  collectPreparedArtifactPaths: collectPreparedArtifactPaths,
  createFailedRunEvalResult: createFailedRunEvalResult,
  createLiveEvalRunArtifacts: createLiveEvalRunArtifacts,
  createPassedRunEvalResult: createPassedRunEvalResult,
  createStreamingFailureEvalResult: createStreamingFailureEvalResult,
  extractRunId: extractRunId,
};
@@ -11,7 +11,7 @@ import type { SchemaValidator } from "../extensions/schema/index.js";
11
11
  import {
12
12
  type NodeTelemetryProvider,
13
13
  NodeTelemetryProviderName,
14
- } from "../extensions/tracing/index.js";
14
+ } from "../extensions/observability/index.js";
15
15
  import {
16
16
  type SandboxShellToolsProvider,
17
17
  SandboxShellToolsProviderName,
@@ -379,7 +379,7 @@ async function ensureDefaultAuthProvider(): Promise<void> {
379
379
  async function ensureDefaultNodeTelemetryProvider(): Promise<void> {
380
380
  if (tryResolve<NodeTelemetryProvider>(NodeTelemetryProviderName)) return;
381
381
  const { OpenTelemetryNodeTelemetryProvider } = await import(
382
- "../../extensions/ext-tracing-opentelemetry/src/index.js"
382
+ "../../extensions/ext-observability-opentelemetry/src/index.js"
383
383
  );
384
384
  register<NodeTelemetryProvider>(
385
385
  NodeTelemetryProviderName,
@@ -1,7 +1,7 @@
1
1
  /**
2
- * Tracing category barrel: tracing and Node telemetry contracts.
2
+ * Observability category barrel: tracing and Node telemetry contracts.
3
3
  *
4
- * @module extensions/tracing
4
+ * @module extensions/observability
5
5
  */
6
6
  import "../../../_dnt.polyfills.js";
7
7
 
@@ -1,9 +1,9 @@
1
1
  /**
2
2
  * Contract interface for Node.js OpenTelemetry runtime bootstrap.
3
3
  *
4
- * Default implementation: `@veryfront/ext-tracing-opentelemetry`
4
+ * Default implementation: `@veryfront/ext-observability-opentelemetry`
5
5
  *
6
- * @module extensions/tracing/node-telemetry-provider
6
+ * @module extensions/observability/node-telemetry-provider
7
7
  */
8
8
 
9
9
  export const NodeTelemetryProviderName = "NodeTelemetryProvider";
@@ -1,9 +1,9 @@
1
1
  /**
2
2
  * Contract interface for tracing/telemetry exporters.
3
3
  *
4
- * Default implementation: `@veryfront/ext-tracing-opentelemetry`
4
+ * Default implementation: `@veryfront/ext-observability-opentelemetry`
5
5
  *
6
- * @module extensions/tracing/tracing-exporter
6
+ * @module extensions/observability/tracing-exporter
7
7
  */
8
8
 
9
9
  /**
@@ -14,8 +14,8 @@ const recommendations = new Map<string, string>([
14
14
  ["ContentProcessor", "@veryfront/ext-content-mdx"],
15
15
  ["DocumentExtractor", "@veryfront/ext-document-kreuzberg"],
16
16
  ["AuthProvider", "@veryfront/ext-auth-jwt"],
17
- ["TracingExporter", "@veryfront/ext-tracing-opentelemetry"],
18
- ["NodeTelemetryProvider", "@veryfront/ext-tracing-opentelemetry"],
17
+ ["TracingExporter", "@veryfront/ext-observability-opentelemetry"],
18
+ ["NodeTelemetryProvider", "@veryfront/ext-observability-opentelemetry"],
19
19
  ["LLMProvider:openai", "@veryfront/ext-llm-openai"],
20
20
  ["LLMProvider:anthropic", "@veryfront/ext-llm-anthropic"],
21
21
  ["LLMProvider:google", "@veryfront/ext-llm-google"],
@@ -80,7 +80,7 @@ export class MetricsManager {
80
80
  }
81
81
 
82
82
  try {
83
- // The metrics API is injected by ext-tracing-opentelemetry via setGlobalMetricsAPI().
83
+ // The metrics API is injected by ext-observability-opentelemetry via setGlobalMetricsAPI().
84
84
  // When the extension is not active, metrics collection is disabled.
85
85
  const metricsApi = getGlobalMetricsAPI();
86
86
  if (!metricsApi) {