@wix/evalforge-types 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +414 -378
- package/build/index.js.map +4 -4
- package/build/index.mjs +407 -378
- package/build/index.mjs.map +4 -4
- package/build/types/agent/adapter.d.ts +6 -3
- package/build/types/common/mcp.d.ts +42 -10
- package/build/types/evaluation/eval-run.d.ts +4 -0
- package/build/types/target/index.d.ts +1 -0
- package/build/types/target/sub-agent.d.ts +34 -0
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -14,18 +14,21 @@ var TenantEntitySchema = BaseEntitySchema.extend({
|
|
|
14
14
|
|
|
15
15
|
// src/common/mcp.ts
|
|
16
16
|
import { z as z2 } from "zod";
|
|
17
|
-
var
|
|
18
|
-
|
|
19
|
-
name
|
|
20
|
-
|
|
21
|
-
command
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
17
|
+
var MCP_SERVERS_JSON_KEY = "mcpServers";
|
|
18
|
+
var MCPEntitySchema = TenantEntitySchema.extend({
|
|
19
|
+
/** Display name and key in mcp.json mcpServers object */
|
|
20
|
+
name: z2.string().min(1),
|
|
21
|
+
/** MCP server config (command/args, url/headers, etc.) - stored as-is for mcp.json */
|
|
22
|
+
config: z2.record(z2.string(), z2.unknown())
|
|
23
|
+
});
|
|
24
|
+
var CreateMcpInputSchema = MCPEntitySchema.omit({
|
|
25
|
+
id: true,
|
|
26
|
+
createdAt: true,
|
|
27
|
+
updatedAt: true,
|
|
28
|
+
deleted: true
|
|
28
29
|
});
|
|
30
|
+
var UpdateMcpInputSchema = CreateMcpInputSchema.partial();
|
|
31
|
+
var MCPServerConfigSchema = z2.record(z2.string(), z2.unknown());
|
|
29
32
|
|
|
30
33
|
// src/common/models.ts
|
|
31
34
|
import { z as z3 } from "zod";
|
|
@@ -205,11 +208,26 @@ var CreateSkillsGroupInputSchema = SkillsGroupSchema.omit({
|
|
|
205
208
|
});
|
|
206
209
|
var UpdateSkillsGroupInputSchema = CreateSkillsGroupInputSchema.partial();
|
|
207
210
|
|
|
211
|
+
// src/target/sub-agent.ts
|
|
212
|
+
import { z as z7 } from "zod";
|
|
213
|
+
var SubAgentSchema = TargetSchema.extend({
|
|
214
|
+
/** The full sub-agent markdown content (YAML frontmatter + body) */
|
|
215
|
+
subAgentMd: z7.string()
|
|
216
|
+
});
|
|
217
|
+
var SubAgentInputBaseSchema = SubAgentSchema.omit({
|
|
218
|
+
id: true,
|
|
219
|
+
createdAt: true,
|
|
220
|
+
updatedAt: true,
|
|
221
|
+
deleted: true
|
|
222
|
+
});
|
|
223
|
+
var CreateSubAgentInputSchema = SubAgentInputBaseSchema;
|
|
224
|
+
var UpdateSubAgentInputSchema = SubAgentInputBaseSchema.partial();
|
|
225
|
+
|
|
208
226
|
// src/test/index.ts
|
|
209
|
-
import { z as
|
|
227
|
+
import { z as z18 } from "zod";
|
|
210
228
|
|
|
211
229
|
// src/test/base.ts
|
|
212
|
-
import { z as
|
|
230
|
+
import { z as z8 } from "zod";
|
|
213
231
|
var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
214
232
|
TestType2["LLM"] = "LLM";
|
|
215
233
|
TestType2["TOOL"] = "TOOL";
|
|
@@ -222,7 +240,7 @@ var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
|
222
240
|
TestType2["PLAYWRIGHT_NL"] = "PLAYWRIGHT_NL";
|
|
223
241
|
return TestType2;
|
|
224
242
|
})(TestType || {});
|
|
225
|
-
var TestTypeSchema =
|
|
243
|
+
var TestTypeSchema = z8.enum(TestType);
|
|
226
244
|
var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
227
245
|
TestImportance2["LOW"] = "low";
|
|
228
246
|
TestImportance2["MEDIUM"] = "medium";
|
|
@@ -230,153 +248,153 @@ var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
|
230
248
|
TestImportance2["CRITICAL"] = "critical";
|
|
231
249
|
return TestImportance2;
|
|
232
250
|
})(TestImportance || {});
|
|
233
|
-
var TestImportanceSchema =
|
|
234
|
-
var BaseTestSchema =
|
|
235
|
-
id:
|
|
251
|
+
var TestImportanceSchema = z8.enum(TestImportance);
|
|
252
|
+
var BaseTestSchema = z8.object({
|
|
253
|
+
id: z8.string(),
|
|
236
254
|
type: TestTypeSchema,
|
|
237
|
-
name:
|
|
238
|
-
description:
|
|
255
|
+
name: z8.string().min(3),
|
|
256
|
+
description: z8.string().optional(),
|
|
239
257
|
importance: TestImportanceSchema.optional()
|
|
240
258
|
});
|
|
241
259
|
|
|
242
260
|
// src/test/llm.ts
|
|
243
|
-
import { z as
|
|
261
|
+
import { z as z9 } from "zod";
|
|
244
262
|
var LLMTestSchema = BaseTestSchema.extend({
|
|
245
|
-
type:
|
|
263
|
+
type: z9.literal("LLM" /* LLM */),
|
|
246
264
|
/** Maximum steps for the LLM to take */
|
|
247
|
-
maxSteps:
|
|
265
|
+
maxSteps: z9.number().min(1).max(100),
|
|
248
266
|
/** Prompt to send to the evaluator */
|
|
249
|
-
prompt:
|
|
267
|
+
prompt: z9.string().min(1),
|
|
250
268
|
/** ID of the evaluator agent to use */
|
|
251
|
-
evaluatorId:
|
|
269
|
+
evaluatorId: z9.string()
|
|
252
270
|
});
|
|
253
271
|
|
|
254
272
|
// src/test/tool.ts
|
|
255
|
-
import { z as
|
|
273
|
+
import { z as z10 } from "zod";
|
|
256
274
|
var ToolTestSchema = BaseTestSchema.extend({
|
|
257
|
-
type:
|
|
275
|
+
type: z10.literal("TOOL" /* TOOL */),
|
|
258
276
|
/** Name of the tool that should be called */
|
|
259
|
-
toolName:
|
|
277
|
+
toolName: z10.string().min(3),
|
|
260
278
|
/** Expected arguments for the tool call */
|
|
261
|
-
args:
|
|
279
|
+
args: z10.record(z10.string(), z10.any()),
|
|
262
280
|
/** Expected content in the tool results */
|
|
263
|
-
resultsContent:
|
|
281
|
+
resultsContent: z10.string()
|
|
264
282
|
});
|
|
265
283
|
|
|
266
284
|
// src/test/site-config.ts
|
|
267
|
-
import { z as
|
|
285
|
+
import { z as z11 } from "zod";
|
|
268
286
|
var SiteConfigTestSchema = BaseTestSchema.extend({
|
|
269
|
-
type:
|
|
287
|
+
type: z11.literal("SITE_CONFIG" /* SITE_CONFIG */),
|
|
270
288
|
/** URL to call */
|
|
271
|
-
url:
|
|
289
|
+
url: z11.string().url(),
|
|
272
290
|
/** HTTP method */
|
|
273
|
-
method:
|
|
291
|
+
method: z11.enum(["GET", "POST"]),
|
|
274
292
|
/** Request body (for POST) */
|
|
275
|
-
body:
|
|
293
|
+
body: z11.string().optional(),
|
|
276
294
|
/** Expected HTTP status code */
|
|
277
|
-
expectedStatusCode:
|
|
295
|
+
expectedStatusCode: z11.number().int().min(100).max(599),
|
|
278
296
|
/** Expected response content */
|
|
279
|
-
expectedResponse:
|
|
297
|
+
expectedResponse: z11.string().optional(),
|
|
280
298
|
/** JMESPath expression to extract from response */
|
|
281
|
-
expectedResponseJMESPath:
|
|
299
|
+
expectedResponseJMESPath: z11.string().optional()
|
|
282
300
|
});
|
|
283
301
|
|
|
284
302
|
// src/test/command-execution.ts
|
|
285
|
-
import { z as
|
|
303
|
+
import { z as z12 } from "zod";
|
|
286
304
|
var AllowedCommands = [
|
|
287
305
|
"yarn install --no-immutable && yarn build",
|
|
288
306
|
"npm run build",
|
|
289
307
|
"yarn typecheck"
|
|
290
308
|
];
|
|
291
309
|
var CommandExecutionTestSchema = BaseTestSchema.extend({
|
|
292
|
-
type:
|
|
310
|
+
type: z12.literal("COMMAND_EXECUTION" /* COMMAND_EXECUTION */),
|
|
293
311
|
/** Command to execute (must be in AllowedCommands) */
|
|
294
|
-
command:
|
|
312
|
+
command: z12.string().refine((value) => AllowedCommands.includes(value), {
|
|
295
313
|
message: `Command must be one of: ${AllowedCommands.join(", ")}`
|
|
296
314
|
}),
|
|
297
315
|
/** Expected exit code (default: 0) */
|
|
298
|
-
expectedExitCode:
|
|
316
|
+
expectedExitCode: z12.number().default(0).optional()
|
|
299
317
|
});
|
|
300
318
|
|
|
301
319
|
// src/test/file-presence.ts
|
|
302
|
-
import { z as
|
|
320
|
+
import { z as z13 } from "zod";
|
|
303
321
|
var FilePresenceTestSchema = BaseTestSchema.extend({
|
|
304
|
-
type:
|
|
322
|
+
type: z13.literal("FILE_PRESENCE" /* FILE_PRESENCE */),
|
|
305
323
|
/** Paths to check */
|
|
306
|
-
paths:
|
|
324
|
+
paths: z13.array(z13.string()),
|
|
307
325
|
/** Whether files should exist (true) or not exist (false) */
|
|
308
|
-
shouldExist:
|
|
326
|
+
shouldExist: z13.boolean()
|
|
309
327
|
});
|
|
310
328
|
|
|
311
329
|
// src/test/file-content.ts
|
|
312
|
-
import { z as
|
|
313
|
-
var FileContentCheckSchema =
|
|
330
|
+
import { z as z14 } from "zod";
|
|
331
|
+
var FileContentCheckSchema = z14.object({
|
|
314
332
|
/** Strings that must be present in the file */
|
|
315
|
-
contains:
|
|
333
|
+
contains: z14.array(z14.string()).optional(),
|
|
316
334
|
/** Strings that must NOT be present in the file */
|
|
317
|
-
notContains:
|
|
335
|
+
notContains: z14.array(z14.string()).optional(),
|
|
318
336
|
/** Regex pattern the content must match */
|
|
319
|
-
matches:
|
|
337
|
+
matches: z14.string().optional(),
|
|
320
338
|
/** JSON path checks for structured content */
|
|
321
|
-
jsonPath:
|
|
322
|
-
|
|
323
|
-
path:
|
|
324
|
-
value:
|
|
339
|
+
jsonPath: z14.array(
|
|
340
|
+
z14.object({
|
|
341
|
+
path: z14.string(),
|
|
342
|
+
value: z14.unknown()
|
|
325
343
|
})
|
|
326
344
|
).optional(),
|
|
327
345
|
/** Lines that should be added (for diff checking) */
|
|
328
|
-
added:
|
|
346
|
+
added: z14.array(z14.string()).optional(),
|
|
329
347
|
/** Lines that should be removed (for diff checking) */
|
|
330
|
-
removed:
|
|
348
|
+
removed: z14.array(z14.string()).optional()
|
|
331
349
|
});
|
|
332
350
|
var FileContentTestSchema = BaseTestSchema.extend({
|
|
333
|
-
type:
|
|
351
|
+
type: z14.literal("FILE_CONTENT" /* FILE_CONTENT */),
|
|
334
352
|
/** Path to the file to check */
|
|
335
|
-
path:
|
|
353
|
+
path: z14.string(),
|
|
336
354
|
/** Content checks to perform */
|
|
337
355
|
checks: FileContentCheckSchema
|
|
338
356
|
});
|
|
339
357
|
|
|
340
358
|
// src/test/build-check.ts
|
|
341
|
-
import { z as
|
|
359
|
+
import { z as z15 } from "zod";
|
|
342
360
|
var BuildCheckTestSchema = BaseTestSchema.extend({
|
|
343
|
-
type:
|
|
361
|
+
type: z15.literal("BUILD_CHECK" /* BUILD_CHECK */),
|
|
344
362
|
/** Build command to execute */
|
|
345
|
-
command:
|
|
363
|
+
command: z15.string(),
|
|
346
364
|
/** Whether the build should succeed */
|
|
347
|
-
expectSuccess:
|
|
365
|
+
expectSuccess: z15.boolean(),
|
|
348
366
|
/** Maximum allowed warnings (optional) */
|
|
349
|
-
allowedWarnings:
|
|
367
|
+
allowedWarnings: z15.number().optional(),
|
|
350
368
|
/** Timeout in milliseconds */
|
|
351
|
-
timeout:
|
|
369
|
+
timeout: z15.number().optional()
|
|
352
370
|
});
|
|
353
371
|
|
|
354
372
|
// src/test/vitest.ts
|
|
355
|
-
import { z as
|
|
373
|
+
import { z as z16 } from "zod";
|
|
356
374
|
var VitestTestSchema = BaseTestSchema.extend({
|
|
357
|
-
type:
|
|
375
|
+
type: z16.literal("VITEST" /* VITEST */),
|
|
358
376
|
/** Test file content */
|
|
359
|
-
testFile:
|
|
377
|
+
testFile: z16.string(),
|
|
360
378
|
/** Name of the test file */
|
|
361
|
-
testFileName:
|
|
379
|
+
testFileName: z16.string(),
|
|
362
380
|
/** Minimum pass rate required (0-100) */
|
|
363
|
-
minPassRate:
|
|
381
|
+
minPassRate: z16.number().min(0).max(100)
|
|
364
382
|
});
|
|
365
383
|
|
|
366
384
|
// src/test/playwright-nl.ts
|
|
367
|
-
import { z as
|
|
385
|
+
import { z as z17 } from "zod";
|
|
368
386
|
var PlaywrightNLTestSchema = BaseTestSchema.extend({
|
|
369
|
-
type:
|
|
387
|
+
type: z17.literal("PLAYWRIGHT_NL" /* PLAYWRIGHT_NL */),
|
|
370
388
|
/** Natural language steps to execute */
|
|
371
|
-
steps:
|
|
389
|
+
steps: z17.array(z17.string()),
|
|
372
390
|
/** Expected outcome description */
|
|
373
|
-
expectedOutcome:
|
|
391
|
+
expectedOutcome: z17.string(),
|
|
374
392
|
/** Timeout in milliseconds */
|
|
375
|
-
timeout:
|
|
393
|
+
timeout: z17.number().optional()
|
|
376
394
|
});
|
|
377
395
|
|
|
378
396
|
// src/test/index.ts
|
|
379
|
-
var TestSchema =
|
|
397
|
+
var TestSchema = z18.discriminatedUnion("type", [
|
|
380
398
|
LLMTestSchema,
|
|
381
399
|
ToolTestSchema,
|
|
382
400
|
SiteConfigTestSchema,
|
|
@@ -389,66 +407,66 @@ var TestSchema = z17.discriminatedUnion("type", [
|
|
|
389
407
|
]);
|
|
390
408
|
|
|
391
409
|
// src/scenario/assertions.ts
|
|
392
|
-
import { z as
|
|
393
|
-
var SkillWasCalledAssertionSchema =
|
|
394
|
-
type:
|
|
410
|
+
import { z as z19 } from "zod";
|
|
411
|
+
var SkillWasCalledAssertionSchema = z19.object({
|
|
412
|
+
type: z19.literal("skill_was_called"),
|
|
395
413
|
/** Name of the skill that must have been called (matched against trace Skill tool args) */
|
|
396
|
-
skillName:
|
|
414
|
+
skillName: z19.string()
|
|
397
415
|
});
|
|
398
|
-
var BuildPassedAssertionSchema =
|
|
399
|
-
type:
|
|
416
|
+
var BuildPassedAssertionSchema = z19.object({
|
|
417
|
+
type: z19.literal("build_passed"),
|
|
400
418
|
/** Command to run (default: "yarn build") */
|
|
401
|
-
command:
|
|
419
|
+
command: z19.string().optional(),
|
|
402
420
|
/** Expected exit code (default: 0) */
|
|
403
|
-
expectedExitCode:
|
|
421
|
+
expectedExitCode: z19.number().int().optional()
|
|
404
422
|
});
|
|
405
|
-
var LlmJudgeAssertionSchema =
|
|
406
|
-
type:
|
|
423
|
+
var LlmJudgeAssertionSchema = z19.object({
|
|
424
|
+
type: z19.literal("llm_judge"),
|
|
407
425
|
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
408
|
-
prompt:
|
|
426
|
+
prompt: z19.string(),
|
|
409
427
|
/** Optional system prompt for the judge (default asks for JSON with score) */
|
|
410
|
-
systemPrompt:
|
|
428
|
+
systemPrompt: z19.string().optional(),
|
|
411
429
|
/** Minimum score to pass (0-100, default 70) */
|
|
412
|
-
minScore:
|
|
430
|
+
minScore: z19.number().int().min(0).max(100).optional(),
|
|
413
431
|
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
414
|
-
model:
|
|
415
|
-
maxTokens:
|
|
416
|
-
temperature:
|
|
432
|
+
model: z19.string().optional(),
|
|
433
|
+
maxTokens: z19.number().int().optional(),
|
|
434
|
+
temperature: z19.number().min(0).max(1).optional()
|
|
417
435
|
});
|
|
418
|
-
var AssertionSchema =
|
|
436
|
+
var AssertionSchema = z19.union([
|
|
419
437
|
SkillWasCalledAssertionSchema,
|
|
420
438
|
BuildPassedAssertionSchema,
|
|
421
439
|
LlmJudgeAssertionSchema
|
|
422
440
|
]);
|
|
423
441
|
|
|
424
442
|
// src/scenario/environment.ts
|
|
425
|
-
import { z as
|
|
426
|
-
var LocalProjectConfigSchema =
|
|
443
|
+
import { z as z20 } from "zod";
|
|
444
|
+
var LocalProjectConfigSchema = z20.object({
|
|
427
445
|
/** Template ID to use for the local project */
|
|
428
|
-
templateId:
|
|
446
|
+
templateId: z20.string().optional(),
|
|
429
447
|
/** Files to create in the project */
|
|
430
|
-
files:
|
|
431
|
-
|
|
432
|
-
path:
|
|
433
|
-
content:
|
|
448
|
+
files: z20.array(
|
|
449
|
+
z20.object({
|
|
450
|
+
path: z20.string().min(1),
|
|
451
|
+
content: z20.string().min(1)
|
|
434
452
|
})
|
|
435
453
|
).optional()
|
|
436
454
|
});
|
|
437
|
-
var MetaSiteConfigSchema =
|
|
438
|
-
configurations:
|
|
439
|
-
|
|
440
|
-
name:
|
|
441
|
-
apiCalls:
|
|
442
|
-
|
|
443
|
-
url:
|
|
444
|
-
method:
|
|
445
|
-
body:
|
|
455
|
+
var MetaSiteConfigSchema = z20.object({
|
|
456
|
+
configurations: z20.array(
|
|
457
|
+
z20.object({
|
|
458
|
+
name: z20.string().min(1),
|
|
459
|
+
apiCalls: z20.array(
|
|
460
|
+
z20.object({
|
|
461
|
+
url: z20.string().url(),
|
|
462
|
+
method: z20.enum(["POST", "PUT"]),
|
|
463
|
+
body: z20.string()
|
|
446
464
|
})
|
|
447
465
|
)
|
|
448
466
|
})
|
|
449
467
|
).optional()
|
|
450
468
|
});
|
|
451
|
-
var EnvironmentSchema =
|
|
469
|
+
var EnvironmentSchema = z20.object({
|
|
452
470
|
/** Local project configuration */
|
|
453
471
|
localProject: LocalProjectConfigSchema.optional(),
|
|
454
472
|
/** Meta site configuration */
|
|
@@ -456,54 +474,54 @@ var EnvironmentSchema = z19.object({
|
|
|
456
474
|
});
|
|
457
475
|
|
|
458
476
|
// src/scenario/test-scenario.ts
|
|
459
|
-
import { z as
|
|
477
|
+
import { z as z22 } from "zod";
|
|
460
478
|
|
|
461
479
|
// src/assertion/assertion.ts
|
|
462
|
-
import { z as
|
|
463
|
-
var AssertionTypeSchema =
|
|
480
|
+
import { z as z21 } from "zod";
|
|
481
|
+
var AssertionTypeSchema = z21.enum([
|
|
464
482
|
"skill_was_called",
|
|
465
483
|
"build_passed",
|
|
466
484
|
"llm_judge"
|
|
467
485
|
]);
|
|
468
|
-
var AssertionParameterTypeSchema =
|
|
486
|
+
var AssertionParameterTypeSchema = z21.enum([
|
|
469
487
|
"string",
|
|
470
488
|
"number",
|
|
471
489
|
"boolean"
|
|
472
490
|
]);
|
|
473
|
-
var AssertionParameterSchema =
|
|
491
|
+
var AssertionParameterSchema = z21.object({
|
|
474
492
|
/** Parameter name (used as key in params object) */
|
|
475
|
-
name:
|
|
493
|
+
name: z21.string().min(1),
|
|
476
494
|
/** Display label for the parameter */
|
|
477
|
-
label:
|
|
495
|
+
label: z21.string().min(1),
|
|
478
496
|
/** Parameter type */
|
|
479
497
|
type: AssertionParameterTypeSchema,
|
|
480
498
|
/** Whether this parameter is required */
|
|
481
|
-
required:
|
|
499
|
+
required: z21.boolean(),
|
|
482
500
|
/** Default value (optional, used when not provided) */
|
|
483
|
-
defaultValue:
|
|
501
|
+
defaultValue: z21.union([z21.string(), z21.number(), z21.boolean()]).optional(),
|
|
484
502
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
485
|
-
advanced:
|
|
503
|
+
advanced: z21.boolean().optional()
|
|
486
504
|
});
|
|
487
|
-
var ScenarioAssertionLinkSchema =
|
|
505
|
+
var ScenarioAssertionLinkSchema = z21.object({
|
|
488
506
|
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
489
|
-
assertionId:
|
|
507
|
+
assertionId: z21.string(),
|
|
490
508
|
/** Parameter values for this assertion in this scenario */
|
|
491
|
-
params:
|
|
492
|
-
|
|
493
|
-
|
|
509
|
+
params: z21.record(
|
|
510
|
+
z21.string(),
|
|
511
|
+
z21.union([z21.string(), z21.number(), z21.boolean(), z21.null()])
|
|
494
512
|
).optional()
|
|
495
513
|
});
|
|
496
|
-
var SkillWasCalledConfigSchema =
|
|
514
|
+
var SkillWasCalledConfigSchema = z21.object({
|
|
497
515
|
/** Name of the skill that must have been called */
|
|
498
|
-
skillName:
|
|
516
|
+
skillName: z21.string().min(1)
|
|
499
517
|
});
|
|
500
|
-
var BuildPassedConfigSchema =
|
|
518
|
+
var BuildPassedConfigSchema = z21.strictObject({
|
|
501
519
|
/** Command to run (default: "yarn build") */
|
|
502
|
-
command:
|
|
520
|
+
command: z21.string().optional(),
|
|
503
521
|
/** Expected exit code (default: 0) */
|
|
504
|
-
expectedExitCode:
|
|
522
|
+
expectedExitCode: z21.number().int().optional()
|
|
505
523
|
});
|
|
506
|
-
var LlmJudgeConfigSchema =
|
|
524
|
+
var LlmJudgeConfigSchema = z21.object({
|
|
507
525
|
/**
|
|
508
526
|
* Prompt template with placeholders:
|
|
509
527
|
* - {{output}}: agent's final output
|
|
@@ -514,28 +532,28 @@ var LlmJudgeConfigSchema = z20.object({
|
|
|
514
532
|
* - {{trace}}: step-by-step trace of tool calls
|
|
515
533
|
* - Custom parameters defined in the parameters array
|
|
516
534
|
*/
|
|
517
|
-
prompt:
|
|
535
|
+
prompt: z21.string().min(1),
|
|
518
536
|
/** Optional system prompt for the judge */
|
|
519
|
-
systemPrompt:
|
|
537
|
+
systemPrompt: z21.string().optional(),
|
|
520
538
|
/** Minimum score to pass (0-100, default 70) */
|
|
521
|
-
minScore:
|
|
539
|
+
minScore: z21.number().int().min(0).max(100).optional(),
|
|
522
540
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
523
|
-
model:
|
|
541
|
+
model: z21.string().optional(),
|
|
524
542
|
/** Max output tokens */
|
|
525
|
-
maxTokens:
|
|
543
|
+
maxTokens: z21.number().int().optional(),
|
|
526
544
|
/** Temperature (0-1) */
|
|
527
|
-
temperature:
|
|
545
|
+
temperature: z21.number().min(0).max(1).optional(),
|
|
528
546
|
/** User-defined parameters for this assertion */
|
|
529
|
-
parameters:
|
|
547
|
+
parameters: z21.array(AssertionParameterSchema).optional()
|
|
530
548
|
});
|
|
531
|
-
var AssertionConfigSchema =
|
|
549
|
+
var AssertionConfigSchema = z21.union([
|
|
532
550
|
LlmJudgeConfigSchema,
|
|
533
551
|
// requires prompt - check first
|
|
534
552
|
SkillWasCalledConfigSchema,
|
|
535
553
|
// requires skillName
|
|
536
554
|
BuildPassedConfigSchema,
|
|
537
555
|
// all optional, uses strictObject to reject unknown keys
|
|
538
|
-
|
|
556
|
+
z21.object({})
|
|
539
557
|
// fallback empty config
|
|
540
558
|
]);
|
|
541
559
|
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
@@ -580,23 +598,23 @@ function getLlmJudgeConfig(assertion) {
|
|
|
580
598
|
}
|
|
581
599
|
|
|
582
600
|
// src/scenario/test-scenario.ts
|
|
583
|
-
var ExpectedFileSchema =
|
|
601
|
+
var ExpectedFileSchema = z22.object({
|
|
584
602
|
/** Relative path where the file should be created */
|
|
585
|
-
path:
|
|
603
|
+
path: z22.string(),
|
|
586
604
|
/** Optional expected content */
|
|
587
|
-
content:
|
|
605
|
+
content: z22.string().optional()
|
|
588
606
|
});
|
|
589
607
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
590
608
|
/** The prompt sent to the agent to trigger the task */
|
|
591
|
-
triggerPrompt:
|
|
609
|
+
triggerPrompt: z22.string().min(10),
|
|
592
610
|
/** ID of the template to use for this scenario (null = no template) */
|
|
593
|
-
templateId:
|
|
611
|
+
templateId: z22.string().nullish(),
|
|
594
612
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
595
|
-
assertions:
|
|
613
|
+
assertions: z22.array(AssertionSchema).optional(),
|
|
596
614
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
597
|
-
assertionIds:
|
|
615
|
+
assertionIds: z22.array(z22.string()).optional(),
|
|
598
616
|
/** Linked assertions with per-scenario parameter values */
|
|
599
|
-
assertionLinks:
|
|
617
|
+
assertionLinks: z22.array(ScenarioAssertionLinkSchema).optional()
|
|
600
618
|
});
|
|
601
619
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
602
620
|
id: true,
|
|
@@ -607,10 +625,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
607
625
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
608
626
|
|
|
609
627
|
// src/suite/test-suite.ts
|
|
610
|
-
import { z as
|
|
628
|
+
import { z as z23 } from "zod";
|
|
611
629
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
612
630
|
/** IDs of test scenarios in this suite */
|
|
613
|
-
scenarioIds:
|
|
631
|
+
scenarioIds: z23.array(z23.string())
|
|
614
632
|
});
|
|
615
633
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
616
634
|
id: true,
|
|
@@ -621,21 +639,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
621
639
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
622
640
|
|
|
623
641
|
// src/evaluation/metrics.ts
|
|
624
|
-
import { z as
|
|
625
|
-
var TokenUsageSchema =
|
|
626
|
-
prompt:
|
|
627
|
-
completion:
|
|
628
|
-
total:
|
|
629
|
-
});
|
|
630
|
-
var EvalMetricsSchema =
|
|
631
|
-
totalAssertions:
|
|
632
|
-
passed:
|
|
633
|
-
failed:
|
|
634
|
-
skipped:
|
|
635
|
-
errors:
|
|
636
|
-
passRate:
|
|
637
|
-
avgDuration:
|
|
638
|
-
totalDuration:
|
|
642
|
+
import { z as z24 } from "zod";
|
|
643
|
+
var TokenUsageSchema = z24.object({
|
|
644
|
+
prompt: z24.number(),
|
|
645
|
+
completion: z24.number(),
|
|
646
|
+
total: z24.number()
|
|
647
|
+
});
|
|
648
|
+
var EvalMetricsSchema = z24.object({
|
|
649
|
+
totalAssertions: z24.number(),
|
|
650
|
+
passed: z24.number(),
|
|
651
|
+
failed: z24.number(),
|
|
652
|
+
skipped: z24.number(),
|
|
653
|
+
errors: z24.number(),
|
|
654
|
+
passRate: z24.number(),
|
|
655
|
+
avgDuration: z24.number(),
|
|
656
|
+
totalDuration: z24.number()
|
|
639
657
|
});
|
|
640
658
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
641
659
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -645,7 +663,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
645
663
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
646
664
|
return EvalStatus2;
|
|
647
665
|
})(EvalStatus || {});
|
|
648
|
-
var EvalStatusSchema =
|
|
666
|
+
var EvalStatusSchema = z24.enum(EvalStatus);
|
|
649
667
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
650
668
|
LLMStepType2["COMPLETION"] = "completion";
|
|
651
669
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -653,52 +671,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
653
671
|
LLMStepType2["THINKING"] = "thinking";
|
|
654
672
|
return LLMStepType2;
|
|
655
673
|
})(LLMStepType || {});
|
|
656
|
-
var LLMTraceStepSchema =
|
|
657
|
-
id:
|
|
658
|
-
stepNumber:
|
|
659
|
-
type:
|
|
660
|
-
model:
|
|
661
|
-
provider:
|
|
662
|
-
startedAt:
|
|
663
|
-
durationMs:
|
|
674
|
+
var LLMTraceStepSchema = z24.object({
|
|
675
|
+
id: z24.string(),
|
|
676
|
+
stepNumber: z24.number(),
|
|
677
|
+
type: z24.enum(LLMStepType),
|
|
678
|
+
model: z24.string(),
|
|
679
|
+
provider: z24.string(),
|
|
680
|
+
startedAt: z24.string(),
|
|
681
|
+
durationMs: z24.number(),
|
|
664
682
|
tokenUsage: TokenUsageSchema,
|
|
665
|
-
costUsd:
|
|
666
|
-
toolName:
|
|
667
|
-
toolArguments:
|
|
668
|
-
inputPreview:
|
|
669
|
-
outputPreview:
|
|
670
|
-
success:
|
|
671
|
-
error:
|
|
672
|
-
});
|
|
673
|
-
var LLMBreakdownStatsSchema =
|
|
674
|
-
count:
|
|
675
|
-
durationMs:
|
|
676
|
-
tokens:
|
|
677
|
-
costUsd:
|
|
678
|
-
});
|
|
679
|
-
var LLMTraceSummarySchema =
|
|
680
|
-
totalSteps:
|
|
681
|
-
totalDurationMs:
|
|
683
|
+
costUsd: z24.number(),
|
|
684
|
+
toolName: z24.string().optional(),
|
|
685
|
+
toolArguments: z24.string().optional(),
|
|
686
|
+
inputPreview: z24.string().optional(),
|
|
687
|
+
outputPreview: z24.string().optional(),
|
|
688
|
+
success: z24.boolean(),
|
|
689
|
+
error: z24.string().optional()
|
|
690
|
+
});
|
|
691
|
+
var LLMBreakdownStatsSchema = z24.object({
|
|
692
|
+
count: z24.number(),
|
|
693
|
+
durationMs: z24.number(),
|
|
694
|
+
tokens: z24.number(),
|
|
695
|
+
costUsd: z24.number()
|
|
696
|
+
});
|
|
697
|
+
var LLMTraceSummarySchema = z24.object({
|
|
698
|
+
totalSteps: z24.number(),
|
|
699
|
+
totalDurationMs: z24.number(),
|
|
682
700
|
totalTokens: TokenUsageSchema,
|
|
683
|
-
totalCostUsd:
|
|
684
|
-
stepTypeBreakdown:
|
|
685
|
-
modelBreakdown:
|
|
686
|
-
modelsUsed:
|
|
687
|
-
});
|
|
688
|
-
var LLMTraceSchema =
|
|
689
|
-
id:
|
|
690
|
-
steps:
|
|
701
|
+
totalCostUsd: z24.number(),
|
|
702
|
+
stepTypeBreakdown: z24.record(z24.string(), LLMBreakdownStatsSchema).optional(),
|
|
703
|
+
modelBreakdown: z24.record(z24.string(), LLMBreakdownStatsSchema),
|
|
704
|
+
modelsUsed: z24.array(z24.string())
|
|
705
|
+
});
|
|
706
|
+
var LLMTraceSchema = z24.object({
|
|
707
|
+
id: z24.string(),
|
|
708
|
+
steps: z24.array(LLMTraceStepSchema),
|
|
691
709
|
summary: LLMTraceSummarySchema
|
|
692
710
|
});
|
|
693
711
|
|
|
694
712
|
// src/evaluation/eval-result.ts
|
|
695
|
-
import { z as
|
|
713
|
+
import { z as z27 } from "zod";
|
|
696
714
|
|
|
697
715
|
// src/evaluation/eval-run.ts
|
|
698
|
-
import { z as
|
|
716
|
+
import { z as z26 } from "zod";
|
|
699
717
|
|
|
700
718
|
// src/evaluation/live-trace.ts
|
|
701
|
-
import { z as
|
|
719
|
+
import { z as z25 } from "zod";
|
|
702
720
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
703
721
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
704
722
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -712,37 +730,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
712
730
|
LiveTraceEventType2["USER"] = "user";
|
|
713
731
|
return LiveTraceEventType2;
|
|
714
732
|
})(LiveTraceEventType || {});
|
|
715
|
-
var LiveTraceEventSchema =
|
|
733
|
+
var LiveTraceEventSchema = z25.object({
|
|
716
734
|
/** The evaluation run ID */
|
|
717
|
-
evalRunId:
|
|
735
|
+
evalRunId: z25.string(),
|
|
718
736
|
/** The scenario ID being executed */
|
|
719
|
-
scenarioId:
|
|
737
|
+
scenarioId: z25.string(),
|
|
720
738
|
/** The scenario name for display */
|
|
721
|
-
scenarioName:
|
|
739
|
+
scenarioName: z25.string(),
|
|
722
740
|
/** The target ID (skill, agent, etc.) */
|
|
723
|
-
targetId:
|
|
741
|
+
targetId: z25.string(),
|
|
724
742
|
/** The target name for display */
|
|
725
|
-
targetName:
|
|
743
|
+
targetName: z25.string(),
|
|
726
744
|
/** Step number in the current scenario execution */
|
|
727
|
-
stepNumber:
|
|
745
|
+
stepNumber: z25.number(),
|
|
728
746
|
/** Type of trace event */
|
|
729
|
-
type:
|
|
747
|
+
type: z25.enum(LiveTraceEventType),
|
|
730
748
|
/** Tool name if this is a tool_use event */
|
|
731
|
-
toolName:
|
|
749
|
+
toolName: z25.string().optional(),
|
|
732
750
|
/** Tool arguments preview (truncated JSON) */
|
|
733
|
-
toolArgs:
|
|
751
|
+
toolArgs: z25.string().optional(),
|
|
734
752
|
/** Output preview (truncated text) */
|
|
735
|
-
outputPreview:
|
|
753
|
+
outputPreview: z25.string().optional(),
|
|
736
754
|
/** File path for file operations */
|
|
737
|
-
filePath:
|
|
755
|
+
filePath: z25.string().optional(),
|
|
738
756
|
/** Elapsed time in milliseconds for progress events */
|
|
739
|
-
elapsedMs:
|
|
757
|
+
elapsedMs: z25.number().optional(),
|
|
740
758
|
/** Thinking/reasoning text from Claude */
|
|
741
|
-
thinking:
|
|
759
|
+
thinking: z25.string().optional(),
|
|
742
760
|
/** Timestamp when this event occurred */
|
|
743
|
-
timestamp:
|
|
761
|
+
timestamp: z25.string(),
|
|
744
762
|
/** Whether this is the final event for this scenario */
|
|
745
|
-
isComplete:
|
|
763
|
+
isComplete: z25.boolean()
|
|
746
764
|
});
|
|
747
765
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
748
766
|
function parseTraceEventLine(line) {
|
|
@@ -770,14 +788,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
770
788
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
771
789
|
return TriggerType2;
|
|
772
790
|
})(TriggerType || {});
|
|
773
|
-
var TriggerMetadataSchema =
|
|
774
|
-
version:
|
|
775
|
-
resourceUpdated:
|
|
791
|
+
var TriggerMetadataSchema = z26.object({
|
|
792
|
+
version: z26.string().optional(),
|
|
793
|
+
resourceUpdated: z26.array(z26.string()).optional()
|
|
776
794
|
});
|
|
777
|
-
var TriggerSchema =
|
|
778
|
-
id:
|
|
795
|
+
var TriggerSchema = z26.object({
|
|
796
|
+
id: z26.string(),
|
|
779
797
|
metadata: TriggerMetadataSchema.optional(),
|
|
780
|
-
type:
|
|
798
|
+
type: z26.enum(TriggerType)
|
|
781
799
|
});
|
|
782
800
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
783
801
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -795,28 +813,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
795
813
|
FailureSeverity2["LOW"] = "low";
|
|
796
814
|
return FailureSeverity2;
|
|
797
815
|
})(FailureSeverity || {});
|
|
798
|
-
var DiffLineTypeSchema =
|
|
799
|
-
var DiffLineSchema =
|
|
816
|
+
var DiffLineTypeSchema = z26.enum(["added", "removed", "unchanged"]);
|
|
817
|
+
var DiffLineSchema = z26.object({
|
|
800
818
|
type: DiffLineTypeSchema,
|
|
801
|
-
content:
|
|
802
|
-
lineNumber:
|
|
803
|
-
});
|
|
804
|
-
var DiffContentSchema =
|
|
805
|
-
path:
|
|
806
|
-
expected:
|
|
807
|
-
actual:
|
|
808
|
-
diffLines:
|
|
809
|
-
renamedFrom:
|
|
810
|
-
});
|
|
811
|
-
var CommandExecutionSchema =
|
|
812
|
-
command:
|
|
813
|
-
exitCode:
|
|
814
|
-
output:
|
|
815
|
-
duration:
|
|
816
|
-
});
|
|
817
|
-
var FileModificationSchema =
|
|
818
|
-
path:
|
|
819
|
-
action:
|
|
819
|
+
content: z26.string(),
|
|
820
|
+
lineNumber: z26.number()
|
|
821
|
+
});
|
|
822
|
+
var DiffContentSchema = z26.object({
|
|
823
|
+
path: z26.string(),
|
|
824
|
+
expected: z26.string(),
|
|
825
|
+
actual: z26.string(),
|
|
826
|
+
diffLines: z26.array(DiffLineSchema),
|
|
827
|
+
renamedFrom: z26.string().optional()
|
|
828
|
+
});
|
|
829
|
+
var CommandExecutionSchema = z26.object({
|
|
830
|
+
command: z26.string(),
|
|
831
|
+
exitCode: z26.number(),
|
|
832
|
+
output: z26.string().optional(),
|
|
833
|
+
duration: z26.number()
|
|
834
|
+
});
|
|
835
|
+
var FileModificationSchema = z26.object({
|
|
836
|
+
path: z26.string(),
|
|
837
|
+
action: z26.enum(["created", "modified", "deleted"])
|
|
820
838
|
});
|
|
821
839
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
822
840
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -824,75 +842,79 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
824
842
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
825
843
|
return TemplateFileStatus2;
|
|
826
844
|
})(TemplateFileStatus || {});
|
|
827
|
-
var TemplateFileSchema =
|
|
845
|
+
var TemplateFileSchema = z26.object({
|
|
828
846
|
/** Relative path within the template */
|
|
829
|
-
path:
|
|
847
|
+
path: z26.string(),
|
|
830
848
|
/** Full file content after execution */
|
|
831
|
-
content:
|
|
849
|
+
content: z26.string(),
|
|
832
850
|
/** File status (new, modified, unchanged) */
|
|
833
|
-
status:
|
|
834
|
-
});
|
|
835
|
-
var ApiCallSchema =
|
|
836
|
-
endpoint:
|
|
837
|
-
tokensUsed:
|
|
838
|
-
duration:
|
|
839
|
-
});
|
|
840
|
-
var ExecutionTraceSchema =
|
|
841
|
-
commands:
|
|
842
|
-
filesModified:
|
|
843
|
-
apiCalls:
|
|
844
|
-
totalDuration:
|
|
845
|
-
});
|
|
846
|
-
var FailureAnalysisSchema =
|
|
847
|
-
category:
|
|
848
|
-
severity:
|
|
849
|
-
summary:
|
|
850
|
-
details:
|
|
851
|
-
rootCause:
|
|
852
|
-
suggestedFix:
|
|
853
|
-
relatedAssertions:
|
|
854
|
-
codeSnippet:
|
|
855
|
-
similarIssues:
|
|
856
|
-
patternId:
|
|
851
|
+
status: z26.enum(["new", "modified", "unchanged"])
|
|
852
|
+
});
|
|
853
|
+
var ApiCallSchema = z26.object({
|
|
854
|
+
endpoint: z26.string(),
|
|
855
|
+
tokensUsed: z26.number(),
|
|
856
|
+
duration: z26.number()
|
|
857
|
+
});
|
|
858
|
+
var ExecutionTraceSchema = z26.object({
|
|
859
|
+
commands: z26.array(CommandExecutionSchema),
|
|
860
|
+
filesModified: z26.array(FileModificationSchema),
|
|
861
|
+
apiCalls: z26.array(ApiCallSchema),
|
|
862
|
+
totalDuration: z26.number()
|
|
863
|
+
});
|
|
864
|
+
var FailureAnalysisSchema = z26.object({
|
|
865
|
+
category: z26.enum(FailureCategory),
|
|
866
|
+
severity: z26.enum(FailureSeverity),
|
|
867
|
+
summary: z26.string(),
|
|
868
|
+
details: z26.string(),
|
|
869
|
+
rootCause: z26.string(),
|
|
870
|
+
suggestedFix: z26.string(),
|
|
871
|
+
relatedAssertions: z26.array(z26.string()),
|
|
872
|
+
codeSnippet: z26.string().optional(),
|
|
873
|
+
similarIssues: z26.array(z26.string()).optional(),
|
|
874
|
+
patternId: z26.string().optional(),
|
|
857
875
|
// Extended fields for detailed debugging
|
|
858
876
|
diff: DiffContentSchema.optional(),
|
|
859
877
|
executionTrace: ExecutionTraceSchema.optional()
|
|
860
878
|
});
|
|
861
879
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
862
880
|
/** Agent ID for this run */
|
|
863
|
-
agentId:
|
|
881
|
+
agentId: z26.string().optional(),
|
|
864
882
|
/** Skills group ID for this run */
|
|
865
|
-
skillsGroupId:
|
|
883
|
+
skillsGroupId: z26.string().optional(),
|
|
866
884
|
/** Scenario IDs to run */
|
|
867
|
-
scenarioIds:
|
|
885
|
+
scenarioIds: z26.array(z26.string()),
|
|
868
886
|
/** Current status */
|
|
869
887
|
status: EvalStatusSchema,
|
|
870
888
|
/** Progress percentage (0-100) */
|
|
871
|
-
progress:
|
|
889
|
+
progress: z26.number(),
|
|
872
890
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
873
|
-
results:
|
|
891
|
+
results: z26.array(z26.lazy(() => EvalRunResultSchema)),
|
|
874
892
|
/** Aggregated metrics across all results */
|
|
875
893
|
aggregateMetrics: EvalMetricsSchema,
|
|
876
894
|
/** Failure analyses */
|
|
877
|
-
failureAnalyses:
|
|
895
|
+
failureAnalyses: z26.array(FailureAnalysisSchema).optional(),
|
|
878
896
|
/** Aggregated LLM trace summary */
|
|
879
897
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
880
898
|
/** What triggered this run */
|
|
881
899
|
trigger: TriggerSchema.optional(),
|
|
882
900
|
/** When the run started (set when evaluation is triggered) */
|
|
883
|
-
startedAt:
|
|
901
|
+
startedAt: z26.string().optional(),
|
|
884
902
|
/** When the run completed */
|
|
885
|
-
completedAt:
|
|
903
|
+
completedAt: z26.string().optional(),
|
|
886
904
|
/** Live trace events captured during execution (for playback on results page) */
|
|
887
|
-
liveTraceEvents:
|
|
905
|
+
liveTraceEvents: z26.array(LiveTraceEventSchema).optional(),
|
|
888
906
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
889
|
-
jobId:
|
|
907
|
+
jobId: z26.string().optional(),
|
|
890
908
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
891
|
-
jobStatus:
|
|
909
|
+
jobStatus: z26.string().optional(),
|
|
892
910
|
/** Remote job error message if the job failed */
|
|
893
|
-
jobError:
|
|
911
|
+
jobError: z26.string().optional(),
|
|
894
912
|
/** Timestamp of the last job status check */
|
|
895
|
-
jobStatusCheckedAt:
|
|
913
|
+
jobStatusCheckedAt: z26.string().optional(),
|
|
914
|
+
/** MCP server IDs to enable for this run (optional) */
|
|
915
|
+
mcpIds: z26.array(z26.string()).optional(),
|
|
916
|
+
/** Sub-agent IDs to enable for this run (optional) */
|
|
917
|
+
subAgentIds: z26.array(z26.string()).optional()
|
|
896
918
|
});
|
|
897
919
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
898
920
|
id: true,
|
|
@@ -905,28 +927,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
905
927
|
startedAt: true,
|
|
906
928
|
completedAt: true
|
|
907
929
|
});
|
|
908
|
-
var EvaluationProgressSchema =
|
|
909
|
-
runId:
|
|
910
|
-
targetId:
|
|
911
|
-
totalScenarios:
|
|
912
|
-
completedScenarios:
|
|
913
|
-
scenarioProgress:
|
|
914
|
-
|
|
915
|
-
scenarioId:
|
|
916
|
-
currentStep:
|
|
917
|
-
error:
|
|
930
|
+
var EvaluationProgressSchema = z26.object({
|
|
931
|
+
runId: z26.string(),
|
|
932
|
+
targetId: z26.string(),
|
|
933
|
+
totalScenarios: z26.number(),
|
|
934
|
+
completedScenarios: z26.number(),
|
|
935
|
+
scenarioProgress: z26.array(
|
|
936
|
+
z26.object({
|
|
937
|
+
scenarioId: z26.string(),
|
|
938
|
+
currentStep: z26.string(),
|
|
939
|
+
error: z26.string().optional()
|
|
918
940
|
})
|
|
919
941
|
),
|
|
920
|
-
createdAt:
|
|
942
|
+
createdAt: z26.number()
|
|
921
943
|
});
|
|
922
|
-
var EvaluationLogSchema =
|
|
923
|
-
runId:
|
|
924
|
-
scenarioId:
|
|
925
|
-
log:
|
|
926
|
-
level:
|
|
927
|
-
message:
|
|
928
|
-
args:
|
|
929
|
-
error:
|
|
944
|
+
var EvaluationLogSchema = z26.object({
|
|
945
|
+
runId: z26.string(),
|
|
946
|
+
scenarioId: z26.string(),
|
|
947
|
+
log: z26.object({
|
|
948
|
+
level: z26.enum(["info", "error", "debug"]),
|
|
949
|
+
message: z26.string().optional(),
|
|
950
|
+
args: z26.array(z26.any()).optional(),
|
|
951
|
+
error: z26.string().optional()
|
|
930
952
|
})
|
|
931
953
|
});
|
|
932
954
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -939,91 +961,91 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
939
961
|
AssertionResultStatus2["ERROR"] = "error";
|
|
940
962
|
return AssertionResultStatus2;
|
|
941
963
|
})(AssertionResultStatus || {});
|
|
942
|
-
var AssertionResultSchema =
|
|
943
|
-
id:
|
|
944
|
-
assertionId:
|
|
945
|
-
assertionType:
|
|
946
|
-
assertionName:
|
|
947
|
-
status:
|
|
948
|
-
message:
|
|
949
|
-
expected:
|
|
950
|
-
actual:
|
|
951
|
-
duration:
|
|
952
|
-
details:
|
|
953
|
-
llmTraceSteps:
|
|
954
|
-
});
|
|
955
|
-
var EvalRunResultSchema =
|
|
956
|
-
id:
|
|
957
|
-
targetId:
|
|
958
|
-
targetName:
|
|
959
|
-
scenarioId:
|
|
960
|
-
scenarioName:
|
|
964
|
+
var AssertionResultSchema = z27.object({
|
|
965
|
+
id: z27.string(),
|
|
966
|
+
assertionId: z27.string(),
|
|
967
|
+
assertionType: z27.string(),
|
|
968
|
+
assertionName: z27.string(),
|
|
969
|
+
status: z27.enum(AssertionResultStatus),
|
|
970
|
+
message: z27.string().optional(),
|
|
971
|
+
expected: z27.string().optional(),
|
|
972
|
+
actual: z27.string().optional(),
|
|
973
|
+
duration: z27.number().optional(),
|
|
974
|
+
details: z27.record(z27.string(), z27.unknown()).optional(),
|
|
975
|
+
llmTraceSteps: z27.array(LLMTraceStepSchema).optional()
|
|
976
|
+
});
|
|
977
|
+
var EvalRunResultSchema = z27.object({
|
|
978
|
+
id: z27.string(),
|
|
979
|
+
targetId: z27.string(),
|
|
980
|
+
targetName: z27.string().optional(),
|
|
981
|
+
scenarioId: z27.string(),
|
|
982
|
+
scenarioName: z27.string(),
|
|
961
983
|
modelConfig: ModelConfigSchema.optional(),
|
|
962
|
-
assertionResults:
|
|
984
|
+
assertionResults: z27.array(AssertionResultSchema),
|
|
963
985
|
metrics: EvalMetricsSchema.optional(),
|
|
964
|
-
passed:
|
|
965
|
-
failed:
|
|
966
|
-
passRate:
|
|
967
|
-
duration:
|
|
968
|
-
outputText:
|
|
969
|
-
files:
|
|
970
|
-
fileDiffs:
|
|
986
|
+
passed: z27.number(),
|
|
987
|
+
failed: z27.number(),
|
|
988
|
+
passRate: z27.number(),
|
|
989
|
+
duration: z27.number(),
|
|
990
|
+
outputText: z27.string().optional(),
|
|
991
|
+
files: z27.array(ExpectedFileSchema).optional(),
|
|
992
|
+
fileDiffs: z27.array(DiffContentSchema).optional(),
|
|
971
993
|
/** Full template files after execution with status indicators */
|
|
972
|
-
templateFiles:
|
|
973
|
-
startedAt:
|
|
974
|
-
completedAt:
|
|
994
|
+
templateFiles: z27.array(TemplateFileSchema).optional(),
|
|
995
|
+
startedAt: z27.string().optional(),
|
|
996
|
+
completedAt: z27.string().optional(),
|
|
975
997
|
llmTrace: LLMTraceSchema.optional()
|
|
976
998
|
});
|
|
977
|
-
var PromptResultSchema =
|
|
978
|
-
text:
|
|
979
|
-
files:
|
|
980
|
-
finishReason:
|
|
981
|
-
reasoning:
|
|
982
|
-
reasoningDetails:
|
|
983
|
-
toolCalls:
|
|
984
|
-
toolResults:
|
|
985
|
-
warnings:
|
|
986
|
-
sources:
|
|
987
|
-
steps:
|
|
988
|
-
generationTimeMs:
|
|
989
|
-
prompt:
|
|
990
|
-
systemPrompt:
|
|
991
|
-
usage:
|
|
992
|
-
totalTokens:
|
|
993
|
-
totalMicrocentsSpent:
|
|
999
|
+
var PromptResultSchema = z27.object({
|
|
1000
|
+
text: z27.string(),
|
|
1001
|
+
files: z27.array(z27.unknown()).optional(),
|
|
1002
|
+
finishReason: z27.string().optional(),
|
|
1003
|
+
reasoning: z27.string().optional(),
|
|
1004
|
+
reasoningDetails: z27.unknown().optional(),
|
|
1005
|
+
toolCalls: z27.array(z27.unknown()).optional(),
|
|
1006
|
+
toolResults: z27.array(z27.unknown()).optional(),
|
|
1007
|
+
warnings: z27.array(z27.unknown()).optional(),
|
|
1008
|
+
sources: z27.array(z27.unknown()).optional(),
|
|
1009
|
+
steps: z27.array(z27.unknown()),
|
|
1010
|
+
generationTimeMs: z27.number(),
|
|
1011
|
+
prompt: z27.string(),
|
|
1012
|
+
systemPrompt: z27.string(),
|
|
1013
|
+
usage: z27.object({
|
|
1014
|
+
totalTokens: z27.number().optional(),
|
|
1015
|
+
totalMicrocentsSpent: z27.number().optional()
|
|
994
1016
|
})
|
|
995
1017
|
});
|
|
996
|
-
var EvaluationResultSchema =
|
|
997
|
-
id:
|
|
998
|
-
runId:
|
|
999
|
-
timestamp:
|
|
1018
|
+
var EvaluationResultSchema = z27.object({
|
|
1019
|
+
id: z27.string(),
|
|
1020
|
+
runId: z27.string(),
|
|
1021
|
+
timestamp: z27.number(),
|
|
1000
1022
|
promptResult: PromptResultSchema,
|
|
1001
|
-
testResults:
|
|
1002
|
-
tags:
|
|
1003
|
-
feedback:
|
|
1004
|
-
score:
|
|
1005
|
-
suiteId:
|
|
1006
|
-
});
|
|
1007
|
-
var LeanEvaluationResultSchema =
|
|
1008
|
-
id:
|
|
1009
|
-
runId:
|
|
1010
|
-
timestamp:
|
|
1011
|
-
tags:
|
|
1012
|
-
scenarioId:
|
|
1013
|
-
scenarioVersion:
|
|
1014
|
-
targetId:
|
|
1015
|
-
targetVersion:
|
|
1016
|
-
suiteId:
|
|
1017
|
-
score:
|
|
1018
|
-
time:
|
|
1019
|
-
microcentsSpent:
|
|
1023
|
+
testResults: z27.array(z27.unknown()),
|
|
1024
|
+
tags: z27.array(z27.string()).optional(),
|
|
1025
|
+
feedback: z27.string().optional(),
|
|
1026
|
+
score: z27.number(),
|
|
1027
|
+
suiteId: z27.string().optional()
|
|
1028
|
+
});
|
|
1029
|
+
var LeanEvaluationResultSchema = z27.object({
|
|
1030
|
+
id: z27.string(),
|
|
1031
|
+
runId: z27.string(),
|
|
1032
|
+
timestamp: z27.number(),
|
|
1033
|
+
tags: z27.array(z27.string()).optional(),
|
|
1034
|
+
scenarioId: z27.string(),
|
|
1035
|
+
scenarioVersion: z27.number().optional(),
|
|
1036
|
+
targetId: z27.string(),
|
|
1037
|
+
targetVersion: z27.number().optional(),
|
|
1038
|
+
suiteId: z27.string().optional(),
|
|
1039
|
+
score: z27.number(),
|
|
1040
|
+
time: z27.number().optional(),
|
|
1041
|
+
microcentsSpent: z27.number().optional()
|
|
1020
1042
|
});
|
|
1021
1043
|
|
|
1022
1044
|
// src/project/project.ts
|
|
1023
|
-
import { z as
|
|
1045
|
+
import { z as z28 } from "zod";
|
|
1024
1046
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1025
|
-
appId:
|
|
1026
|
-
appSecret:
|
|
1047
|
+
appId: z28.string().optional().describe("The ID of the app in Dev Center"),
|
|
1048
|
+
appSecret: z28.string().optional().describe("The secret of the app in Dev Center")
|
|
1027
1049
|
});
|
|
1028
1050
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1029
1051
|
id: true,
|
|
@@ -1034,10 +1056,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
|
|
|
1034
1056
|
var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
|
|
1035
1057
|
|
|
1036
1058
|
// src/template/template.ts
|
|
1037
|
-
import { z as
|
|
1059
|
+
import { z as z29 } from "zod";
|
|
1038
1060
|
var TemplateSchema = TenantEntitySchema.extend({
|
|
1039
1061
|
/** URL to download the template from */
|
|
1040
|
-
downloadUrl:
|
|
1062
|
+
downloadUrl: z29.url()
|
|
1041
1063
|
});
|
|
1042
1064
|
var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
1043
1065
|
id: true,
|
|
@@ -1172,9 +1194,11 @@ export {
|
|
|
1172
1194
|
CreateAgentInputSchema,
|
|
1173
1195
|
CreateCustomAssertionInputSchema,
|
|
1174
1196
|
CreateEvalRunInputSchema,
|
|
1197
|
+
CreateMcpInputSchema,
|
|
1175
1198
|
CreateProjectInputSchema,
|
|
1176
1199
|
CreateSkillInputSchema,
|
|
1177
1200
|
CreateSkillsGroupInputSchema,
|
|
1201
|
+
CreateSubAgentInputSchema,
|
|
1178
1202
|
CreateTemplateInputSchema,
|
|
1179
1203
|
CreateTestScenarioInputSchema,
|
|
1180
1204
|
CreateTestSuiteInputSchema,
|
|
@@ -1213,7 +1237,9 @@ export {
|
|
|
1213
1237
|
LlmJudgeAssertionSchema,
|
|
1214
1238
|
LlmJudgeConfigSchema,
|
|
1215
1239
|
LocalProjectConfigSchema,
|
|
1240
|
+
MCPEntitySchema,
|
|
1216
1241
|
MCPServerConfigSchema,
|
|
1242
|
+
MCP_SERVERS_JSON_KEY,
|
|
1217
1243
|
MetaSiteConfigSchema,
|
|
1218
1244
|
ModelConfigSchema,
|
|
1219
1245
|
ModelIds,
|
|
@@ -1234,6 +1260,7 @@ export {
|
|
|
1234
1260
|
SkillWasCalledAssertionSchema,
|
|
1235
1261
|
SkillWasCalledConfigSchema,
|
|
1236
1262
|
SkillsGroupSchema,
|
|
1263
|
+
SubAgentSchema,
|
|
1237
1264
|
TRACE_EVENT_PREFIX,
|
|
1238
1265
|
TargetSchema,
|
|
1239
1266
|
TemplateFileSchema,
|
|
@@ -1254,9 +1281,11 @@ export {
|
|
|
1254
1281
|
TriggerType,
|
|
1255
1282
|
UpdateAgentInputSchema,
|
|
1256
1283
|
UpdateCustomAssertionInputSchema,
|
|
1284
|
+
UpdateMcpInputSchema,
|
|
1257
1285
|
UpdateProjectInputSchema,
|
|
1258
1286
|
UpdateSkillInputSchema,
|
|
1259
1287
|
UpdateSkillsGroupInputSchema,
|
|
1288
|
+
UpdateSubAgentInputSchema,
|
|
1260
1289
|
UpdateTemplateInputSchema,
|
|
1261
1290
|
UpdateTestScenarioInputSchema,
|
|
1262
1291
|
UpdateTestSuiteInputSchema,
|