@wix/evalforge-types 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +414 -378
- package/build/index.js.map +4 -4
- package/build/index.mjs +407 -378
- package/build/index.mjs.map +4 -4
- package/build/types/common/mcp.d.ts +42 -10
- package/build/types/evaluation/eval-run.d.ts +4 -0
- package/build/types/target/index.d.ts +1 -0
- package/build/types/target/sub-agent.d.ts +34 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -42,9 +42,11 @@ __export(index_exports, {
|
|
|
42
42
|
CreateAgentInputSchema: () => CreateAgentInputSchema,
|
|
43
43
|
CreateCustomAssertionInputSchema: () => CreateCustomAssertionInputSchema,
|
|
44
44
|
CreateEvalRunInputSchema: () => CreateEvalRunInputSchema,
|
|
45
|
+
CreateMcpInputSchema: () => CreateMcpInputSchema,
|
|
45
46
|
CreateProjectInputSchema: () => CreateProjectInputSchema,
|
|
46
47
|
CreateSkillInputSchema: () => CreateSkillInputSchema,
|
|
47
48
|
CreateSkillsGroupInputSchema: () => CreateSkillsGroupInputSchema,
|
|
49
|
+
CreateSubAgentInputSchema: () => CreateSubAgentInputSchema,
|
|
48
50
|
CreateTemplateInputSchema: () => CreateTemplateInputSchema,
|
|
49
51
|
CreateTestScenarioInputSchema: () => CreateTestScenarioInputSchema,
|
|
50
52
|
CreateTestSuiteInputSchema: () => CreateTestSuiteInputSchema,
|
|
@@ -83,7 +85,9 @@ __export(index_exports, {
|
|
|
83
85
|
LlmJudgeAssertionSchema: () => LlmJudgeAssertionSchema,
|
|
84
86
|
LlmJudgeConfigSchema: () => LlmJudgeConfigSchema,
|
|
85
87
|
LocalProjectConfigSchema: () => LocalProjectConfigSchema,
|
|
88
|
+
MCPEntitySchema: () => MCPEntitySchema,
|
|
86
89
|
MCPServerConfigSchema: () => MCPServerConfigSchema,
|
|
90
|
+
MCP_SERVERS_JSON_KEY: () => MCP_SERVERS_JSON_KEY,
|
|
87
91
|
MetaSiteConfigSchema: () => MetaSiteConfigSchema,
|
|
88
92
|
ModelConfigSchema: () => ModelConfigSchema,
|
|
89
93
|
ModelIds: () => ModelIds,
|
|
@@ -104,6 +108,7 @@ __export(index_exports, {
|
|
|
104
108
|
SkillWasCalledAssertionSchema: () => SkillWasCalledAssertionSchema,
|
|
105
109
|
SkillWasCalledConfigSchema: () => SkillWasCalledConfigSchema,
|
|
106
110
|
SkillsGroupSchema: () => SkillsGroupSchema,
|
|
111
|
+
SubAgentSchema: () => SubAgentSchema,
|
|
107
112
|
TRACE_EVENT_PREFIX: () => TRACE_EVENT_PREFIX,
|
|
108
113
|
TargetSchema: () => TargetSchema,
|
|
109
114
|
TemplateFileSchema: () => TemplateFileSchema,
|
|
@@ -124,9 +129,11 @@ __export(index_exports, {
|
|
|
124
129
|
TriggerType: () => TriggerType,
|
|
125
130
|
UpdateAgentInputSchema: () => UpdateAgentInputSchema,
|
|
126
131
|
UpdateCustomAssertionInputSchema: () => UpdateCustomAssertionInputSchema,
|
|
132
|
+
UpdateMcpInputSchema: () => UpdateMcpInputSchema,
|
|
127
133
|
UpdateProjectInputSchema: () => UpdateProjectInputSchema,
|
|
128
134
|
UpdateSkillInputSchema: () => UpdateSkillInputSchema,
|
|
129
135
|
UpdateSkillsGroupInputSchema: () => UpdateSkillsGroupInputSchema,
|
|
136
|
+
UpdateSubAgentInputSchema: () => UpdateSubAgentInputSchema,
|
|
130
137
|
UpdateTemplateInputSchema: () => UpdateTemplateInputSchema,
|
|
131
138
|
UpdateTestScenarioInputSchema: () => UpdateTestScenarioInputSchema,
|
|
132
139
|
UpdateTestSuiteInputSchema: () => UpdateTestSuiteInputSchema,
|
|
@@ -160,18 +167,21 @@ var TenantEntitySchema = BaseEntitySchema.extend({
|
|
|
160
167
|
|
|
161
168
|
// src/common/mcp.ts
|
|
162
169
|
var import_zod2 = require("zod");
|
|
163
|
-
var
|
|
164
|
-
|
|
165
|
-
name
|
|
166
|
-
|
|
167
|
-
command
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
170
|
+
var MCP_SERVERS_JSON_KEY = "mcpServers";
|
|
171
|
+
var MCPEntitySchema = TenantEntitySchema.extend({
|
|
172
|
+
/** Display name and key in mcp.json mcpServers object */
|
|
173
|
+
name: import_zod2.z.string().min(1),
|
|
174
|
+
/** MCP server config (command/args, url/headers, etc.) - stored as-is for mcp.json */
|
|
175
|
+
config: import_zod2.z.record(import_zod2.z.string(), import_zod2.z.unknown())
|
|
176
|
+
});
|
|
177
|
+
var CreateMcpInputSchema = MCPEntitySchema.omit({
|
|
178
|
+
id: true,
|
|
179
|
+
createdAt: true,
|
|
180
|
+
updatedAt: true,
|
|
181
|
+
deleted: true
|
|
174
182
|
});
|
|
183
|
+
var UpdateMcpInputSchema = CreateMcpInputSchema.partial();
|
|
184
|
+
var MCPServerConfigSchema = import_zod2.z.record(import_zod2.z.string(), import_zod2.z.unknown());
|
|
175
185
|
|
|
176
186
|
// src/common/models.ts
|
|
177
187
|
var import_zod3 = require("zod");
|
|
@@ -351,11 +361,26 @@ var CreateSkillsGroupInputSchema = SkillsGroupSchema.omit({
|
|
|
351
361
|
});
|
|
352
362
|
var UpdateSkillsGroupInputSchema = CreateSkillsGroupInputSchema.partial();
|
|
353
363
|
|
|
364
|
+
// src/target/sub-agent.ts
|
|
365
|
+
var import_zod7 = require("zod");
|
|
366
|
+
var SubAgentSchema = TargetSchema.extend({
|
|
367
|
+
/** The full sub-agent markdown content (YAML frontmatter + body) */
|
|
368
|
+
subAgentMd: import_zod7.z.string()
|
|
369
|
+
});
|
|
370
|
+
var SubAgentInputBaseSchema = SubAgentSchema.omit({
|
|
371
|
+
id: true,
|
|
372
|
+
createdAt: true,
|
|
373
|
+
updatedAt: true,
|
|
374
|
+
deleted: true
|
|
375
|
+
});
|
|
376
|
+
var CreateSubAgentInputSchema = SubAgentInputBaseSchema;
|
|
377
|
+
var UpdateSubAgentInputSchema = SubAgentInputBaseSchema.partial();
|
|
378
|
+
|
|
354
379
|
// src/test/index.ts
|
|
355
|
-
var
|
|
380
|
+
var import_zod18 = require("zod");
|
|
356
381
|
|
|
357
382
|
// src/test/base.ts
|
|
358
|
-
var
|
|
383
|
+
var import_zod8 = require("zod");
|
|
359
384
|
var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
360
385
|
TestType2["LLM"] = "LLM";
|
|
361
386
|
TestType2["TOOL"] = "TOOL";
|
|
@@ -368,7 +393,7 @@ var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
|
368
393
|
TestType2["PLAYWRIGHT_NL"] = "PLAYWRIGHT_NL";
|
|
369
394
|
return TestType2;
|
|
370
395
|
})(TestType || {});
|
|
371
|
-
var TestTypeSchema =
|
|
396
|
+
var TestTypeSchema = import_zod8.z.enum(TestType);
|
|
372
397
|
var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
373
398
|
TestImportance2["LOW"] = "low";
|
|
374
399
|
TestImportance2["MEDIUM"] = "medium";
|
|
@@ -376,153 +401,153 @@ var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
|
376
401
|
TestImportance2["CRITICAL"] = "critical";
|
|
377
402
|
return TestImportance2;
|
|
378
403
|
})(TestImportance || {});
|
|
379
|
-
var TestImportanceSchema =
|
|
380
|
-
var BaseTestSchema =
|
|
381
|
-
id:
|
|
404
|
+
var TestImportanceSchema = import_zod8.z.enum(TestImportance);
|
|
405
|
+
var BaseTestSchema = import_zod8.z.object({
|
|
406
|
+
id: import_zod8.z.string(),
|
|
382
407
|
type: TestTypeSchema,
|
|
383
|
-
name:
|
|
384
|
-
description:
|
|
408
|
+
name: import_zod8.z.string().min(3),
|
|
409
|
+
description: import_zod8.z.string().optional(),
|
|
385
410
|
importance: TestImportanceSchema.optional()
|
|
386
411
|
});
|
|
387
412
|
|
|
388
413
|
// src/test/llm.ts
|
|
389
|
-
var
|
|
414
|
+
var import_zod9 = require("zod");
|
|
390
415
|
var LLMTestSchema = BaseTestSchema.extend({
|
|
391
|
-
type:
|
|
416
|
+
type: import_zod9.z.literal("LLM" /* LLM */),
|
|
392
417
|
/** Maximum steps for the LLM to take */
|
|
393
|
-
maxSteps:
|
|
418
|
+
maxSteps: import_zod9.z.number().min(1).max(100),
|
|
394
419
|
/** Prompt to send to the evaluator */
|
|
395
|
-
prompt:
|
|
420
|
+
prompt: import_zod9.z.string().min(1),
|
|
396
421
|
/** ID of the evaluator agent to use */
|
|
397
|
-
evaluatorId:
|
|
422
|
+
evaluatorId: import_zod9.z.string()
|
|
398
423
|
});
|
|
399
424
|
|
|
400
425
|
// src/test/tool.ts
|
|
401
|
-
var
|
|
426
|
+
var import_zod10 = require("zod");
|
|
402
427
|
var ToolTestSchema = BaseTestSchema.extend({
|
|
403
|
-
type:
|
|
428
|
+
type: import_zod10.z.literal("TOOL" /* TOOL */),
|
|
404
429
|
/** Name of the tool that should be called */
|
|
405
|
-
toolName:
|
|
430
|
+
toolName: import_zod10.z.string().min(3),
|
|
406
431
|
/** Expected arguments for the tool call */
|
|
407
|
-
args:
|
|
432
|
+
args: import_zod10.z.record(import_zod10.z.string(), import_zod10.z.any()),
|
|
408
433
|
/** Expected content in the tool results */
|
|
409
|
-
resultsContent:
|
|
434
|
+
resultsContent: import_zod10.z.string()
|
|
410
435
|
});
|
|
411
436
|
|
|
412
437
|
// src/test/site-config.ts
|
|
413
|
-
var
|
|
438
|
+
var import_zod11 = require("zod");
|
|
414
439
|
var SiteConfigTestSchema = BaseTestSchema.extend({
|
|
415
|
-
type:
|
|
440
|
+
type: import_zod11.z.literal("SITE_CONFIG" /* SITE_CONFIG */),
|
|
416
441
|
/** URL to call */
|
|
417
|
-
url:
|
|
442
|
+
url: import_zod11.z.string().url(),
|
|
418
443
|
/** HTTP method */
|
|
419
|
-
method:
|
|
444
|
+
method: import_zod11.z.enum(["GET", "POST"]),
|
|
420
445
|
/** Request body (for POST) */
|
|
421
|
-
body:
|
|
446
|
+
body: import_zod11.z.string().optional(),
|
|
422
447
|
/** Expected HTTP status code */
|
|
423
|
-
expectedStatusCode:
|
|
448
|
+
expectedStatusCode: import_zod11.z.number().int().min(100).max(599),
|
|
424
449
|
/** Expected response content */
|
|
425
|
-
expectedResponse:
|
|
450
|
+
expectedResponse: import_zod11.z.string().optional(),
|
|
426
451
|
/** JMESPath expression to extract from response */
|
|
427
|
-
expectedResponseJMESPath:
|
|
452
|
+
expectedResponseJMESPath: import_zod11.z.string().optional()
|
|
428
453
|
});
|
|
429
454
|
|
|
430
455
|
// src/test/command-execution.ts
|
|
431
|
-
var
|
|
456
|
+
var import_zod12 = require("zod");
|
|
432
457
|
var AllowedCommands = [
|
|
433
458
|
"yarn install --no-immutable && yarn build",
|
|
434
459
|
"npm run build",
|
|
435
460
|
"yarn typecheck"
|
|
436
461
|
];
|
|
437
462
|
var CommandExecutionTestSchema = BaseTestSchema.extend({
|
|
438
|
-
type:
|
|
463
|
+
type: import_zod12.z.literal("COMMAND_EXECUTION" /* COMMAND_EXECUTION */),
|
|
439
464
|
/** Command to execute (must be in AllowedCommands) */
|
|
440
|
-
command:
|
|
465
|
+
command: import_zod12.z.string().refine((value) => AllowedCommands.includes(value), {
|
|
441
466
|
message: `Command must be one of: ${AllowedCommands.join(", ")}`
|
|
442
467
|
}),
|
|
443
468
|
/** Expected exit code (default: 0) */
|
|
444
|
-
expectedExitCode:
|
|
469
|
+
expectedExitCode: import_zod12.z.number().default(0).optional()
|
|
445
470
|
});
|
|
446
471
|
|
|
447
472
|
// src/test/file-presence.ts
|
|
448
|
-
var
|
|
473
|
+
var import_zod13 = require("zod");
|
|
449
474
|
var FilePresenceTestSchema = BaseTestSchema.extend({
|
|
450
|
-
type:
|
|
475
|
+
type: import_zod13.z.literal("FILE_PRESENCE" /* FILE_PRESENCE */),
|
|
451
476
|
/** Paths to check */
|
|
452
|
-
paths:
|
|
477
|
+
paths: import_zod13.z.array(import_zod13.z.string()),
|
|
453
478
|
/** Whether files should exist (true) or not exist (false) */
|
|
454
|
-
shouldExist:
|
|
479
|
+
shouldExist: import_zod13.z.boolean()
|
|
455
480
|
});
|
|
456
481
|
|
|
457
482
|
// src/test/file-content.ts
|
|
458
|
-
var
|
|
459
|
-
var FileContentCheckSchema =
|
|
483
|
+
var import_zod14 = require("zod");
|
|
484
|
+
var FileContentCheckSchema = import_zod14.z.object({
|
|
460
485
|
/** Strings that must be present in the file */
|
|
461
|
-
contains:
|
|
486
|
+
contains: import_zod14.z.array(import_zod14.z.string()).optional(),
|
|
462
487
|
/** Strings that must NOT be present in the file */
|
|
463
|
-
notContains:
|
|
488
|
+
notContains: import_zod14.z.array(import_zod14.z.string()).optional(),
|
|
464
489
|
/** Regex pattern the content must match */
|
|
465
|
-
matches:
|
|
490
|
+
matches: import_zod14.z.string().optional(),
|
|
466
491
|
/** JSON path checks for structured content */
|
|
467
|
-
jsonPath:
|
|
468
|
-
|
|
469
|
-
path:
|
|
470
|
-
value:
|
|
492
|
+
jsonPath: import_zod14.z.array(
|
|
493
|
+
import_zod14.z.object({
|
|
494
|
+
path: import_zod14.z.string(),
|
|
495
|
+
value: import_zod14.z.unknown()
|
|
471
496
|
})
|
|
472
497
|
).optional(),
|
|
473
498
|
/** Lines that should be added (for diff checking) */
|
|
474
|
-
added:
|
|
499
|
+
added: import_zod14.z.array(import_zod14.z.string()).optional(),
|
|
475
500
|
/** Lines that should be removed (for diff checking) */
|
|
476
|
-
removed:
|
|
501
|
+
removed: import_zod14.z.array(import_zod14.z.string()).optional()
|
|
477
502
|
});
|
|
478
503
|
var FileContentTestSchema = BaseTestSchema.extend({
|
|
479
|
-
type:
|
|
504
|
+
type: import_zod14.z.literal("FILE_CONTENT" /* FILE_CONTENT */),
|
|
480
505
|
/** Path to the file to check */
|
|
481
|
-
path:
|
|
506
|
+
path: import_zod14.z.string(),
|
|
482
507
|
/** Content checks to perform */
|
|
483
508
|
checks: FileContentCheckSchema
|
|
484
509
|
});
|
|
485
510
|
|
|
486
511
|
// src/test/build-check.ts
|
|
487
|
-
var
|
|
512
|
+
var import_zod15 = require("zod");
|
|
488
513
|
var BuildCheckTestSchema = BaseTestSchema.extend({
|
|
489
|
-
type:
|
|
514
|
+
type: import_zod15.z.literal("BUILD_CHECK" /* BUILD_CHECK */),
|
|
490
515
|
/** Build command to execute */
|
|
491
|
-
command:
|
|
516
|
+
command: import_zod15.z.string(),
|
|
492
517
|
/** Whether the build should succeed */
|
|
493
|
-
expectSuccess:
|
|
518
|
+
expectSuccess: import_zod15.z.boolean(),
|
|
494
519
|
/** Maximum allowed warnings (optional) */
|
|
495
|
-
allowedWarnings:
|
|
520
|
+
allowedWarnings: import_zod15.z.number().optional(),
|
|
496
521
|
/** Timeout in milliseconds */
|
|
497
|
-
timeout:
|
|
522
|
+
timeout: import_zod15.z.number().optional()
|
|
498
523
|
});
|
|
499
524
|
|
|
500
525
|
// src/test/vitest.ts
|
|
501
|
-
var
|
|
526
|
+
var import_zod16 = require("zod");
|
|
502
527
|
var VitestTestSchema = BaseTestSchema.extend({
|
|
503
|
-
type:
|
|
528
|
+
type: import_zod16.z.literal("VITEST" /* VITEST */),
|
|
504
529
|
/** Test file content */
|
|
505
|
-
testFile:
|
|
530
|
+
testFile: import_zod16.z.string(),
|
|
506
531
|
/** Name of the test file */
|
|
507
|
-
testFileName:
|
|
532
|
+
testFileName: import_zod16.z.string(),
|
|
508
533
|
/** Minimum pass rate required (0-100) */
|
|
509
|
-
minPassRate:
|
|
534
|
+
minPassRate: import_zod16.z.number().min(0).max(100)
|
|
510
535
|
});
|
|
511
536
|
|
|
512
537
|
// src/test/playwright-nl.ts
|
|
513
|
-
var
|
|
538
|
+
var import_zod17 = require("zod");
|
|
514
539
|
var PlaywrightNLTestSchema = BaseTestSchema.extend({
|
|
515
|
-
type:
|
|
540
|
+
type: import_zod17.z.literal("PLAYWRIGHT_NL" /* PLAYWRIGHT_NL */),
|
|
516
541
|
/** Natural language steps to execute */
|
|
517
|
-
steps:
|
|
542
|
+
steps: import_zod17.z.array(import_zod17.z.string()),
|
|
518
543
|
/** Expected outcome description */
|
|
519
|
-
expectedOutcome:
|
|
544
|
+
expectedOutcome: import_zod17.z.string(),
|
|
520
545
|
/** Timeout in milliseconds */
|
|
521
|
-
timeout:
|
|
546
|
+
timeout: import_zod17.z.number().optional()
|
|
522
547
|
});
|
|
523
548
|
|
|
524
549
|
// src/test/index.ts
|
|
525
|
-
var TestSchema =
|
|
550
|
+
var TestSchema = import_zod18.z.discriminatedUnion("type", [
|
|
526
551
|
LLMTestSchema,
|
|
527
552
|
ToolTestSchema,
|
|
528
553
|
SiteConfigTestSchema,
|
|
@@ -535,66 +560,66 @@ var TestSchema = import_zod17.z.discriminatedUnion("type", [
|
|
|
535
560
|
]);
|
|
536
561
|
|
|
537
562
|
// src/scenario/assertions.ts
|
|
538
|
-
var
|
|
539
|
-
var SkillWasCalledAssertionSchema =
|
|
540
|
-
type:
|
|
563
|
+
var import_zod19 = require("zod");
|
|
564
|
+
var SkillWasCalledAssertionSchema = import_zod19.z.object({
|
|
565
|
+
type: import_zod19.z.literal("skill_was_called"),
|
|
541
566
|
/** Name of the skill that must have been called (matched against trace Skill tool args) */
|
|
542
|
-
skillName:
|
|
567
|
+
skillName: import_zod19.z.string()
|
|
543
568
|
});
|
|
544
|
-
var BuildPassedAssertionSchema =
|
|
545
|
-
type:
|
|
569
|
+
var BuildPassedAssertionSchema = import_zod19.z.object({
|
|
570
|
+
type: import_zod19.z.literal("build_passed"),
|
|
546
571
|
/** Command to run (default: "yarn build") */
|
|
547
|
-
command:
|
|
572
|
+
command: import_zod19.z.string().optional(),
|
|
548
573
|
/** Expected exit code (default: 0) */
|
|
549
|
-
expectedExitCode:
|
|
574
|
+
expectedExitCode: import_zod19.z.number().int().optional()
|
|
550
575
|
});
|
|
551
|
-
var LlmJudgeAssertionSchema =
|
|
552
|
-
type:
|
|
576
|
+
var LlmJudgeAssertionSchema = import_zod19.z.object({
|
|
577
|
+
type: import_zod19.z.literal("llm_judge"),
|
|
553
578
|
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
554
|
-
prompt:
|
|
579
|
+
prompt: import_zod19.z.string(),
|
|
555
580
|
/** Optional system prompt for the judge (default asks for JSON with score) */
|
|
556
|
-
systemPrompt:
|
|
581
|
+
systemPrompt: import_zod19.z.string().optional(),
|
|
557
582
|
/** Minimum score to pass (0-100, default 70) */
|
|
558
|
-
minScore:
|
|
583
|
+
minScore: import_zod19.z.number().int().min(0).max(100).optional(),
|
|
559
584
|
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
560
|
-
model:
|
|
561
|
-
maxTokens:
|
|
562
|
-
temperature:
|
|
585
|
+
model: import_zod19.z.string().optional(),
|
|
586
|
+
maxTokens: import_zod19.z.number().int().optional(),
|
|
587
|
+
temperature: import_zod19.z.number().min(0).max(1).optional()
|
|
563
588
|
});
|
|
564
|
-
var AssertionSchema =
|
|
589
|
+
var AssertionSchema = import_zod19.z.union([
|
|
565
590
|
SkillWasCalledAssertionSchema,
|
|
566
591
|
BuildPassedAssertionSchema,
|
|
567
592
|
LlmJudgeAssertionSchema
|
|
568
593
|
]);
|
|
569
594
|
|
|
570
595
|
// src/scenario/environment.ts
|
|
571
|
-
var
|
|
572
|
-
var LocalProjectConfigSchema =
|
|
596
|
+
var import_zod20 = require("zod");
|
|
597
|
+
var LocalProjectConfigSchema = import_zod20.z.object({
|
|
573
598
|
/** Template ID to use for the local project */
|
|
574
|
-
templateId:
|
|
599
|
+
templateId: import_zod20.z.string().optional(),
|
|
575
600
|
/** Files to create in the project */
|
|
576
|
-
files:
|
|
577
|
-
|
|
578
|
-
path:
|
|
579
|
-
content:
|
|
601
|
+
files: import_zod20.z.array(
|
|
602
|
+
import_zod20.z.object({
|
|
603
|
+
path: import_zod20.z.string().min(1),
|
|
604
|
+
content: import_zod20.z.string().min(1)
|
|
580
605
|
})
|
|
581
606
|
).optional()
|
|
582
607
|
});
|
|
583
|
-
var MetaSiteConfigSchema =
|
|
584
|
-
configurations:
|
|
585
|
-
|
|
586
|
-
name:
|
|
587
|
-
apiCalls:
|
|
588
|
-
|
|
589
|
-
url:
|
|
590
|
-
method:
|
|
591
|
-
body:
|
|
608
|
+
var MetaSiteConfigSchema = import_zod20.z.object({
|
|
609
|
+
configurations: import_zod20.z.array(
|
|
610
|
+
import_zod20.z.object({
|
|
611
|
+
name: import_zod20.z.string().min(1),
|
|
612
|
+
apiCalls: import_zod20.z.array(
|
|
613
|
+
import_zod20.z.object({
|
|
614
|
+
url: import_zod20.z.string().url(),
|
|
615
|
+
method: import_zod20.z.enum(["POST", "PUT"]),
|
|
616
|
+
body: import_zod20.z.string()
|
|
592
617
|
})
|
|
593
618
|
)
|
|
594
619
|
})
|
|
595
620
|
).optional()
|
|
596
621
|
});
|
|
597
|
-
var EnvironmentSchema =
|
|
622
|
+
var EnvironmentSchema = import_zod20.z.object({
|
|
598
623
|
/** Local project configuration */
|
|
599
624
|
localProject: LocalProjectConfigSchema.optional(),
|
|
600
625
|
/** Meta site configuration */
|
|
@@ -602,54 +627,54 @@ var EnvironmentSchema = import_zod19.z.object({
|
|
|
602
627
|
});
|
|
603
628
|
|
|
604
629
|
// src/scenario/test-scenario.ts
|
|
605
|
-
var
|
|
630
|
+
var import_zod22 = require("zod");
|
|
606
631
|
|
|
607
632
|
// src/assertion/assertion.ts
|
|
608
|
-
var
|
|
609
|
-
var AssertionTypeSchema =
|
|
633
|
+
var import_zod21 = require("zod");
|
|
634
|
+
var AssertionTypeSchema = import_zod21.z.enum([
|
|
610
635
|
"skill_was_called",
|
|
611
636
|
"build_passed",
|
|
612
637
|
"llm_judge"
|
|
613
638
|
]);
|
|
614
|
-
var AssertionParameterTypeSchema =
|
|
639
|
+
var AssertionParameterTypeSchema = import_zod21.z.enum([
|
|
615
640
|
"string",
|
|
616
641
|
"number",
|
|
617
642
|
"boolean"
|
|
618
643
|
]);
|
|
619
|
-
var AssertionParameterSchema =
|
|
644
|
+
var AssertionParameterSchema = import_zod21.z.object({
|
|
620
645
|
/** Parameter name (used as key in params object) */
|
|
621
|
-
name:
|
|
646
|
+
name: import_zod21.z.string().min(1),
|
|
622
647
|
/** Display label for the parameter */
|
|
623
|
-
label:
|
|
648
|
+
label: import_zod21.z.string().min(1),
|
|
624
649
|
/** Parameter type */
|
|
625
650
|
type: AssertionParameterTypeSchema,
|
|
626
651
|
/** Whether this parameter is required */
|
|
627
|
-
required:
|
|
652
|
+
required: import_zod21.z.boolean(),
|
|
628
653
|
/** Default value (optional, used when not provided) */
|
|
629
|
-
defaultValue:
|
|
654
|
+
defaultValue: import_zod21.z.union([import_zod21.z.string(), import_zod21.z.number(), import_zod21.z.boolean()]).optional(),
|
|
630
655
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
631
|
-
advanced:
|
|
656
|
+
advanced: import_zod21.z.boolean().optional()
|
|
632
657
|
});
|
|
633
|
-
var ScenarioAssertionLinkSchema =
|
|
658
|
+
var ScenarioAssertionLinkSchema = import_zod21.z.object({
|
|
634
659
|
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
635
|
-
assertionId:
|
|
660
|
+
assertionId: import_zod21.z.string(),
|
|
636
661
|
/** Parameter values for this assertion in this scenario */
|
|
637
|
-
params:
|
|
638
|
-
|
|
639
|
-
|
|
662
|
+
params: import_zod21.z.record(
|
|
663
|
+
import_zod21.z.string(),
|
|
664
|
+
import_zod21.z.union([import_zod21.z.string(), import_zod21.z.number(), import_zod21.z.boolean(), import_zod21.z.null()])
|
|
640
665
|
).optional()
|
|
641
666
|
});
|
|
642
|
-
var SkillWasCalledConfigSchema =
|
|
667
|
+
var SkillWasCalledConfigSchema = import_zod21.z.object({
|
|
643
668
|
/** Name of the skill that must have been called */
|
|
644
|
-
skillName:
|
|
669
|
+
skillName: import_zod21.z.string().min(1)
|
|
645
670
|
});
|
|
646
|
-
var BuildPassedConfigSchema =
|
|
671
|
+
var BuildPassedConfigSchema = import_zod21.z.strictObject({
|
|
647
672
|
/** Command to run (default: "yarn build") */
|
|
648
|
-
command:
|
|
673
|
+
command: import_zod21.z.string().optional(),
|
|
649
674
|
/** Expected exit code (default: 0) */
|
|
650
|
-
expectedExitCode:
|
|
675
|
+
expectedExitCode: import_zod21.z.number().int().optional()
|
|
651
676
|
});
|
|
652
|
-
var LlmJudgeConfigSchema =
|
|
677
|
+
var LlmJudgeConfigSchema = import_zod21.z.object({
|
|
653
678
|
/**
|
|
654
679
|
* Prompt template with placeholders:
|
|
655
680
|
* - {{output}}: agent's final output
|
|
@@ -660,28 +685,28 @@ var LlmJudgeConfigSchema = import_zod20.z.object({
|
|
|
660
685
|
* - {{trace}}: step-by-step trace of tool calls
|
|
661
686
|
* - Custom parameters defined in the parameters array
|
|
662
687
|
*/
|
|
663
|
-
prompt:
|
|
688
|
+
prompt: import_zod21.z.string().min(1),
|
|
664
689
|
/** Optional system prompt for the judge */
|
|
665
|
-
systemPrompt:
|
|
690
|
+
systemPrompt: import_zod21.z.string().optional(),
|
|
666
691
|
/** Minimum score to pass (0-100, default 70) */
|
|
667
|
-
minScore:
|
|
692
|
+
minScore: import_zod21.z.number().int().min(0).max(100).optional(),
|
|
668
693
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
669
|
-
model:
|
|
694
|
+
model: import_zod21.z.string().optional(),
|
|
670
695
|
/** Max output tokens */
|
|
671
|
-
maxTokens:
|
|
696
|
+
maxTokens: import_zod21.z.number().int().optional(),
|
|
672
697
|
/** Temperature (0-1) */
|
|
673
|
-
temperature:
|
|
698
|
+
temperature: import_zod21.z.number().min(0).max(1).optional(),
|
|
674
699
|
/** User-defined parameters for this assertion */
|
|
675
|
-
parameters:
|
|
700
|
+
parameters: import_zod21.z.array(AssertionParameterSchema).optional()
|
|
676
701
|
});
|
|
677
|
-
var AssertionConfigSchema =
|
|
702
|
+
var AssertionConfigSchema = import_zod21.z.union([
|
|
678
703
|
LlmJudgeConfigSchema,
|
|
679
704
|
// requires prompt - check first
|
|
680
705
|
SkillWasCalledConfigSchema,
|
|
681
706
|
// requires skillName
|
|
682
707
|
BuildPassedConfigSchema,
|
|
683
708
|
// all optional, uses strictObject to reject unknown keys
|
|
684
|
-
|
|
709
|
+
import_zod21.z.object({})
|
|
685
710
|
// fallback empty config
|
|
686
711
|
]);
|
|
687
712
|
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
@@ -726,23 +751,23 @@ function getLlmJudgeConfig(assertion) {
|
|
|
726
751
|
}
|
|
727
752
|
|
|
728
753
|
// src/scenario/test-scenario.ts
|
|
729
|
-
var ExpectedFileSchema =
|
|
754
|
+
var ExpectedFileSchema = import_zod22.z.object({
|
|
730
755
|
/** Relative path where the file should be created */
|
|
731
|
-
path:
|
|
756
|
+
path: import_zod22.z.string(),
|
|
732
757
|
/** Optional expected content */
|
|
733
|
-
content:
|
|
758
|
+
content: import_zod22.z.string().optional()
|
|
734
759
|
});
|
|
735
760
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
736
761
|
/** The prompt sent to the agent to trigger the task */
|
|
737
|
-
triggerPrompt:
|
|
762
|
+
triggerPrompt: import_zod22.z.string().min(10),
|
|
738
763
|
/** ID of the template to use for this scenario (null = no template) */
|
|
739
|
-
templateId:
|
|
764
|
+
templateId: import_zod22.z.string().nullish(),
|
|
740
765
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
741
|
-
assertions:
|
|
766
|
+
assertions: import_zod22.z.array(AssertionSchema).optional(),
|
|
742
767
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
743
|
-
assertionIds:
|
|
768
|
+
assertionIds: import_zod22.z.array(import_zod22.z.string()).optional(),
|
|
744
769
|
/** Linked assertions with per-scenario parameter values */
|
|
745
|
-
assertionLinks:
|
|
770
|
+
assertionLinks: import_zod22.z.array(ScenarioAssertionLinkSchema).optional()
|
|
746
771
|
});
|
|
747
772
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
748
773
|
id: true,
|
|
@@ -753,10 +778,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
753
778
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
754
779
|
|
|
755
780
|
// src/suite/test-suite.ts
|
|
756
|
-
var
|
|
781
|
+
var import_zod23 = require("zod");
|
|
757
782
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
758
783
|
/** IDs of test scenarios in this suite */
|
|
759
|
-
scenarioIds:
|
|
784
|
+
scenarioIds: import_zod23.z.array(import_zod23.z.string())
|
|
760
785
|
});
|
|
761
786
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
762
787
|
id: true,
|
|
@@ -767,21 +792,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
767
792
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
768
793
|
|
|
769
794
|
// src/evaluation/metrics.ts
|
|
770
|
-
var
|
|
771
|
-
var TokenUsageSchema =
|
|
772
|
-
prompt:
|
|
773
|
-
completion:
|
|
774
|
-
total:
|
|
775
|
-
});
|
|
776
|
-
var EvalMetricsSchema =
|
|
777
|
-
totalAssertions:
|
|
778
|
-
passed:
|
|
779
|
-
failed:
|
|
780
|
-
skipped:
|
|
781
|
-
errors:
|
|
782
|
-
passRate:
|
|
783
|
-
avgDuration:
|
|
784
|
-
totalDuration:
|
|
795
|
+
var import_zod24 = require("zod");
|
|
796
|
+
var TokenUsageSchema = import_zod24.z.object({
|
|
797
|
+
prompt: import_zod24.z.number(),
|
|
798
|
+
completion: import_zod24.z.number(),
|
|
799
|
+
total: import_zod24.z.number()
|
|
800
|
+
});
|
|
801
|
+
var EvalMetricsSchema = import_zod24.z.object({
|
|
802
|
+
totalAssertions: import_zod24.z.number(),
|
|
803
|
+
passed: import_zod24.z.number(),
|
|
804
|
+
failed: import_zod24.z.number(),
|
|
805
|
+
skipped: import_zod24.z.number(),
|
|
806
|
+
errors: import_zod24.z.number(),
|
|
807
|
+
passRate: import_zod24.z.number(),
|
|
808
|
+
avgDuration: import_zod24.z.number(),
|
|
809
|
+
totalDuration: import_zod24.z.number()
|
|
785
810
|
});
|
|
786
811
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
787
812
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -791,7 +816,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
791
816
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
792
817
|
return EvalStatus2;
|
|
793
818
|
})(EvalStatus || {});
|
|
794
|
-
var EvalStatusSchema =
|
|
819
|
+
var EvalStatusSchema = import_zod24.z.enum(EvalStatus);
|
|
795
820
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
796
821
|
LLMStepType2["COMPLETION"] = "completion";
|
|
797
822
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -799,52 +824,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
799
824
|
LLMStepType2["THINKING"] = "thinking";
|
|
800
825
|
return LLMStepType2;
|
|
801
826
|
})(LLMStepType || {});
|
|
802
|
-
var LLMTraceStepSchema =
|
|
803
|
-
id:
|
|
804
|
-
stepNumber:
|
|
805
|
-
type:
|
|
806
|
-
model:
|
|
807
|
-
provider:
|
|
808
|
-
startedAt:
|
|
809
|
-
durationMs:
|
|
827
|
+
var LLMTraceStepSchema = import_zod24.z.object({
|
|
828
|
+
id: import_zod24.z.string(),
|
|
829
|
+
stepNumber: import_zod24.z.number(),
|
|
830
|
+
type: import_zod24.z.enum(LLMStepType),
|
|
831
|
+
model: import_zod24.z.string(),
|
|
832
|
+
provider: import_zod24.z.string(),
|
|
833
|
+
startedAt: import_zod24.z.string(),
|
|
834
|
+
durationMs: import_zod24.z.number(),
|
|
810
835
|
tokenUsage: TokenUsageSchema,
|
|
811
|
-
costUsd:
|
|
812
|
-
toolName:
|
|
813
|
-
toolArguments:
|
|
814
|
-
inputPreview:
|
|
815
|
-
outputPreview:
|
|
816
|
-
success:
|
|
817
|
-
error:
|
|
818
|
-
});
|
|
819
|
-
var LLMBreakdownStatsSchema =
|
|
820
|
-
count:
|
|
821
|
-
durationMs:
|
|
822
|
-
tokens:
|
|
823
|
-
costUsd:
|
|
824
|
-
});
|
|
825
|
-
var LLMTraceSummarySchema =
|
|
826
|
-
totalSteps:
|
|
827
|
-
totalDurationMs:
|
|
836
|
+
costUsd: import_zod24.z.number(),
|
|
837
|
+
toolName: import_zod24.z.string().optional(),
|
|
838
|
+
toolArguments: import_zod24.z.string().optional(),
|
|
839
|
+
inputPreview: import_zod24.z.string().optional(),
|
|
840
|
+
outputPreview: import_zod24.z.string().optional(),
|
|
841
|
+
success: import_zod24.z.boolean(),
|
|
842
|
+
error: import_zod24.z.string().optional()
|
|
843
|
+
});
|
|
844
|
+
var LLMBreakdownStatsSchema = import_zod24.z.object({
|
|
845
|
+
count: import_zod24.z.number(),
|
|
846
|
+
durationMs: import_zod24.z.number(),
|
|
847
|
+
tokens: import_zod24.z.number(),
|
|
848
|
+
costUsd: import_zod24.z.number()
|
|
849
|
+
});
|
|
850
|
+
var LLMTraceSummarySchema = import_zod24.z.object({
|
|
851
|
+
totalSteps: import_zod24.z.number(),
|
|
852
|
+
totalDurationMs: import_zod24.z.number(),
|
|
828
853
|
totalTokens: TokenUsageSchema,
|
|
829
|
-
totalCostUsd:
|
|
830
|
-
stepTypeBreakdown:
|
|
831
|
-
modelBreakdown:
|
|
832
|
-
modelsUsed:
|
|
833
|
-
});
|
|
834
|
-
var LLMTraceSchema =
|
|
835
|
-
id:
|
|
836
|
-
steps:
|
|
854
|
+
totalCostUsd: import_zod24.z.number(),
|
|
855
|
+
stepTypeBreakdown: import_zod24.z.record(import_zod24.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
856
|
+
modelBreakdown: import_zod24.z.record(import_zod24.z.string(), LLMBreakdownStatsSchema),
|
|
857
|
+
modelsUsed: import_zod24.z.array(import_zod24.z.string())
|
|
858
|
+
});
|
|
859
|
+
var LLMTraceSchema = import_zod24.z.object({
|
|
860
|
+
id: import_zod24.z.string(),
|
|
861
|
+
steps: import_zod24.z.array(LLMTraceStepSchema),
|
|
837
862
|
summary: LLMTraceSummarySchema
|
|
838
863
|
});
|
|
839
864
|
|
|
840
865
|
// src/evaluation/eval-result.ts
|
|
841
|
-
var
|
|
866
|
+
var import_zod27 = require("zod");
|
|
842
867
|
|
|
843
868
|
// src/evaluation/eval-run.ts
|
|
844
|
-
var
|
|
869
|
+
var import_zod26 = require("zod");
|
|
845
870
|
|
|
846
871
|
// src/evaluation/live-trace.ts
|
|
847
|
-
var
|
|
872
|
+
var import_zod25 = require("zod");
|
|
848
873
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
849
874
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
850
875
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -858,37 +883,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
858
883
|
LiveTraceEventType2["USER"] = "user";
|
|
859
884
|
return LiveTraceEventType2;
|
|
860
885
|
})(LiveTraceEventType || {});
|
|
861
|
-
var LiveTraceEventSchema =
|
|
886
|
+
var LiveTraceEventSchema = import_zod25.z.object({
|
|
862
887
|
/** The evaluation run ID */
|
|
863
|
-
evalRunId:
|
|
888
|
+
evalRunId: import_zod25.z.string(),
|
|
864
889
|
/** The scenario ID being executed */
|
|
865
|
-
scenarioId:
|
|
890
|
+
scenarioId: import_zod25.z.string(),
|
|
866
891
|
/** The scenario name for display */
|
|
867
|
-
scenarioName:
|
|
892
|
+
scenarioName: import_zod25.z.string(),
|
|
868
893
|
/** The target ID (skill, agent, etc.) */
|
|
869
|
-
targetId:
|
|
894
|
+
targetId: import_zod25.z.string(),
|
|
870
895
|
/** The target name for display */
|
|
871
|
-
targetName:
|
|
896
|
+
targetName: import_zod25.z.string(),
|
|
872
897
|
/** Step number in the current scenario execution */
|
|
873
|
-
stepNumber:
|
|
898
|
+
stepNumber: import_zod25.z.number(),
|
|
874
899
|
/** Type of trace event */
|
|
875
|
-
type:
|
|
900
|
+
type: import_zod25.z.enum(LiveTraceEventType),
|
|
876
901
|
/** Tool name if this is a tool_use event */
|
|
877
|
-
toolName:
|
|
902
|
+
toolName: import_zod25.z.string().optional(),
|
|
878
903
|
/** Tool arguments preview (truncated JSON) */
|
|
879
|
-
toolArgs:
|
|
904
|
+
toolArgs: import_zod25.z.string().optional(),
|
|
880
905
|
/** Output preview (truncated text) */
|
|
881
|
-
outputPreview:
|
|
906
|
+
outputPreview: import_zod25.z.string().optional(),
|
|
882
907
|
/** File path for file operations */
|
|
883
|
-
filePath:
|
|
908
|
+
filePath: import_zod25.z.string().optional(),
|
|
884
909
|
/** Elapsed time in milliseconds for progress events */
|
|
885
|
-
elapsedMs:
|
|
910
|
+
elapsedMs: import_zod25.z.number().optional(),
|
|
886
911
|
/** Thinking/reasoning text from Claude */
|
|
887
|
-
thinking:
|
|
912
|
+
thinking: import_zod25.z.string().optional(),
|
|
888
913
|
/** Timestamp when this event occurred */
|
|
889
|
-
timestamp:
|
|
914
|
+
timestamp: import_zod25.z.string(),
|
|
890
915
|
/** Whether this is the final event for this scenario */
|
|
891
|
-
isComplete:
|
|
916
|
+
isComplete: import_zod25.z.boolean()
|
|
892
917
|
});
|
|
893
918
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
894
919
|
function parseTraceEventLine(line) {
|
|
@@ -916,14 +941,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
916
941
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
917
942
|
return TriggerType2;
|
|
918
943
|
})(TriggerType || {});
|
|
919
|
-
var TriggerMetadataSchema =
|
|
920
|
-
version:
|
|
921
|
-
resourceUpdated:
|
|
944
|
+
var TriggerMetadataSchema = import_zod26.z.object({
|
|
945
|
+
version: import_zod26.z.string().optional(),
|
|
946
|
+
resourceUpdated: import_zod26.z.array(import_zod26.z.string()).optional()
|
|
922
947
|
});
|
|
923
|
-
var TriggerSchema =
|
|
924
|
-
id:
|
|
948
|
+
var TriggerSchema = import_zod26.z.object({
|
|
949
|
+
id: import_zod26.z.string(),
|
|
925
950
|
metadata: TriggerMetadataSchema.optional(),
|
|
926
|
-
type:
|
|
951
|
+
type: import_zod26.z.enum(TriggerType)
|
|
927
952
|
});
|
|
928
953
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
929
954
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -941,28 +966,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
941
966
|
FailureSeverity2["LOW"] = "low";
|
|
942
967
|
return FailureSeverity2;
|
|
943
968
|
})(FailureSeverity || {});
|
|
944
|
-
var DiffLineTypeSchema =
|
|
945
|
-
var DiffLineSchema =
|
|
969
|
+
var DiffLineTypeSchema = import_zod26.z.enum(["added", "removed", "unchanged"]);
|
|
970
|
+
var DiffLineSchema = import_zod26.z.object({
|
|
946
971
|
type: DiffLineTypeSchema,
|
|
947
|
-
content:
|
|
948
|
-
lineNumber:
|
|
949
|
-
});
|
|
950
|
-
var DiffContentSchema =
|
|
951
|
-
path:
|
|
952
|
-
expected:
|
|
953
|
-
actual:
|
|
954
|
-
diffLines:
|
|
955
|
-
renamedFrom:
|
|
956
|
-
});
|
|
957
|
-
var CommandExecutionSchema =
|
|
958
|
-
command:
|
|
959
|
-
exitCode:
|
|
960
|
-
output:
|
|
961
|
-
duration:
|
|
962
|
-
});
|
|
963
|
-
var FileModificationSchema =
|
|
964
|
-
path:
|
|
965
|
-
action:
|
|
972
|
+
content: import_zod26.z.string(),
|
|
973
|
+
lineNumber: import_zod26.z.number()
|
|
974
|
+
});
|
|
975
|
+
var DiffContentSchema = import_zod26.z.object({
|
|
976
|
+
path: import_zod26.z.string(),
|
|
977
|
+
expected: import_zod26.z.string(),
|
|
978
|
+
actual: import_zod26.z.string(),
|
|
979
|
+
diffLines: import_zod26.z.array(DiffLineSchema),
|
|
980
|
+
renamedFrom: import_zod26.z.string().optional()
|
|
981
|
+
});
|
|
982
|
+
var CommandExecutionSchema = import_zod26.z.object({
|
|
983
|
+
command: import_zod26.z.string(),
|
|
984
|
+
exitCode: import_zod26.z.number(),
|
|
985
|
+
output: import_zod26.z.string().optional(),
|
|
986
|
+
duration: import_zod26.z.number()
|
|
987
|
+
});
|
|
988
|
+
var FileModificationSchema = import_zod26.z.object({
|
|
989
|
+
path: import_zod26.z.string(),
|
|
990
|
+
action: import_zod26.z.enum(["created", "modified", "deleted"])
|
|
966
991
|
});
|
|
967
992
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
968
993
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -970,75 +995,79 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
970
995
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
971
996
|
return TemplateFileStatus2;
|
|
972
997
|
})(TemplateFileStatus || {});
|
|
973
|
-
var TemplateFileSchema =
|
|
998
|
+
var TemplateFileSchema = import_zod26.z.object({
|
|
974
999
|
/** Relative path within the template */
|
|
975
|
-
path:
|
|
1000
|
+
path: import_zod26.z.string(),
|
|
976
1001
|
/** Full file content after execution */
|
|
977
|
-
content:
|
|
1002
|
+
content: import_zod26.z.string(),
|
|
978
1003
|
/** File status (new, modified, unchanged) */
|
|
979
|
-
status:
|
|
980
|
-
});
|
|
981
|
-
var ApiCallSchema =
|
|
982
|
-
endpoint:
|
|
983
|
-
tokensUsed:
|
|
984
|
-
duration:
|
|
985
|
-
});
|
|
986
|
-
var ExecutionTraceSchema =
|
|
987
|
-
commands:
|
|
988
|
-
filesModified:
|
|
989
|
-
apiCalls:
|
|
990
|
-
totalDuration:
|
|
991
|
-
});
|
|
992
|
-
var FailureAnalysisSchema =
|
|
993
|
-
category:
|
|
994
|
-
severity:
|
|
995
|
-
summary:
|
|
996
|
-
details:
|
|
997
|
-
rootCause:
|
|
998
|
-
suggestedFix:
|
|
999
|
-
relatedAssertions:
|
|
1000
|
-
codeSnippet:
|
|
1001
|
-
similarIssues:
|
|
1002
|
-
patternId:
|
|
1004
|
+
status: import_zod26.z.enum(["new", "modified", "unchanged"])
|
|
1005
|
+
});
|
|
1006
|
+
var ApiCallSchema = import_zod26.z.object({
|
|
1007
|
+
endpoint: import_zod26.z.string(),
|
|
1008
|
+
tokensUsed: import_zod26.z.number(),
|
|
1009
|
+
duration: import_zod26.z.number()
|
|
1010
|
+
});
|
|
1011
|
+
var ExecutionTraceSchema = import_zod26.z.object({
|
|
1012
|
+
commands: import_zod26.z.array(CommandExecutionSchema),
|
|
1013
|
+
filesModified: import_zod26.z.array(FileModificationSchema),
|
|
1014
|
+
apiCalls: import_zod26.z.array(ApiCallSchema),
|
|
1015
|
+
totalDuration: import_zod26.z.number()
|
|
1016
|
+
});
|
|
1017
|
+
var FailureAnalysisSchema = import_zod26.z.object({
|
|
1018
|
+
category: import_zod26.z.enum(FailureCategory),
|
|
1019
|
+
severity: import_zod26.z.enum(FailureSeverity),
|
|
1020
|
+
summary: import_zod26.z.string(),
|
|
1021
|
+
details: import_zod26.z.string(),
|
|
1022
|
+
rootCause: import_zod26.z.string(),
|
|
1023
|
+
suggestedFix: import_zod26.z.string(),
|
|
1024
|
+
relatedAssertions: import_zod26.z.array(import_zod26.z.string()),
|
|
1025
|
+
codeSnippet: import_zod26.z.string().optional(),
|
|
1026
|
+
similarIssues: import_zod26.z.array(import_zod26.z.string()).optional(),
|
|
1027
|
+
patternId: import_zod26.z.string().optional(),
|
|
1003
1028
|
// Extended fields for detailed debugging
|
|
1004
1029
|
diff: DiffContentSchema.optional(),
|
|
1005
1030
|
executionTrace: ExecutionTraceSchema.optional()
|
|
1006
1031
|
});
|
|
1007
1032
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1008
1033
|
/** Agent ID for this run */
|
|
1009
|
-
agentId:
|
|
1034
|
+
agentId: import_zod26.z.string().optional(),
|
|
1010
1035
|
/** Skills group ID for this run */
|
|
1011
|
-
skillsGroupId:
|
|
1036
|
+
skillsGroupId: import_zod26.z.string().optional(),
|
|
1012
1037
|
/** Scenario IDs to run */
|
|
1013
|
-
scenarioIds:
|
|
1038
|
+
scenarioIds: import_zod26.z.array(import_zod26.z.string()),
|
|
1014
1039
|
/** Current status */
|
|
1015
1040
|
status: EvalStatusSchema,
|
|
1016
1041
|
/** Progress percentage (0-100) */
|
|
1017
|
-
progress:
|
|
1042
|
+
progress: import_zod26.z.number(),
|
|
1018
1043
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1019
|
-
results:
|
|
1044
|
+
results: import_zod26.z.array(import_zod26.z.lazy(() => EvalRunResultSchema)),
|
|
1020
1045
|
/** Aggregated metrics across all results */
|
|
1021
1046
|
aggregateMetrics: EvalMetricsSchema,
|
|
1022
1047
|
/** Failure analyses */
|
|
1023
|
-
failureAnalyses:
|
|
1048
|
+
failureAnalyses: import_zod26.z.array(FailureAnalysisSchema).optional(),
|
|
1024
1049
|
/** Aggregated LLM trace summary */
|
|
1025
1050
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
1026
1051
|
/** What triggered this run */
|
|
1027
1052
|
trigger: TriggerSchema.optional(),
|
|
1028
1053
|
/** When the run started (set when evaluation is triggered) */
|
|
1029
|
-
startedAt:
|
|
1054
|
+
startedAt: import_zod26.z.string().optional(),
|
|
1030
1055
|
/** When the run completed */
|
|
1031
|
-
completedAt:
|
|
1056
|
+
completedAt: import_zod26.z.string().optional(),
|
|
1032
1057
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1033
|
-
liveTraceEvents:
|
|
1058
|
+
liveTraceEvents: import_zod26.z.array(LiveTraceEventSchema).optional(),
|
|
1034
1059
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1035
|
-
jobId:
|
|
1060
|
+
jobId: import_zod26.z.string().optional(),
|
|
1036
1061
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1037
|
-
jobStatus:
|
|
1062
|
+
jobStatus: import_zod26.z.string().optional(),
|
|
1038
1063
|
/** Remote job error message if the job failed */
|
|
1039
|
-
jobError:
|
|
1064
|
+
jobError: import_zod26.z.string().optional(),
|
|
1040
1065
|
/** Timestamp of the last job status check */
|
|
1041
|
-
jobStatusCheckedAt:
|
|
1066
|
+
jobStatusCheckedAt: import_zod26.z.string().optional(),
|
|
1067
|
+
/** MCP server IDs to enable for this run (optional) */
|
|
1068
|
+
mcpIds: import_zod26.z.array(import_zod26.z.string()).optional(),
|
|
1069
|
+
/** Sub-agent IDs to enable for this run (optional) */
|
|
1070
|
+
subAgentIds: import_zod26.z.array(import_zod26.z.string()).optional()
|
|
1042
1071
|
});
|
|
1043
1072
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
1044
1073
|
id: true,
|
|
@@ -1051,28 +1080,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
1051
1080
|
startedAt: true,
|
|
1052
1081
|
completedAt: true
|
|
1053
1082
|
});
|
|
1054
|
-
var EvaluationProgressSchema =
|
|
1055
|
-
runId:
|
|
1056
|
-
targetId:
|
|
1057
|
-
totalScenarios:
|
|
1058
|
-
completedScenarios:
|
|
1059
|
-
scenarioProgress:
|
|
1060
|
-
|
|
1061
|
-
scenarioId:
|
|
1062
|
-
currentStep:
|
|
1063
|
-
error:
|
|
1083
|
+
var EvaluationProgressSchema = import_zod26.z.object({
|
|
1084
|
+
runId: import_zod26.z.string(),
|
|
1085
|
+
targetId: import_zod26.z.string(),
|
|
1086
|
+
totalScenarios: import_zod26.z.number(),
|
|
1087
|
+
completedScenarios: import_zod26.z.number(),
|
|
1088
|
+
scenarioProgress: import_zod26.z.array(
|
|
1089
|
+
import_zod26.z.object({
|
|
1090
|
+
scenarioId: import_zod26.z.string(),
|
|
1091
|
+
currentStep: import_zod26.z.string(),
|
|
1092
|
+
error: import_zod26.z.string().optional()
|
|
1064
1093
|
})
|
|
1065
1094
|
),
|
|
1066
|
-
createdAt:
|
|
1095
|
+
createdAt: import_zod26.z.number()
|
|
1067
1096
|
});
|
|
1068
|
-
var EvaluationLogSchema =
|
|
1069
|
-
runId:
|
|
1070
|
-
scenarioId:
|
|
1071
|
-
log:
|
|
1072
|
-
level:
|
|
1073
|
-
message:
|
|
1074
|
-
args:
|
|
1075
|
-
error:
|
|
1097
|
+
var EvaluationLogSchema = import_zod26.z.object({
|
|
1098
|
+
runId: import_zod26.z.string(),
|
|
1099
|
+
scenarioId: import_zod26.z.string(),
|
|
1100
|
+
log: import_zod26.z.object({
|
|
1101
|
+
level: import_zod26.z.enum(["info", "error", "debug"]),
|
|
1102
|
+
message: import_zod26.z.string().optional(),
|
|
1103
|
+
args: import_zod26.z.array(import_zod26.z.any()).optional(),
|
|
1104
|
+
error: import_zod26.z.string().optional()
|
|
1076
1105
|
})
|
|
1077
1106
|
});
|
|
1078
1107
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -1085,91 +1114,91 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
1085
1114
|
AssertionResultStatus2["ERROR"] = "error";
|
|
1086
1115
|
return AssertionResultStatus2;
|
|
1087
1116
|
})(AssertionResultStatus || {});
|
|
1088
|
-
var AssertionResultSchema =
|
|
1089
|
-
id:
|
|
1090
|
-
assertionId:
|
|
1091
|
-
assertionType:
|
|
1092
|
-
assertionName:
|
|
1093
|
-
status:
|
|
1094
|
-
message:
|
|
1095
|
-
expected:
|
|
1096
|
-
actual:
|
|
1097
|
-
duration:
|
|
1098
|
-
details:
|
|
1099
|
-
llmTraceSteps:
|
|
1100
|
-
});
|
|
1101
|
-
var EvalRunResultSchema =
|
|
1102
|
-
id:
|
|
1103
|
-
targetId:
|
|
1104
|
-
targetName:
|
|
1105
|
-
scenarioId:
|
|
1106
|
-
scenarioName:
|
|
1117
|
+
var AssertionResultSchema = import_zod27.z.object({
|
|
1118
|
+
id: import_zod27.z.string(),
|
|
1119
|
+
assertionId: import_zod27.z.string(),
|
|
1120
|
+
assertionType: import_zod27.z.string(),
|
|
1121
|
+
assertionName: import_zod27.z.string(),
|
|
1122
|
+
status: import_zod27.z.enum(AssertionResultStatus),
|
|
1123
|
+
message: import_zod27.z.string().optional(),
|
|
1124
|
+
expected: import_zod27.z.string().optional(),
|
|
1125
|
+
actual: import_zod27.z.string().optional(),
|
|
1126
|
+
duration: import_zod27.z.number().optional(),
|
|
1127
|
+
details: import_zod27.z.record(import_zod27.z.string(), import_zod27.z.unknown()).optional(),
|
|
1128
|
+
llmTraceSteps: import_zod27.z.array(LLMTraceStepSchema).optional()
|
|
1129
|
+
});
|
|
1130
|
+
var EvalRunResultSchema = import_zod27.z.object({
|
|
1131
|
+
id: import_zod27.z.string(),
|
|
1132
|
+
targetId: import_zod27.z.string(),
|
|
1133
|
+
targetName: import_zod27.z.string().optional(),
|
|
1134
|
+
scenarioId: import_zod27.z.string(),
|
|
1135
|
+
scenarioName: import_zod27.z.string(),
|
|
1107
1136
|
modelConfig: ModelConfigSchema.optional(),
|
|
1108
|
-
assertionResults:
|
|
1137
|
+
assertionResults: import_zod27.z.array(AssertionResultSchema),
|
|
1109
1138
|
metrics: EvalMetricsSchema.optional(),
|
|
1110
|
-
passed:
|
|
1111
|
-
failed:
|
|
1112
|
-
passRate:
|
|
1113
|
-
duration:
|
|
1114
|
-
outputText:
|
|
1115
|
-
files:
|
|
1116
|
-
fileDiffs:
|
|
1139
|
+
passed: import_zod27.z.number(),
|
|
1140
|
+
failed: import_zod27.z.number(),
|
|
1141
|
+
passRate: import_zod27.z.number(),
|
|
1142
|
+
duration: import_zod27.z.number(),
|
|
1143
|
+
outputText: import_zod27.z.string().optional(),
|
|
1144
|
+
files: import_zod27.z.array(ExpectedFileSchema).optional(),
|
|
1145
|
+
fileDiffs: import_zod27.z.array(DiffContentSchema).optional(),
|
|
1117
1146
|
/** Full template files after execution with status indicators */
|
|
1118
|
-
templateFiles:
|
|
1119
|
-
startedAt:
|
|
1120
|
-
completedAt:
|
|
1147
|
+
templateFiles: import_zod27.z.array(TemplateFileSchema).optional(),
|
|
1148
|
+
startedAt: import_zod27.z.string().optional(),
|
|
1149
|
+
completedAt: import_zod27.z.string().optional(),
|
|
1121
1150
|
llmTrace: LLMTraceSchema.optional()
|
|
1122
1151
|
});
|
|
1123
|
-
var PromptResultSchema =
|
|
1124
|
-
text:
|
|
1125
|
-
files:
|
|
1126
|
-
finishReason:
|
|
1127
|
-
reasoning:
|
|
1128
|
-
reasoningDetails:
|
|
1129
|
-
toolCalls:
|
|
1130
|
-
toolResults:
|
|
1131
|
-
warnings:
|
|
1132
|
-
sources:
|
|
1133
|
-
steps:
|
|
1134
|
-
generationTimeMs:
|
|
1135
|
-
prompt:
|
|
1136
|
-
systemPrompt:
|
|
1137
|
-
usage:
|
|
1138
|
-
totalTokens:
|
|
1139
|
-
totalMicrocentsSpent:
|
|
1152
|
+
var PromptResultSchema = import_zod27.z.object({
|
|
1153
|
+
text: import_zod27.z.string(),
|
|
1154
|
+
files: import_zod27.z.array(import_zod27.z.unknown()).optional(),
|
|
1155
|
+
finishReason: import_zod27.z.string().optional(),
|
|
1156
|
+
reasoning: import_zod27.z.string().optional(),
|
|
1157
|
+
reasoningDetails: import_zod27.z.unknown().optional(),
|
|
1158
|
+
toolCalls: import_zod27.z.array(import_zod27.z.unknown()).optional(),
|
|
1159
|
+
toolResults: import_zod27.z.array(import_zod27.z.unknown()).optional(),
|
|
1160
|
+
warnings: import_zod27.z.array(import_zod27.z.unknown()).optional(),
|
|
1161
|
+
sources: import_zod27.z.array(import_zod27.z.unknown()).optional(),
|
|
1162
|
+
steps: import_zod27.z.array(import_zod27.z.unknown()),
|
|
1163
|
+
generationTimeMs: import_zod27.z.number(),
|
|
1164
|
+
prompt: import_zod27.z.string(),
|
|
1165
|
+
systemPrompt: import_zod27.z.string(),
|
|
1166
|
+
usage: import_zod27.z.object({
|
|
1167
|
+
totalTokens: import_zod27.z.number().optional(),
|
|
1168
|
+
totalMicrocentsSpent: import_zod27.z.number().optional()
|
|
1140
1169
|
})
|
|
1141
1170
|
});
|
|
1142
|
-
var EvaluationResultSchema =
|
|
1143
|
-
id:
|
|
1144
|
-
runId:
|
|
1145
|
-
timestamp:
|
|
1171
|
+
var EvaluationResultSchema = import_zod27.z.object({
|
|
1172
|
+
id: import_zod27.z.string(),
|
|
1173
|
+
runId: import_zod27.z.string(),
|
|
1174
|
+
timestamp: import_zod27.z.number(),
|
|
1146
1175
|
promptResult: PromptResultSchema,
|
|
1147
|
-
testResults:
|
|
1148
|
-
tags:
|
|
1149
|
-
feedback:
|
|
1150
|
-
score:
|
|
1151
|
-
suiteId:
|
|
1152
|
-
});
|
|
1153
|
-
var LeanEvaluationResultSchema =
|
|
1154
|
-
id:
|
|
1155
|
-
runId:
|
|
1156
|
-
timestamp:
|
|
1157
|
-
tags:
|
|
1158
|
-
scenarioId:
|
|
1159
|
-
scenarioVersion:
|
|
1160
|
-
targetId:
|
|
1161
|
-
targetVersion:
|
|
1162
|
-
suiteId:
|
|
1163
|
-
score:
|
|
1164
|
-
time:
|
|
1165
|
-
microcentsSpent:
|
|
1176
|
+
testResults: import_zod27.z.array(import_zod27.z.unknown()),
|
|
1177
|
+
tags: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
1178
|
+
feedback: import_zod27.z.string().optional(),
|
|
1179
|
+
score: import_zod27.z.number(),
|
|
1180
|
+
suiteId: import_zod27.z.string().optional()
|
|
1181
|
+
});
|
|
1182
|
+
var LeanEvaluationResultSchema = import_zod27.z.object({
|
|
1183
|
+
id: import_zod27.z.string(),
|
|
1184
|
+
runId: import_zod27.z.string(),
|
|
1185
|
+
timestamp: import_zod27.z.number(),
|
|
1186
|
+
tags: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
1187
|
+
scenarioId: import_zod27.z.string(),
|
|
1188
|
+
scenarioVersion: import_zod27.z.number().optional(),
|
|
1189
|
+
targetId: import_zod27.z.string(),
|
|
1190
|
+
targetVersion: import_zod27.z.number().optional(),
|
|
1191
|
+
suiteId: import_zod27.z.string().optional(),
|
|
1192
|
+
score: import_zod27.z.number(),
|
|
1193
|
+
time: import_zod27.z.number().optional(),
|
|
1194
|
+
microcentsSpent: import_zod27.z.number().optional()
|
|
1166
1195
|
});
|
|
1167
1196
|
|
|
1168
1197
|
// src/project/project.ts
|
|
1169
|
-
var
|
|
1198
|
+
var import_zod28 = require("zod");
|
|
1170
1199
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1171
|
-
appId:
|
|
1172
|
-
appSecret:
|
|
1200
|
+
appId: import_zod28.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
1201
|
+
appSecret: import_zod28.z.string().optional().describe("The secret of the app in Dev Center")
|
|
1173
1202
|
});
|
|
1174
1203
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1175
1204
|
id: true,
|
|
@@ -1180,10 +1209,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
|
|
|
1180
1209
|
var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
|
|
1181
1210
|
|
|
1182
1211
|
// src/template/template.ts
|
|
1183
|
-
var
|
|
1212
|
+
var import_zod29 = require("zod");
|
|
1184
1213
|
var TemplateSchema = TenantEntitySchema.extend({
|
|
1185
1214
|
/** URL to download the template from */
|
|
1186
|
-
downloadUrl:
|
|
1215
|
+
downloadUrl: import_zod29.z.url()
|
|
1187
1216
|
});
|
|
1188
1217
|
var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
1189
1218
|
id: true,
|
|
@@ -1319,9 +1348,11 @@ function getSystemAssertion(id) {
|
|
|
1319
1348
|
CreateAgentInputSchema,
|
|
1320
1349
|
CreateCustomAssertionInputSchema,
|
|
1321
1350
|
CreateEvalRunInputSchema,
|
|
1351
|
+
CreateMcpInputSchema,
|
|
1322
1352
|
CreateProjectInputSchema,
|
|
1323
1353
|
CreateSkillInputSchema,
|
|
1324
1354
|
CreateSkillsGroupInputSchema,
|
|
1355
|
+
CreateSubAgentInputSchema,
|
|
1325
1356
|
CreateTemplateInputSchema,
|
|
1326
1357
|
CreateTestScenarioInputSchema,
|
|
1327
1358
|
CreateTestSuiteInputSchema,
|
|
@@ -1360,7 +1391,9 @@ function getSystemAssertion(id) {
|
|
|
1360
1391
|
LlmJudgeAssertionSchema,
|
|
1361
1392
|
LlmJudgeConfigSchema,
|
|
1362
1393
|
LocalProjectConfigSchema,
|
|
1394
|
+
MCPEntitySchema,
|
|
1363
1395
|
MCPServerConfigSchema,
|
|
1396
|
+
MCP_SERVERS_JSON_KEY,
|
|
1364
1397
|
MetaSiteConfigSchema,
|
|
1365
1398
|
ModelConfigSchema,
|
|
1366
1399
|
ModelIds,
|
|
@@ -1381,6 +1414,7 @@ function getSystemAssertion(id) {
|
|
|
1381
1414
|
SkillWasCalledAssertionSchema,
|
|
1382
1415
|
SkillWasCalledConfigSchema,
|
|
1383
1416
|
SkillsGroupSchema,
|
|
1417
|
+
SubAgentSchema,
|
|
1384
1418
|
TRACE_EVENT_PREFIX,
|
|
1385
1419
|
TargetSchema,
|
|
1386
1420
|
TemplateFileSchema,
|
|
@@ -1401,9 +1435,11 @@ function getSystemAssertion(id) {
|
|
|
1401
1435
|
TriggerType,
|
|
1402
1436
|
UpdateAgentInputSchema,
|
|
1403
1437
|
UpdateCustomAssertionInputSchema,
|
|
1438
|
+
UpdateMcpInputSchema,
|
|
1404
1439
|
UpdateProjectInputSchema,
|
|
1405
1440
|
UpdateSkillInputSchema,
|
|
1406
1441
|
UpdateSkillsGroupInputSchema,
|
|
1442
|
+
UpdateSubAgentInputSchema,
|
|
1407
1443
|
UpdateTemplateInputSchema,
|
|
1408
1444
|
UpdateTestScenarioInputSchema,
|
|
1409
1445
|
UpdateTestSuiteInputSchema,
|