@wix/evalforge-types 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +288 -246
- package/build/index.js.map +4 -4
- package/build/index.mjs +284 -246
- package/build/index.mjs.map +4 -4
- package/build/types/scenario/assertions.d.ts +57 -0
- package/build/types/scenario/index.d.ts +1 -0
- package/build/types/scenario/test-scenario.d.ts +48 -0
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -374,34 +374,67 @@ var TestSchema = z17.discriminatedUnion("type", [
|
|
|
374
374
|
PlaywrightNLTestSchema
|
|
375
375
|
]);
|
|
376
376
|
|
|
377
|
-
// src/scenario/
|
|
377
|
+
// src/scenario/assertions.ts
|
|
378
378
|
import { z as z18 } from "zod";
|
|
379
|
-
var
|
|
379
|
+
var SkillWasCalledAssertionSchema = z18.object({
|
|
380
|
+
type: z18.literal("skill_was_called"),
|
|
381
|
+
/** Name of the skill that must have been called (matched against trace Skill tool args) */
|
|
382
|
+
skillName: z18.string()
|
|
383
|
+
});
|
|
384
|
+
var BuildPassedAssertionSchema = z18.object({
|
|
385
|
+
type: z18.literal("build_passed"),
|
|
386
|
+
/** Command to run (default: "yarn build") */
|
|
387
|
+
command: z18.string().optional(),
|
|
388
|
+
/** Expected exit code (default: 0) */
|
|
389
|
+
expectedExitCode: z18.number().int().optional()
|
|
390
|
+
});
|
|
391
|
+
var LlmJudgeAssertionSchema = z18.object({
|
|
392
|
+
type: z18.literal("llm_judge"),
|
|
393
|
+
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
394
|
+
prompt: z18.string(),
|
|
395
|
+
/** Optional system prompt for the judge (default asks for JSON with score) */
|
|
396
|
+
systemPrompt: z18.string().optional(),
|
|
397
|
+
/** Minimum score to pass (0–100, default 70) */
|
|
398
|
+
minScore: z18.number().int().min(0).max(100).optional(),
|
|
399
|
+
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
400
|
+
model: z18.string().optional(),
|
|
401
|
+
maxTokens: z18.number().int().optional(),
|
|
402
|
+
temperature: z18.number().min(0).max(1).optional()
|
|
403
|
+
});
|
|
404
|
+
var AssertionSchema = z18.discriminatedUnion("type", [
|
|
405
|
+
SkillWasCalledAssertionSchema,
|
|
406
|
+
BuildPassedAssertionSchema,
|
|
407
|
+
LlmJudgeAssertionSchema
|
|
408
|
+
]);
|
|
409
|
+
|
|
410
|
+
// src/scenario/environment.ts
|
|
411
|
+
import { z as z19 } from "zod";
|
|
412
|
+
var LocalProjectConfigSchema = z19.object({
|
|
380
413
|
/** Template ID to use for the local project */
|
|
381
|
-
templateId:
|
|
414
|
+
templateId: z19.string().optional(),
|
|
382
415
|
/** Files to create in the project */
|
|
383
|
-
files:
|
|
384
|
-
|
|
385
|
-
path:
|
|
386
|
-
content:
|
|
416
|
+
files: z19.array(
|
|
417
|
+
z19.object({
|
|
418
|
+
path: z19.string().min(1),
|
|
419
|
+
content: z19.string().min(1)
|
|
387
420
|
})
|
|
388
421
|
).optional()
|
|
389
422
|
});
|
|
390
|
-
var MetaSiteConfigSchema =
|
|
391
|
-
configurations:
|
|
392
|
-
|
|
393
|
-
name:
|
|
394
|
-
apiCalls:
|
|
395
|
-
|
|
396
|
-
url:
|
|
397
|
-
method:
|
|
398
|
-
body:
|
|
423
|
+
var MetaSiteConfigSchema = z19.object({
|
|
424
|
+
configurations: z19.array(
|
|
425
|
+
z19.object({
|
|
426
|
+
name: z19.string().min(1),
|
|
427
|
+
apiCalls: z19.array(
|
|
428
|
+
z19.object({
|
|
429
|
+
url: z19.string().url(),
|
|
430
|
+
method: z19.enum(["POST", "PUT"]),
|
|
431
|
+
body: z19.string()
|
|
399
432
|
})
|
|
400
433
|
)
|
|
401
434
|
})
|
|
402
435
|
).optional()
|
|
403
436
|
});
|
|
404
|
-
var EnvironmentSchema =
|
|
437
|
+
var EnvironmentSchema = z19.object({
|
|
405
438
|
/** Local project configuration */
|
|
406
439
|
localProject: LocalProjectConfigSchema.optional(),
|
|
407
440
|
/** Meta site configuration */
|
|
@@ -409,18 +442,20 @@ var EnvironmentSchema = z18.object({
|
|
|
409
442
|
});
|
|
410
443
|
|
|
411
444
|
// src/scenario/test-scenario.ts
|
|
412
|
-
import { z as
|
|
413
|
-
var ExpectedFileSchema =
|
|
445
|
+
import { z as z20 } from "zod";
|
|
446
|
+
var ExpectedFileSchema = z20.object({
|
|
414
447
|
/** Relative path where the file should be created */
|
|
415
|
-
path:
|
|
448
|
+
path: z20.string(),
|
|
416
449
|
/** Optional expected content */
|
|
417
|
-
content:
|
|
450
|
+
content: z20.string().optional()
|
|
418
451
|
});
|
|
419
452
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
420
453
|
/** The prompt sent to the agent to trigger the task */
|
|
421
|
-
triggerPrompt:
|
|
454
|
+
triggerPrompt: z20.string().min(10),
|
|
422
455
|
/** ID of the template to use for this scenario */
|
|
423
|
-
templateId:
|
|
456
|
+
templateId: z20.string().optional(),
|
|
457
|
+
/** Assertions to evaluate for this scenario */
|
|
458
|
+
assertions: z20.array(AssertionSchema).optional()
|
|
424
459
|
});
|
|
425
460
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
426
461
|
id: true,
|
|
@@ -431,10 +466,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
431
466
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
432
467
|
|
|
433
468
|
// src/suite/test-suite.ts
|
|
434
|
-
import { z as
|
|
469
|
+
import { z as z21 } from "zod";
|
|
435
470
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
436
471
|
/** IDs of test scenarios in this suite */
|
|
437
|
-
scenarioIds:
|
|
472
|
+
scenarioIds: z21.array(z21.string())
|
|
438
473
|
});
|
|
439
474
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
440
475
|
id: true,
|
|
@@ -445,21 +480,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
445
480
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
446
481
|
|
|
447
482
|
// src/evaluation/metrics.ts
|
|
448
|
-
import { z as
|
|
449
|
-
var TokenUsageSchema =
|
|
450
|
-
prompt:
|
|
451
|
-
completion:
|
|
452
|
-
total:
|
|
453
|
-
});
|
|
454
|
-
var EvalMetricsSchema =
|
|
455
|
-
totalAssertions:
|
|
456
|
-
passed:
|
|
457
|
-
failed:
|
|
458
|
-
skipped:
|
|
459
|
-
errors:
|
|
460
|
-
passRate:
|
|
461
|
-
avgDuration:
|
|
462
|
-
totalDuration:
|
|
483
|
+
import { z as z22 } from "zod";
|
|
484
|
+
var TokenUsageSchema = z22.object({
|
|
485
|
+
prompt: z22.number(),
|
|
486
|
+
completion: z22.number(),
|
|
487
|
+
total: z22.number()
|
|
488
|
+
});
|
|
489
|
+
var EvalMetricsSchema = z22.object({
|
|
490
|
+
totalAssertions: z22.number(),
|
|
491
|
+
passed: z22.number(),
|
|
492
|
+
failed: z22.number(),
|
|
493
|
+
skipped: z22.number(),
|
|
494
|
+
errors: z22.number(),
|
|
495
|
+
passRate: z22.number(),
|
|
496
|
+
avgDuration: z22.number(),
|
|
497
|
+
totalDuration: z22.number()
|
|
463
498
|
});
|
|
464
499
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
465
500
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -469,7 +504,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
469
504
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
470
505
|
return EvalStatus2;
|
|
471
506
|
})(EvalStatus || {});
|
|
472
|
-
var EvalStatusSchema =
|
|
507
|
+
var EvalStatusSchema = z22.enum(EvalStatus);
|
|
473
508
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
474
509
|
LLMStepType2["COMPLETION"] = "completion";
|
|
475
510
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -477,52 +512,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
477
512
|
LLMStepType2["THINKING"] = "thinking";
|
|
478
513
|
return LLMStepType2;
|
|
479
514
|
})(LLMStepType || {});
|
|
480
|
-
var LLMTraceStepSchema =
|
|
481
|
-
id:
|
|
482
|
-
stepNumber:
|
|
483
|
-
type:
|
|
484
|
-
model:
|
|
485
|
-
provider:
|
|
486
|
-
startedAt:
|
|
487
|
-
durationMs:
|
|
515
|
+
var LLMTraceStepSchema = z22.object({
|
|
516
|
+
id: z22.string(),
|
|
517
|
+
stepNumber: z22.number(),
|
|
518
|
+
type: z22.enum(LLMStepType),
|
|
519
|
+
model: z22.string(),
|
|
520
|
+
provider: z22.string(),
|
|
521
|
+
startedAt: z22.string(),
|
|
522
|
+
durationMs: z22.number(),
|
|
488
523
|
tokenUsage: TokenUsageSchema,
|
|
489
|
-
costUsd:
|
|
490
|
-
toolName:
|
|
491
|
-
toolArguments:
|
|
492
|
-
inputPreview:
|
|
493
|
-
outputPreview:
|
|
494
|
-
success:
|
|
495
|
-
error:
|
|
496
|
-
});
|
|
497
|
-
var LLMBreakdownStatsSchema =
|
|
498
|
-
count:
|
|
499
|
-
durationMs:
|
|
500
|
-
tokens:
|
|
501
|
-
costUsd:
|
|
502
|
-
});
|
|
503
|
-
var LLMTraceSummarySchema =
|
|
504
|
-
totalSteps:
|
|
505
|
-
totalDurationMs:
|
|
524
|
+
costUsd: z22.number(),
|
|
525
|
+
toolName: z22.string().optional(),
|
|
526
|
+
toolArguments: z22.string().optional(),
|
|
527
|
+
inputPreview: z22.string().optional(),
|
|
528
|
+
outputPreview: z22.string().optional(),
|
|
529
|
+
success: z22.boolean(),
|
|
530
|
+
error: z22.string().optional()
|
|
531
|
+
});
|
|
532
|
+
var LLMBreakdownStatsSchema = z22.object({
|
|
533
|
+
count: z22.number(),
|
|
534
|
+
durationMs: z22.number(),
|
|
535
|
+
tokens: z22.number(),
|
|
536
|
+
costUsd: z22.number()
|
|
537
|
+
});
|
|
538
|
+
var LLMTraceSummarySchema = z22.object({
|
|
539
|
+
totalSteps: z22.number(),
|
|
540
|
+
totalDurationMs: z22.number(),
|
|
506
541
|
totalTokens: TokenUsageSchema,
|
|
507
|
-
totalCostUsd:
|
|
508
|
-
stepTypeBreakdown:
|
|
509
|
-
modelBreakdown:
|
|
510
|
-
modelsUsed:
|
|
511
|
-
});
|
|
512
|
-
var LLMTraceSchema =
|
|
513
|
-
id:
|
|
514
|
-
steps:
|
|
542
|
+
totalCostUsd: z22.number(),
|
|
543
|
+
stepTypeBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema).optional(),
|
|
544
|
+
modelBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema),
|
|
545
|
+
modelsUsed: z22.array(z22.string())
|
|
546
|
+
});
|
|
547
|
+
var LLMTraceSchema = z22.object({
|
|
548
|
+
id: z22.string(),
|
|
549
|
+
steps: z22.array(LLMTraceStepSchema),
|
|
515
550
|
summary: LLMTraceSummarySchema
|
|
516
551
|
});
|
|
517
552
|
|
|
518
553
|
// src/evaluation/eval-result.ts
|
|
519
|
-
import { z as
|
|
554
|
+
import { z as z25 } from "zod";
|
|
520
555
|
|
|
521
556
|
// src/evaluation/eval-run.ts
|
|
522
|
-
import { z as
|
|
557
|
+
import { z as z24 } from "zod";
|
|
523
558
|
|
|
524
559
|
// src/evaluation/live-trace.ts
|
|
525
|
-
import { z as
|
|
560
|
+
import { z as z23 } from "zod";
|
|
526
561
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
527
562
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
528
563
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -531,31 +566,31 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
531
566
|
LiveTraceEventType2["DIAGNOSTIC"] = "diagnostic";
|
|
532
567
|
return LiveTraceEventType2;
|
|
533
568
|
})(LiveTraceEventType || {});
|
|
534
|
-
var LiveTraceEventSchema =
|
|
569
|
+
var LiveTraceEventSchema = z23.object({
|
|
535
570
|
/** The evaluation run ID */
|
|
536
|
-
evalRunId:
|
|
571
|
+
evalRunId: z23.string(),
|
|
537
572
|
/** The scenario ID being executed */
|
|
538
|
-
scenarioId:
|
|
573
|
+
scenarioId: z23.string(),
|
|
539
574
|
/** The scenario name for display */
|
|
540
|
-
scenarioName:
|
|
575
|
+
scenarioName: z23.string(),
|
|
541
576
|
/** The target ID (skill, agent, etc.) */
|
|
542
|
-
targetId:
|
|
577
|
+
targetId: z23.string(),
|
|
543
578
|
/** The target name for display */
|
|
544
|
-
targetName:
|
|
579
|
+
targetName: z23.string(),
|
|
545
580
|
/** Step number in the current scenario execution */
|
|
546
|
-
stepNumber:
|
|
581
|
+
stepNumber: z23.number(),
|
|
547
582
|
/** Type of trace event */
|
|
548
|
-
type:
|
|
583
|
+
type: z23.enum(LiveTraceEventType),
|
|
549
584
|
/** Tool name if this is a tool_use event */
|
|
550
|
-
toolName:
|
|
585
|
+
toolName: z23.string().optional(),
|
|
551
586
|
/** Tool arguments preview (truncated JSON) */
|
|
552
|
-
toolArgs:
|
|
587
|
+
toolArgs: z23.string().optional(),
|
|
553
588
|
/** Output preview (truncated text) */
|
|
554
|
-
outputPreview:
|
|
589
|
+
outputPreview: z23.string().optional(),
|
|
555
590
|
/** Timestamp when this event occurred */
|
|
556
|
-
timestamp:
|
|
591
|
+
timestamp: z23.string(),
|
|
557
592
|
/** Whether this is the final event for this scenario */
|
|
558
|
-
isComplete:
|
|
593
|
+
isComplete: z23.boolean()
|
|
559
594
|
});
|
|
560
595
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
561
596
|
function parseTraceEventLine(line) {
|
|
@@ -583,14 +618,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
583
618
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
584
619
|
return TriggerType2;
|
|
585
620
|
})(TriggerType || {});
|
|
586
|
-
var TriggerMetadataSchema =
|
|
587
|
-
version:
|
|
588
|
-
resourceUpdated:
|
|
621
|
+
var TriggerMetadataSchema = z24.object({
|
|
622
|
+
version: z24.string().optional(),
|
|
623
|
+
resourceUpdated: z24.array(z24.string()).optional()
|
|
589
624
|
});
|
|
590
|
-
var TriggerSchema =
|
|
591
|
-
id:
|
|
625
|
+
var TriggerSchema = z24.object({
|
|
626
|
+
id: z24.string(),
|
|
592
627
|
metadata: TriggerMetadataSchema.optional(),
|
|
593
|
-
type:
|
|
628
|
+
type: z24.enum(TriggerType)
|
|
594
629
|
});
|
|
595
630
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
596
631
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -608,89 +643,89 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
608
643
|
FailureSeverity2["LOW"] = "low";
|
|
609
644
|
return FailureSeverity2;
|
|
610
645
|
})(FailureSeverity || {});
|
|
611
|
-
var DiffLineTypeSchema =
|
|
612
|
-
var DiffLineSchema =
|
|
646
|
+
var DiffLineTypeSchema = z24.enum(["added", "removed", "unchanged"]);
|
|
647
|
+
var DiffLineSchema = z24.object({
|
|
613
648
|
type: DiffLineTypeSchema,
|
|
614
|
-
content:
|
|
615
|
-
lineNumber:
|
|
616
|
-
});
|
|
617
|
-
var DiffContentSchema =
|
|
618
|
-
path:
|
|
619
|
-
expected:
|
|
620
|
-
actual:
|
|
621
|
-
diffLines:
|
|
622
|
-
});
|
|
623
|
-
var CommandExecutionSchema =
|
|
624
|
-
command:
|
|
625
|
-
exitCode:
|
|
626
|
-
output:
|
|
627
|
-
duration:
|
|
628
|
-
});
|
|
629
|
-
var FileModificationSchema =
|
|
630
|
-
path:
|
|
631
|
-
action:
|
|
632
|
-
});
|
|
633
|
-
var ApiCallSchema =
|
|
634
|
-
endpoint:
|
|
635
|
-
tokensUsed:
|
|
636
|
-
duration:
|
|
637
|
-
});
|
|
638
|
-
var ExecutionTraceSchema =
|
|
639
|
-
commands:
|
|
640
|
-
filesModified:
|
|
641
|
-
apiCalls:
|
|
642
|
-
totalDuration:
|
|
643
|
-
});
|
|
644
|
-
var FailureAnalysisSchema =
|
|
645
|
-
category:
|
|
646
|
-
severity:
|
|
647
|
-
summary:
|
|
648
|
-
details:
|
|
649
|
-
rootCause:
|
|
650
|
-
suggestedFix:
|
|
651
|
-
relatedAssertions:
|
|
652
|
-
codeSnippet:
|
|
653
|
-
similarIssues:
|
|
654
|
-
patternId:
|
|
649
|
+
content: z24.string(),
|
|
650
|
+
lineNumber: z24.number()
|
|
651
|
+
});
|
|
652
|
+
var DiffContentSchema = z24.object({
|
|
653
|
+
path: z24.string(),
|
|
654
|
+
expected: z24.string(),
|
|
655
|
+
actual: z24.string(),
|
|
656
|
+
diffLines: z24.array(DiffLineSchema)
|
|
657
|
+
});
|
|
658
|
+
var CommandExecutionSchema = z24.object({
|
|
659
|
+
command: z24.string(),
|
|
660
|
+
exitCode: z24.number(),
|
|
661
|
+
output: z24.string().optional(),
|
|
662
|
+
duration: z24.number()
|
|
663
|
+
});
|
|
664
|
+
var FileModificationSchema = z24.object({
|
|
665
|
+
path: z24.string(),
|
|
666
|
+
action: z24.enum(["created", "modified", "deleted"])
|
|
667
|
+
});
|
|
668
|
+
var ApiCallSchema = z24.object({
|
|
669
|
+
endpoint: z24.string(),
|
|
670
|
+
tokensUsed: z24.number(),
|
|
671
|
+
duration: z24.number()
|
|
672
|
+
});
|
|
673
|
+
var ExecutionTraceSchema = z24.object({
|
|
674
|
+
commands: z24.array(CommandExecutionSchema),
|
|
675
|
+
filesModified: z24.array(FileModificationSchema),
|
|
676
|
+
apiCalls: z24.array(ApiCallSchema),
|
|
677
|
+
totalDuration: z24.number()
|
|
678
|
+
});
|
|
679
|
+
var FailureAnalysisSchema = z24.object({
|
|
680
|
+
category: z24.enum(FailureCategory),
|
|
681
|
+
severity: z24.enum(FailureSeverity),
|
|
682
|
+
summary: z24.string(),
|
|
683
|
+
details: z24.string(),
|
|
684
|
+
rootCause: z24.string(),
|
|
685
|
+
suggestedFix: z24.string(),
|
|
686
|
+
relatedAssertions: z24.array(z24.string()),
|
|
687
|
+
codeSnippet: z24.string().optional(),
|
|
688
|
+
similarIssues: z24.array(z24.string()).optional(),
|
|
689
|
+
patternId: z24.string().optional(),
|
|
655
690
|
// Extended fields for detailed debugging
|
|
656
691
|
diff: DiffContentSchema.optional(),
|
|
657
692
|
executionTrace: ExecutionTraceSchema.optional()
|
|
658
693
|
});
|
|
659
694
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
660
695
|
/** Agent ID for this run */
|
|
661
|
-
agentId:
|
|
696
|
+
agentId: z24.string().optional(),
|
|
662
697
|
/** Skills group ID for this run */
|
|
663
|
-
skillsGroupId:
|
|
698
|
+
skillsGroupId: z24.string().optional(),
|
|
664
699
|
/** Scenario IDs to run */
|
|
665
|
-
scenarioIds:
|
|
700
|
+
scenarioIds: z24.array(z24.string()),
|
|
666
701
|
/** Current status */
|
|
667
702
|
status: EvalStatusSchema,
|
|
668
703
|
/** Progress percentage (0-100) */
|
|
669
|
-
progress:
|
|
704
|
+
progress: z24.number(),
|
|
670
705
|
/** Results for each scenario/target combination */
|
|
671
|
-
results:
|
|
706
|
+
results: z24.array(EvalRunResultSchema),
|
|
672
707
|
/** Aggregated metrics across all results */
|
|
673
708
|
aggregateMetrics: EvalMetricsSchema,
|
|
674
709
|
/** Failure analyses */
|
|
675
|
-
failureAnalyses:
|
|
710
|
+
failureAnalyses: z24.array(FailureAnalysisSchema).optional(),
|
|
676
711
|
/** Aggregated LLM trace summary */
|
|
677
712
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
678
713
|
/** What triggered this run */
|
|
679
714
|
trigger: TriggerSchema.optional(),
|
|
680
715
|
/** When the run started (set when evaluation is triggered) */
|
|
681
|
-
startedAt:
|
|
716
|
+
startedAt: z24.string().optional(),
|
|
682
717
|
/** When the run completed */
|
|
683
|
-
completedAt:
|
|
718
|
+
completedAt: z24.string().optional(),
|
|
684
719
|
/** Live trace events captured during execution (for playback on results page) */
|
|
685
|
-
liveTraceEvents:
|
|
720
|
+
liveTraceEvents: z24.array(LiveTraceEventSchema).optional(),
|
|
686
721
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
687
|
-
jobId:
|
|
722
|
+
jobId: z24.string().optional(),
|
|
688
723
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
689
|
-
jobStatus:
|
|
724
|
+
jobStatus: z24.string().optional(),
|
|
690
725
|
/** Remote job error message if the job failed */
|
|
691
|
-
jobError:
|
|
726
|
+
jobError: z24.string().optional(),
|
|
692
727
|
/** Timestamp of the last job status check */
|
|
693
|
-
jobStatusCheckedAt:
|
|
728
|
+
jobStatusCheckedAt: z24.string().optional()
|
|
694
729
|
});
|
|
695
730
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
696
731
|
id: true,
|
|
@@ -703,28 +738,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
703
738
|
startedAt: true,
|
|
704
739
|
completedAt: true
|
|
705
740
|
});
|
|
706
|
-
var EvaluationProgressSchema =
|
|
707
|
-
runId:
|
|
708
|
-
targetId:
|
|
709
|
-
totalScenarios:
|
|
710
|
-
completedScenarios:
|
|
711
|
-
scenarioProgress:
|
|
712
|
-
|
|
713
|
-
scenarioId:
|
|
714
|
-
currentStep:
|
|
715
|
-
error:
|
|
741
|
+
var EvaluationProgressSchema = z24.object({
|
|
742
|
+
runId: z24.string(),
|
|
743
|
+
targetId: z24.string(),
|
|
744
|
+
totalScenarios: z24.number(),
|
|
745
|
+
completedScenarios: z24.number(),
|
|
746
|
+
scenarioProgress: z24.array(
|
|
747
|
+
z24.object({
|
|
748
|
+
scenarioId: z24.string(),
|
|
749
|
+
currentStep: z24.string(),
|
|
750
|
+
error: z24.string().optional()
|
|
716
751
|
})
|
|
717
752
|
),
|
|
718
|
-
createdAt:
|
|
753
|
+
createdAt: z24.number()
|
|
719
754
|
});
|
|
720
|
-
var EvaluationLogSchema =
|
|
721
|
-
runId:
|
|
722
|
-
scenarioId:
|
|
723
|
-
log:
|
|
724
|
-
level:
|
|
725
|
-
message:
|
|
726
|
-
args:
|
|
727
|
-
error:
|
|
755
|
+
var EvaluationLogSchema = z24.object({
|
|
756
|
+
runId: z24.string(),
|
|
757
|
+
scenarioId: z24.string(),
|
|
758
|
+
log: z24.object({
|
|
759
|
+
level: z24.enum(["info", "error", "debug"]),
|
|
760
|
+
message: z24.string().optional(),
|
|
761
|
+
args: z24.array(z24.any()).optional(),
|
|
762
|
+
error: z24.string().optional()
|
|
728
763
|
})
|
|
729
764
|
});
|
|
730
765
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -737,90 +772,89 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
737
772
|
AssertionResultStatus2["ERROR"] = "error";
|
|
738
773
|
return AssertionResultStatus2;
|
|
739
774
|
})(AssertionResultStatus || {});
|
|
740
|
-
var AssertionResultSchema =
|
|
741
|
-
id:
|
|
742
|
-
assertionId:
|
|
743
|
-
assertionType:
|
|
744
|
-
assertionName:
|
|
745
|
-
status:
|
|
746
|
-
message:
|
|
747
|
-
expected:
|
|
748
|
-
actual:
|
|
749
|
-
duration:
|
|
750
|
-
details:
|
|
751
|
-
llmTraceSteps:
|
|
752
|
-
});
|
|
753
|
-
var EvalRunResultSchema =
|
|
754
|
-
id:
|
|
755
|
-
targetId:
|
|
756
|
-
targetName:
|
|
757
|
-
scenarioId:
|
|
758
|
-
scenarioName:
|
|
775
|
+
var AssertionResultSchema = z25.object({
|
|
776
|
+
id: z25.string(),
|
|
777
|
+
assertionId: z25.string(),
|
|
778
|
+
assertionType: z25.string(),
|
|
779
|
+
assertionName: z25.string(),
|
|
780
|
+
status: z25.enum(AssertionResultStatus),
|
|
781
|
+
message: z25.string().optional(),
|
|
782
|
+
expected: z25.string().optional(),
|
|
783
|
+
actual: z25.string().optional(),
|
|
784
|
+
duration: z25.number().optional(),
|
|
785
|
+
details: z25.record(z25.string(), z25.unknown()).optional(),
|
|
786
|
+
llmTraceSteps: z25.array(LLMTraceStepSchema).optional()
|
|
787
|
+
});
|
|
788
|
+
var EvalRunResultSchema = z25.object({
|
|
789
|
+
id: z25.string(),
|
|
790
|
+
targetId: z25.string(),
|
|
791
|
+
targetName: z25.string().optional(),
|
|
792
|
+
scenarioId: z25.string(),
|
|
793
|
+
scenarioName: z25.string(),
|
|
759
794
|
modelConfig: ModelConfigSchema.optional(),
|
|
760
|
-
assertionResults:
|
|
795
|
+
assertionResults: z25.array(AssertionResultSchema),
|
|
761
796
|
metrics: EvalMetricsSchema.optional(),
|
|
762
|
-
passed:
|
|
763
|
-
failed:
|
|
764
|
-
passRate:
|
|
765
|
-
duration:
|
|
766
|
-
outputText:
|
|
767
|
-
files:
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
completedAt: z24.string().optional(),
|
|
797
|
+
passed: z25.number(),
|
|
798
|
+
failed: z25.number(),
|
|
799
|
+
passRate: z25.number(),
|
|
800
|
+
duration: z25.number(),
|
|
801
|
+
outputText: z25.string().optional(),
|
|
802
|
+
files: z25.array(ExpectedFileSchema).optional(),
|
|
803
|
+
fileDiffs: z25.array(DiffContentSchema).optional(),
|
|
804
|
+
startedAt: z25.string().optional(),
|
|
805
|
+
completedAt: z25.string().optional(),
|
|
772
806
|
llmTrace: LLMTraceSchema.optional()
|
|
773
807
|
});
|
|
774
|
-
var PromptResultSchema =
|
|
775
|
-
text:
|
|
776
|
-
files:
|
|
777
|
-
finishReason:
|
|
778
|
-
reasoning:
|
|
779
|
-
reasoningDetails:
|
|
780
|
-
toolCalls:
|
|
781
|
-
toolResults:
|
|
782
|
-
warnings:
|
|
783
|
-
sources:
|
|
784
|
-
steps:
|
|
785
|
-
generationTimeMs:
|
|
786
|
-
prompt:
|
|
787
|
-
systemPrompt:
|
|
788
|
-
usage:
|
|
789
|
-
totalTokens:
|
|
790
|
-
totalMicrocentsSpent:
|
|
808
|
+
var PromptResultSchema = z25.object({
|
|
809
|
+
text: z25.string(),
|
|
810
|
+
files: z25.array(z25.unknown()).optional(),
|
|
811
|
+
finishReason: z25.string().optional(),
|
|
812
|
+
reasoning: z25.string().optional(),
|
|
813
|
+
reasoningDetails: z25.unknown().optional(),
|
|
814
|
+
toolCalls: z25.array(z25.unknown()).optional(),
|
|
815
|
+
toolResults: z25.array(z25.unknown()).optional(),
|
|
816
|
+
warnings: z25.array(z25.unknown()).optional(),
|
|
817
|
+
sources: z25.array(z25.unknown()).optional(),
|
|
818
|
+
steps: z25.array(z25.unknown()),
|
|
819
|
+
generationTimeMs: z25.number(),
|
|
820
|
+
prompt: z25.string(),
|
|
821
|
+
systemPrompt: z25.string(),
|
|
822
|
+
usage: z25.object({
|
|
823
|
+
totalTokens: z25.number().optional(),
|
|
824
|
+
totalMicrocentsSpent: z25.number().optional()
|
|
791
825
|
})
|
|
792
826
|
});
|
|
793
|
-
var EvaluationResultSchema =
|
|
794
|
-
id:
|
|
795
|
-
runId:
|
|
796
|
-
timestamp:
|
|
827
|
+
var EvaluationResultSchema = z25.object({
|
|
828
|
+
id: z25.string(),
|
|
829
|
+
runId: z25.string(),
|
|
830
|
+
timestamp: z25.number(),
|
|
797
831
|
promptResult: PromptResultSchema,
|
|
798
|
-
testResults:
|
|
799
|
-
tags:
|
|
800
|
-
feedback:
|
|
801
|
-
score:
|
|
802
|
-
suiteId:
|
|
803
|
-
});
|
|
804
|
-
var LeanEvaluationResultSchema =
|
|
805
|
-
id:
|
|
806
|
-
runId:
|
|
807
|
-
timestamp:
|
|
808
|
-
tags:
|
|
809
|
-
scenarioId:
|
|
810
|
-
scenarioVersion:
|
|
811
|
-
targetId:
|
|
812
|
-
targetVersion:
|
|
813
|
-
suiteId:
|
|
814
|
-
score:
|
|
815
|
-
time:
|
|
816
|
-
microcentsSpent:
|
|
832
|
+
testResults: z25.array(z25.unknown()),
|
|
833
|
+
tags: z25.array(z25.string()).optional(),
|
|
834
|
+
feedback: z25.string().optional(),
|
|
835
|
+
score: z25.number(),
|
|
836
|
+
suiteId: z25.string().optional()
|
|
837
|
+
});
|
|
838
|
+
var LeanEvaluationResultSchema = z25.object({
|
|
839
|
+
id: z25.string(),
|
|
840
|
+
runId: z25.string(),
|
|
841
|
+
timestamp: z25.number(),
|
|
842
|
+
tags: z25.array(z25.string()).optional(),
|
|
843
|
+
scenarioId: z25.string(),
|
|
844
|
+
scenarioVersion: z25.number().optional(),
|
|
845
|
+
targetId: z25.string(),
|
|
846
|
+
targetVersion: z25.number().optional(),
|
|
847
|
+
suiteId: z25.string().optional(),
|
|
848
|
+
score: z25.number(),
|
|
849
|
+
time: z25.number().optional(),
|
|
850
|
+
microcentsSpent: z25.number().optional()
|
|
817
851
|
});
|
|
818
852
|
|
|
819
853
|
// src/project/project.ts
|
|
820
|
-
import { z as
|
|
854
|
+
import { z as z26 } from "zod";
|
|
821
855
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
822
|
-
appId:
|
|
823
|
-
appSecret:
|
|
856
|
+
appId: z26.string().optional().describe("The ID of the app in Dev Center"),
|
|
857
|
+
appSecret: z26.string().optional().describe("The secret of the app in Dev Center")
|
|
824
858
|
});
|
|
825
859
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
826
860
|
id: true,
|
|
@@ -831,10 +865,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
|
|
|
831
865
|
var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
|
|
832
866
|
|
|
833
867
|
// src/template/template.ts
|
|
834
|
-
import { z as
|
|
868
|
+
import { z as z27 } from "zod";
|
|
835
869
|
var TemplateSchema = TenantEntitySchema.extend({
|
|
836
870
|
/** URL to download the template from */
|
|
837
|
-
downloadUrl:
|
|
871
|
+
downloadUrl: z27.url()
|
|
838
872
|
});
|
|
839
873
|
var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
840
874
|
id: true,
|
|
@@ -851,9 +885,11 @@ export {
|
|
|
851
885
|
ApiCallSchema,
|
|
852
886
|
AssertionResultSchema,
|
|
853
887
|
AssertionResultStatus,
|
|
888
|
+
AssertionSchema,
|
|
854
889
|
BaseEntitySchema,
|
|
855
890
|
BaseTestSchema,
|
|
856
891
|
BuildCheckTestSchema,
|
|
892
|
+
BuildPassedAssertionSchema,
|
|
857
893
|
CommandExecutionSchema,
|
|
858
894
|
CommandExecutionTestSchema,
|
|
859
895
|
CreateAgentInputSchema,
|
|
@@ -895,6 +931,7 @@ export {
|
|
|
895
931
|
LeanEvaluationResultSchema,
|
|
896
932
|
LiveTraceEventSchema,
|
|
897
933
|
LiveTraceEventType,
|
|
934
|
+
LlmJudgeAssertionSchema,
|
|
898
935
|
LocalProjectConfigSchema,
|
|
899
936
|
MCPServerConfigSchema,
|
|
900
937
|
MetaSiteConfigSchema,
|
|
@@ -910,6 +947,7 @@ export {
|
|
|
910
947
|
SkillMetadataSchema,
|
|
911
948
|
SkillSchema,
|
|
912
949
|
SkillVersionSchema,
|
|
950
|
+
SkillWasCalledAssertionSchema,
|
|
913
951
|
SkillsGroupSchema,
|
|
914
952
|
TRACE_EVENT_PREFIX,
|
|
915
953
|
TargetSchema,
|