@wix/evalforge-types 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +288 -246
- package/build/index.js.map +4 -4
- package/build/index.mjs +284 -246
- package/build/index.mjs.map +4 -4
- package/build/types/scenario/assertions.d.ts +57 -0
- package/build/types/scenario/index.d.ts +1 -0
- package/build/types/scenario/test-scenario.d.ts +48 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -27,9 +27,11 @@ __export(index_exports, {
|
|
|
27
27
|
ApiCallSchema: () => ApiCallSchema,
|
|
28
28
|
AssertionResultSchema: () => AssertionResultSchema,
|
|
29
29
|
AssertionResultStatus: () => AssertionResultStatus,
|
|
30
|
+
AssertionSchema: () => AssertionSchema,
|
|
30
31
|
BaseEntitySchema: () => BaseEntitySchema,
|
|
31
32
|
BaseTestSchema: () => BaseTestSchema,
|
|
32
33
|
BuildCheckTestSchema: () => BuildCheckTestSchema,
|
|
34
|
+
BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
|
|
33
35
|
CommandExecutionSchema: () => CommandExecutionSchema,
|
|
34
36
|
CommandExecutionTestSchema: () => CommandExecutionTestSchema,
|
|
35
37
|
CreateAgentInputSchema: () => CreateAgentInputSchema,
|
|
@@ -71,6 +73,7 @@ __export(index_exports, {
|
|
|
71
73
|
LeanEvaluationResultSchema: () => LeanEvaluationResultSchema,
|
|
72
74
|
LiveTraceEventSchema: () => LiveTraceEventSchema,
|
|
73
75
|
LiveTraceEventType: () => LiveTraceEventType,
|
|
76
|
+
LlmJudgeAssertionSchema: () => LlmJudgeAssertionSchema,
|
|
74
77
|
LocalProjectConfigSchema: () => LocalProjectConfigSchema,
|
|
75
78
|
MCPServerConfigSchema: () => MCPServerConfigSchema,
|
|
76
79
|
MetaSiteConfigSchema: () => MetaSiteConfigSchema,
|
|
@@ -86,6 +89,7 @@ __export(index_exports, {
|
|
|
86
89
|
SkillMetadataSchema: () => SkillMetadataSchema,
|
|
87
90
|
SkillSchema: () => SkillSchema,
|
|
88
91
|
SkillVersionSchema: () => SkillVersionSchema,
|
|
92
|
+
SkillWasCalledAssertionSchema: () => SkillWasCalledAssertionSchema,
|
|
89
93
|
SkillsGroupSchema: () => SkillsGroupSchema,
|
|
90
94
|
TRACE_EVENT_PREFIX: () => TRACE_EVENT_PREFIX,
|
|
91
95
|
TargetSchema: () => TargetSchema,
|
|
@@ -492,34 +496,67 @@ var TestSchema = import_zod17.z.discriminatedUnion("type", [
|
|
|
492
496
|
PlaywrightNLTestSchema
|
|
493
497
|
]);
|
|
494
498
|
|
|
495
|
-
// src/scenario/
|
|
499
|
+
// src/scenario/assertions.ts
|
|
496
500
|
var import_zod18 = require("zod");
|
|
497
|
-
var
|
|
501
|
+
var SkillWasCalledAssertionSchema = import_zod18.z.object({
|
|
502
|
+
type: import_zod18.z.literal("skill_was_called"),
|
|
503
|
+
/** Name of the skill that must have been called (matched against trace Skill tool args) */
|
|
504
|
+
skillName: import_zod18.z.string()
|
|
505
|
+
});
|
|
506
|
+
var BuildPassedAssertionSchema = import_zod18.z.object({
|
|
507
|
+
type: import_zod18.z.literal("build_passed"),
|
|
508
|
+
/** Command to run (default: "yarn build") */
|
|
509
|
+
command: import_zod18.z.string().optional(),
|
|
510
|
+
/** Expected exit code (default: 0) */
|
|
511
|
+
expectedExitCode: import_zod18.z.number().int().optional()
|
|
512
|
+
});
|
|
513
|
+
var LlmJudgeAssertionSchema = import_zod18.z.object({
|
|
514
|
+
type: import_zod18.z.literal("llm_judge"),
|
|
515
|
+
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
516
|
+
prompt: import_zod18.z.string(),
|
|
517
|
+
/** Optional system prompt for the judge (default asks for JSON with score) */
|
|
518
|
+
systemPrompt: import_zod18.z.string().optional(),
|
|
519
|
+
/** Minimum score to pass (0–100, default 70) */
|
|
520
|
+
minScore: import_zod18.z.number().int().min(0).max(100).optional(),
|
|
521
|
+
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
522
|
+
model: import_zod18.z.string().optional(),
|
|
523
|
+
maxTokens: import_zod18.z.number().int().optional(),
|
|
524
|
+
temperature: import_zod18.z.number().min(0).max(1).optional()
|
|
525
|
+
});
|
|
526
|
+
var AssertionSchema = import_zod18.z.discriminatedUnion("type", [
|
|
527
|
+
SkillWasCalledAssertionSchema,
|
|
528
|
+
BuildPassedAssertionSchema,
|
|
529
|
+
LlmJudgeAssertionSchema
|
|
530
|
+
]);
|
|
531
|
+
|
|
532
|
+
// src/scenario/environment.ts
|
|
533
|
+
var import_zod19 = require("zod");
|
|
534
|
+
var LocalProjectConfigSchema = import_zod19.z.object({
|
|
498
535
|
/** Template ID to use for the local project */
|
|
499
|
-
templateId:
|
|
536
|
+
templateId: import_zod19.z.string().optional(),
|
|
500
537
|
/** Files to create in the project */
|
|
501
|
-
files:
|
|
502
|
-
|
|
503
|
-
path:
|
|
504
|
-
content:
|
|
538
|
+
files: import_zod19.z.array(
|
|
539
|
+
import_zod19.z.object({
|
|
540
|
+
path: import_zod19.z.string().min(1),
|
|
541
|
+
content: import_zod19.z.string().min(1)
|
|
505
542
|
})
|
|
506
543
|
).optional()
|
|
507
544
|
});
|
|
508
|
-
var MetaSiteConfigSchema =
|
|
509
|
-
configurations:
|
|
510
|
-
|
|
511
|
-
name:
|
|
512
|
-
apiCalls:
|
|
513
|
-
|
|
514
|
-
url:
|
|
515
|
-
method:
|
|
516
|
-
body:
|
|
545
|
+
var MetaSiteConfigSchema = import_zod19.z.object({
|
|
546
|
+
configurations: import_zod19.z.array(
|
|
547
|
+
import_zod19.z.object({
|
|
548
|
+
name: import_zod19.z.string().min(1),
|
|
549
|
+
apiCalls: import_zod19.z.array(
|
|
550
|
+
import_zod19.z.object({
|
|
551
|
+
url: import_zod19.z.string().url(),
|
|
552
|
+
method: import_zod19.z.enum(["POST", "PUT"]),
|
|
553
|
+
body: import_zod19.z.string()
|
|
517
554
|
})
|
|
518
555
|
)
|
|
519
556
|
})
|
|
520
557
|
).optional()
|
|
521
558
|
});
|
|
522
|
-
var EnvironmentSchema =
|
|
559
|
+
var EnvironmentSchema = import_zod19.z.object({
|
|
523
560
|
/** Local project configuration */
|
|
524
561
|
localProject: LocalProjectConfigSchema.optional(),
|
|
525
562
|
/** Meta site configuration */
|
|
@@ -527,18 +564,20 @@ var EnvironmentSchema = import_zod18.z.object({
|
|
|
527
564
|
});
|
|
528
565
|
|
|
529
566
|
// src/scenario/test-scenario.ts
|
|
530
|
-
var
|
|
531
|
-
var ExpectedFileSchema =
|
|
567
|
+
var import_zod20 = require("zod");
|
|
568
|
+
var ExpectedFileSchema = import_zod20.z.object({
|
|
532
569
|
/** Relative path where the file should be created */
|
|
533
|
-
path:
|
|
570
|
+
path: import_zod20.z.string(),
|
|
534
571
|
/** Optional expected content */
|
|
535
|
-
content:
|
|
572
|
+
content: import_zod20.z.string().optional()
|
|
536
573
|
});
|
|
537
574
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
538
575
|
/** The prompt sent to the agent to trigger the task */
|
|
539
|
-
triggerPrompt:
|
|
576
|
+
triggerPrompt: import_zod20.z.string().min(10),
|
|
540
577
|
/** ID of the template to use for this scenario */
|
|
541
|
-
templateId:
|
|
578
|
+
templateId: import_zod20.z.string().optional(),
|
|
579
|
+
/** Assertions to evaluate for this scenario */
|
|
580
|
+
assertions: import_zod20.z.array(AssertionSchema).optional()
|
|
542
581
|
});
|
|
543
582
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
544
583
|
id: true,
|
|
@@ -549,10 +588,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
549
588
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
550
589
|
|
|
551
590
|
// src/suite/test-suite.ts
|
|
552
|
-
var
|
|
591
|
+
var import_zod21 = require("zod");
|
|
553
592
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
554
593
|
/** IDs of test scenarios in this suite */
|
|
555
|
-
scenarioIds:
|
|
594
|
+
scenarioIds: import_zod21.z.array(import_zod21.z.string())
|
|
556
595
|
});
|
|
557
596
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
558
597
|
id: true,
|
|
@@ -563,21 +602,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
563
602
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
564
603
|
|
|
565
604
|
// src/evaluation/metrics.ts
|
|
566
|
-
var
|
|
567
|
-
var TokenUsageSchema =
|
|
568
|
-
prompt:
|
|
569
|
-
completion:
|
|
570
|
-
total:
|
|
571
|
-
});
|
|
572
|
-
var EvalMetricsSchema =
|
|
573
|
-
totalAssertions:
|
|
574
|
-
passed:
|
|
575
|
-
failed:
|
|
576
|
-
skipped:
|
|
577
|
-
errors:
|
|
578
|
-
passRate:
|
|
579
|
-
avgDuration:
|
|
580
|
-
totalDuration:
|
|
605
|
+
var import_zod22 = require("zod");
|
|
606
|
+
var TokenUsageSchema = import_zod22.z.object({
|
|
607
|
+
prompt: import_zod22.z.number(),
|
|
608
|
+
completion: import_zod22.z.number(),
|
|
609
|
+
total: import_zod22.z.number()
|
|
610
|
+
});
|
|
611
|
+
var EvalMetricsSchema = import_zod22.z.object({
|
|
612
|
+
totalAssertions: import_zod22.z.number(),
|
|
613
|
+
passed: import_zod22.z.number(),
|
|
614
|
+
failed: import_zod22.z.number(),
|
|
615
|
+
skipped: import_zod22.z.number(),
|
|
616
|
+
errors: import_zod22.z.number(),
|
|
617
|
+
passRate: import_zod22.z.number(),
|
|
618
|
+
avgDuration: import_zod22.z.number(),
|
|
619
|
+
totalDuration: import_zod22.z.number()
|
|
581
620
|
});
|
|
582
621
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
583
622
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -587,7 +626,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
587
626
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
588
627
|
return EvalStatus2;
|
|
589
628
|
})(EvalStatus || {});
|
|
590
|
-
var EvalStatusSchema =
|
|
629
|
+
var EvalStatusSchema = import_zod22.z.enum(EvalStatus);
|
|
591
630
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
592
631
|
LLMStepType2["COMPLETION"] = "completion";
|
|
593
632
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -595,52 +634,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
595
634
|
LLMStepType2["THINKING"] = "thinking";
|
|
596
635
|
return LLMStepType2;
|
|
597
636
|
})(LLMStepType || {});
|
|
598
|
-
var LLMTraceStepSchema =
|
|
599
|
-
id:
|
|
600
|
-
stepNumber:
|
|
601
|
-
type:
|
|
602
|
-
model:
|
|
603
|
-
provider:
|
|
604
|
-
startedAt:
|
|
605
|
-
durationMs:
|
|
637
|
+
var LLMTraceStepSchema = import_zod22.z.object({
|
|
638
|
+
id: import_zod22.z.string(),
|
|
639
|
+
stepNumber: import_zod22.z.number(),
|
|
640
|
+
type: import_zod22.z.enum(LLMStepType),
|
|
641
|
+
model: import_zod22.z.string(),
|
|
642
|
+
provider: import_zod22.z.string(),
|
|
643
|
+
startedAt: import_zod22.z.string(),
|
|
644
|
+
durationMs: import_zod22.z.number(),
|
|
606
645
|
tokenUsage: TokenUsageSchema,
|
|
607
|
-
costUsd:
|
|
608
|
-
toolName:
|
|
609
|
-
toolArguments:
|
|
610
|
-
inputPreview:
|
|
611
|
-
outputPreview:
|
|
612
|
-
success:
|
|
613
|
-
error:
|
|
614
|
-
});
|
|
615
|
-
var LLMBreakdownStatsSchema =
|
|
616
|
-
count:
|
|
617
|
-
durationMs:
|
|
618
|
-
tokens:
|
|
619
|
-
costUsd:
|
|
620
|
-
});
|
|
621
|
-
var LLMTraceSummarySchema =
|
|
622
|
-
totalSteps:
|
|
623
|
-
totalDurationMs:
|
|
646
|
+
costUsd: import_zod22.z.number(),
|
|
647
|
+
toolName: import_zod22.z.string().optional(),
|
|
648
|
+
toolArguments: import_zod22.z.string().optional(),
|
|
649
|
+
inputPreview: import_zod22.z.string().optional(),
|
|
650
|
+
outputPreview: import_zod22.z.string().optional(),
|
|
651
|
+
success: import_zod22.z.boolean(),
|
|
652
|
+
error: import_zod22.z.string().optional()
|
|
653
|
+
});
|
|
654
|
+
var LLMBreakdownStatsSchema = import_zod22.z.object({
|
|
655
|
+
count: import_zod22.z.number(),
|
|
656
|
+
durationMs: import_zod22.z.number(),
|
|
657
|
+
tokens: import_zod22.z.number(),
|
|
658
|
+
costUsd: import_zod22.z.number()
|
|
659
|
+
});
|
|
660
|
+
var LLMTraceSummarySchema = import_zod22.z.object({
|
|
661
|
+
totalSteps: import_zod22.z.number(),
|
|
662
|
+
totalDurationMs: import_zod22.z.number(),
|
|
624
663
|
totalTokens: TokenUsageSchema,
|
|
625
|
-
totalCostUsd:
|
|
626
|
-
stepTypeBreakdown:
|
|
627
|
-
modelBreakdown:
|
|
628
|
-
modelsUsed:
|
|
629
|
-
});
|
|
630
|
-
var LLMTraceSchema =
|
|
631
|
-
id:
|
|
632
|
-
steps:
|
|
664
|
+
totalCostUsd: import_zod22.z.number(),
|
|
665
|
+
stepTypeBreakdown: import_zod22.z.record(import_zod22.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
666
|
+
modelBreakdown: import_zod22.z.record(import_zod22.z.string(), LLMBreakdownStatsSchema),
|
|
667
|
+
modelsUsed: import_zod22.z.array(import_zod22.z.string())
|
|
668
|
+
});
|
|
669
|
+
var LLMTraceSchema = import_zod22.z.object({
|
|
670
|
+
id: import_zod22.z.string(),
|
|
671
|
+
steps: import_zod22.z.array(LLMTraceStepSchema),
|
|
633
672
|
summary: LLMTraceSummarySchema
|
|
634
673
|
});
|
|
635
674
|
|
|
636
675
|
// src/evaluation/eval-result.ts
|
|
637
|
-
var
|
|
676
|
+
var import_zod25 = require("zod");
|
|
638
677
|
|
|
639
678
|
// src/evaluation/eval-run.ts
|
|
640
|
-
var
|
|
679
|
+
var import_zod24 = require("zod");
|
|
641
680
|
|
|
642
681
|
// src/evaluation/live-trace.ts
|
|
643
|
-
var
|
|
682
|
+
var import_zod23 = require("zod");
|
|
644
683
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
645
684
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
646
685
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -649,31 +688,31 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
649
688
|
LiveTraceEventType2["DIAGNOSTIC"] = "diagnostic";
|
|
650
689
|
return LiveTraceEventType2;
|
|
651
690
|
})(LiveTraceEventType || {});
|
|
652
|
-
var LiveTraceEventSchema =
|
|
691
|
+
var LiveTraceEventSchema = import_zod23.z.object({
|
|
653
692
|
/** The evaluation run ID */
|
|
654
|
-
evalRunId:
|
|
693
|
+
evalRunId: import_zod23.z.string(),
|
|
655
694
|
/** The scenario ID being executed */
|
|
656
|
-
scenarioId:
|
|
695
|
+
scenarioId: import_zod23.z.string(),
|
|
657
696
|
/** The scenario name for display */
|
|
658
|
-
scenarioName:
|
|
697
|
+
scenarioName: import_zod23.z.string(),
|
|
659
698
|
/** The target ID (skill, agent, etc.) */
|
|
660
|
-
targetId:
|
|
699
|
+
targetId: import_zod23.z.string(),
|
|
661
700
|
/** The target name for display */
|
|
662
|
-
targetName:
|
|
701
|
+
targetName: import_zod23.z.string(),
|
|
663
702
|
/** Step number in the current scenario execution */
|
|
664
|
-
stepNumber:
|
|
703
|
+
stepNumber: import_zod23.z.number(),
|
|
665
704
|
/** Type of trace event */
|
|
666
|
-
type:
|
|
705
|
+
type: import_zod23.z.enum(LiveTraceEventType),
|
|
667
706
|
/** Tool name if this is a tool_use event */
|
|
668
|
-
toolName:
|
|
707
|
+
toolName: import_zod23.z.string().optional(),
|
|
669
708
|
/** Tool arguments preview (truncated JSON) */
|
|
670
|
-
toolArgs:
|
|
709
|
+
toolArgs: import_zod23.z.string().optional(),
|
|
671
710
|
/** Output preview (truncated text) */
|
|
672
|
-
outputPreview:
|
|
711
|
+
outputPreview: import_zod23.z.string().optional(),
|
|
673
712
|
/** Timestamp when this event occurred */
|
|
674
|
-
timestamp:
|
|
713
|
+
timestamp: import_zod23.z.string(),
|
|
675
714
|
/** Whether this is the final event for this scenario */
|
|
676
|
-
isComplete:
|
|
715
|
+
isComplete: import_zod23.z.boolean()
|
|
677
716
|
});
|
|
678
717
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
679
718
|
function parseTraceEventLine(line) {
|
|
@@ -701,14 +740,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
701
740
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
702
741
|
return TriggerType2;
|
|
703
742
|
})(TriggerType || {});
|
|
704
|
-
var TriggerMetadataSchema =
|
|
705
|
-
version:
|
|
706
|
-
resourceUpdated:
|
|
743
|
+
var TriggerMetadataSchema = import_zod24.z.object({
|
|
744
|
+
version: import_zod24.z.string().optional(),
|
|
745
|
+
resourceUpdated: import_zod24.z.array(import_zod24.z.string()).optional()
|
|
707
746
|
});
|
|
708
|
-
var TriggerSchema =
|
|
709
|
-
id:
|
|
747
|
+
var TriggerSchema = import_zod24.z.object({
|
|
748
|
+
id: import_zod24.z.string(),
|
|
710
749
|
metadata: TriggerMetadataSchema.optional(),
|
|
711
|
-
type:
|
|
750
|
+
type: import_zod24.z.enum(TriggerType)
|
|
712
751
|
});
|
|
713
752
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
714
753
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -726,89 +765,89 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
726
765
|
FailureSeverity2["LOW"] = "low";
|
|
727
766
|
return FailureSeverity2;
|
|
728
767
|
})(FailureSeverity || {});
|
|
729
|
-
var DiffLineTypeSchema =
|
|
730
|
-
var DiffLineSchema =
|
|
768
|
+
var DiffLineTypeSchema = import_zod24.z.enum(["added", "removed", "unchanged"]);
|
|
769
|
+
var DiffLineSchema = import_zod24.z.object({
|
|
731
770
|
type: DiffLineTypeSchema,
|
|
732
|
-
content:
|
|
733
|
-
lineNumber:
|
|
734
|
-
});
|
|
735
|
-
var DiffContentSchema =
|
|
736
|
-
path:
|
|
737
|
-
expected:
|
|
738
|
-
actual:
|
|
739
|
-
diffLines:
|
|
740
|
-
});
|
|
741
|
-
var CommandExecutionSchema =
|
|
742
|
-
command:
|
|
743
|
-
exitCode:
|
|
744
|
-
output:
|
|
745
|
-
duration:
|
|
746
|
-
});
|
|
747
|
-
var FileModificationSchema =
|
|
748
|
-
path:
|
|
749
|
-
action:
|
|
750
|
-
});
|
|
751
|
-
var ApiCallSchema =
|
|
752
|
-
endpoint:
|
|
753
|
-
tokensUsed:
|
|
754
|
-
duration:
|
|
755
|
-
});
|
|
756
|
-
var ExecutionTraceSchema =
|
|
757
|
-
commands:
|
|
758
|
-
filesModified:
|
|
759
|
-
apiCalls:
|
|
760
|
-
totalDuration:
|
|
761
|
-
});
|
|
762
|
-
var FailureAnalysisSchema =
|
|
763
|
-
category:
|
|
764
|
-
severity:
|
|
765
|
-
summary:
|
|
766
|
-
details:
|
|
767
|
-
rootCause:
|
|
768
|
-
suggestedFix:
|
|
769
|
-
relatedAssertions:
|
|
770
|
-
codeSnippet:
|
|
771
|
-
similarIssues:
|
|
772
|
-
patternId:
|
|
771
|
+
content: import_zod24.z.string(),
|
|
772
|
+
lineNumber: import_zod24.z.number()
|
|
773
|
+
});
|
|
774
|
+
var DiffContentSchema = import_zod24.z.object({
|
|
775
|
+
path: import_zod24.z.string(),
|
|
776
|
+
expected: import_zod24.z.string(),
|
|
777
|
+
actual: import_zod24.z.string(),
|
|
778
|
+
diffLines: import_zod24.z.array(DiffLineSchema)
|
|
779
|
+
});
|
|
780
|
+
var CommandExecutionSchema = import_zod24.z.object({
|
|
781
|
+
command: import_zod24.z.string(),
|
|
782
|
+
exitCode: import_zod24.z.number(),
|
|
783
|
+
output: import_zod24.z.string().optional(),
|
|
784
|
+
duration: import_zod24.z.number()
|
|
785
|
+
});
|
|
786
|
+
var FileModificationSchema = import_zod24.z.object({
|
|
787
|
+
path: import_zod24.z.string(),
|
|
788
|
+
action: import_zod24.z.enum(["created", "modified", "deleted"])
|
|
789
|
+
});
|
|
790
|
+
var ApiCallSchema = import_zod24.z.object({
|
|
791
|
+
endpoint: import_zod24.z.string(),
|
|
792
|
+
tokensUsed: import_zod24.z.number(),
|
|
793
|
+
duration: import_zod24.z.number()
|
|
794
|
+
});
|
|
795
|
+
var ExecutionTraceSchema = import_zod24.z.object({
|
|
796
|
+
commands: import_zod24.z.array(CommandExecutionSchema),
|
|
797
|
+
filesModified: import_zod24.z.array(FileModificationSchema),
|
|
798
|
+
apiCalls: import_zod24.z.array(ApiCallSchema),
|
|
799
|
+
totalDuration: import_zod24.z.number()
|
|
800
|
+
});
|
|
801
|
+
var FailureAnalysisSchema = import_zod24.z.object({
|
|
802
|
+
category: import_zod24.z.enum(FailureCategory),
|
|
803
|
+
severity: import_zod24.z.enum(FailureSeverity),
|
|
804
|
+
summary: import_zod24.z.string(),
|
|
805
|
+
details: import_zod24.z.string(),
|
|
806
|
+
rootCause: import_zod24.z.string(),
|
|
807
|
+
suggestedFix: import_zod24.z.string(),
|
|
808
|
+
relatedAssertions: import_zod24.z.array(import_zod24.z.string()),
|
|
809
|
+
codeSnippet: import_zod24.z.string().optional(),
|
|
810
|
+
similarIssues: import_zod24.z.array(import_zod24.z.string()).optional(),
|
|
811
|
+
patternId: import_zod24.z.string().optional(),
|
|
773
812
|
// Extended fields for detailed debugging
|
|
774
813
|
diff: DiffContentSchema.optional(),
|
|
775
814
|
executionTrace: ExecutionTraceSchema.optional()
|
|
776
815
|
});
|
|
777
816
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
778
817
|
/** Agent ID for this run */
|
|
779
|
-
agentId:
|
|
818
|
+
agentId: import_zod24.z.string().optional(),
|
|
780
819
|
/** Skills group ID for this run */
|
|
781
|
-
skillsGroupId:
|
|
820
|
+
skillsGroupId: import_zod24.z.string().optional(),
|
|
782
821
|
/** Scenario IDs to run */
|
|
783
|
-
scenarioIds:
|
|
822
|
+
scenarioIds: import_zod24.z.array(import_zod24.z.string()),
|
|
784
823
|
/** Current status */
|
|
785
824
|
status: EvalStatusSchema,
|
|
786
825
|
/** Progress percentage (0-100) */
|
|
787
|
-
progress:
|
|
826
|
+
progress: import_zod24.z.number(),
|
|
788
827
|
/** Results for each scenario/target combination */
|
|
789
|
-
results:
|
|
828
|
+
results: import_zod24.z.array(EvalRunResultSchema),
|
|
790
829
|
/** Aggregated metrics across all results */
|
|
791
830
|
aggregateMetrics: EvalMetricsSchema,
|
|
792
831
|
/** Failure analyses */
|
|
793
|
-
failureAnalyses:
|
|
832
|
+
failureAnalyses: import_zod24.z.array(FailureAnalysisSchema).optional(),
|
|
794
833
|
/** Aggregated LLM trace summary */
|
|
795
834
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
796
835
|
/** What triggered this run */
|
|
797
836
|
trigger: TriggerSchema.optional(),
|
|
798
837
|
/** When the run started (set when evaluation is triggered) */
|
|
799
|
-
startedAt:
|
|
838
|
+
startedAt: import_zod24.z.string().optional(),
|
|
800
839
|
/** When the run completed */
|
|
801
|
-
completedAt:
|
|
840
|
+
completedAt: import_zod24.z.string().optional(),
|
|
802
841
|
/** Live trace events captured during execution (for playback on results page) */
|
|
803
|
-
liveTraceEvents:
|
|
842
|
+
liveTraceEvents: import_zod24.z.array(LiveTraceEventSchema).optional(),
|
|
804
843
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
805
|
-
jobId:
|
|
844
|
+
jobId: import_zod24.z.string().optional(),
|
|
806
845
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
807
|
-
jobStatus:
|
|
846
|
+
jobStatus: import_zod24.z.string().optional(),
|
|
808
847
|
/** Remote job error message if the job failed */
|
|
809
|
-
jobError:
|
|
848
|
+
jobError: import_zod24.z.string().optional(),
|
|
810
849
|
/** Timestamp of the last job status check */
|
|
811
|
-
jobStatusCheckedAt:
|
|
850
|
+
jobStatusCheckedAt: import_zod24.z.string().optional()
|
|
812
851
|
});
|
|
813
852
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
814
853
|
id: true,
|
|
@@ -821,28 +860,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
821
860
|
startedAt: true,
|
|
822
861
|
completedAt: true
|
|
823
862
|
});
|
|
824
|
-
var EvaluationProgressSchema =
|
|
825
|
-
runId:
|
|
826
|
-
targetId:
|
|
827
|
-
totalScenarios:
|
|
828
|
-
completedScenarios:
|
|
829
|
-
scenarioProgress:
|
|
830
|
-
|
|
831
|
-
scenarioId:
|
|
832
|
-
currentStep:
|
|
833
|
-
error:
|
|
863
|
+
var EvaluationProgressSchema = import_zod24.z.object({
|
|
864
|
+
runId: import_zod24.z.string(),
|
|
865
|
+
targetId: import_zod24.z.string(),
|
|
866
|
+
totalScenarios: import_zod24.z.number(),
|
|
867
|
+
completedScenarios: import_zod24.z.number(),
|
|
868
|
+
scenarioProgress: import_zod24.z.array(
|
|
869
|
+
import_zod24.z.object({
|
|
870
|
+
scenarioId: import_zod24.z.string(),
|
|
871
|
+
currentStep: import_zod24.z.string(),
|
|
872
|
+
error: import_zod24.z.string().optional()
|
|
834
873
|
})
|
|
835
874
|
),
|
|
836
|
-
createdAt:
|
|
875
|
+
createdAt: import_zod24.z.number()
|
|
837
876
|
});
|
|
838
|
-
var EvaluationLogSchema =
|
|
839
|
-
runId:
|
|
840
|
-
scenarioId:
|
|
841
|
-
log:
|
|
842
|
-
level:
|
|
843
|
-
message:
|
|
844
|
-
args:
|
|
845
|
-
error:
|
|
877
|
+
var EvaluationLogSchema = import_zod24.z.object({
|
|
878
|
+
runId: import_zod24.z.string(),
|
|
879
|
+
scenarioId: import_zod24.z.string(),
|
|
880
|
+
log: import_zod24.z.object({
|
|
881
|
+
level: import_zod24.z.enum(["info", "error", "debug"]),
|
|
882
|
+
message: import_zod24.z.string().optional(),
|
|
883
|
+
args: import_zod24.z.array(import_zod24.z.any()).optional(),
|
|
884
|
+
error: import_zod24.z.string().optional()
|
|
846
885
|
})
|
|
847
886
|
});
|
|
848
887
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -855,90 +894,89 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
855
894
|
AssertionResultStatus2["ERROR"] = "error";
|
|
856
895
|
return AssertionResultStatus2;
|
|
857
896
|
})(AssertionResultStatus || {});
|
|
858
|
-
var AssertionResultSchema =
|
|
859
|
-
id:
|
|
860
|
-
assertionId:
|
|
861
|
-
assertionType:
|
|
862
|
-
assertionName:
|
|
863
|
-
status:
|
|
864
|
-
message:
|
|
865
|
-
expected:
|
|
866
|
-
actual:
|
|
867
|
-
duration:
|
|
868
|
-
details:
|
|
869
|
-
llmTraceSteps:
|
|
870
|
-
});
|
|
871
|
-
var EvalRunResultSchema =
|
|
872
|
-
id:
|
|
873
|
-
targetId:
|
|
874
|
-
targetName:
|
|
875
|
-
scenarioId:
|
|
876
|
-
scenarioName:
|
|
897
|
+
var AssertionResultSchema = import_zod25.z.object({
|
|
898
|
+
id: import_zod25.z.string(),
|
|
899
|
+
assertionId: import_zod25.z.string(),
|
|
900
|
+
assertionType: import_zod25.z.string(),
|
|
901
|
+
assertionName: import_zod25.z.string(),
|
|
902
|
+
status: import_zod25.z.enum(AssertionResultStatus),
|
|
903
|
+
message: import_zod25.z.string().optional(),
|
|
904
|
+
expected: import_zod25.z.string().optional(),
|
|
905
|
+
actual: import_zod25.z.string().optional(),
|
|
906
|
+
duration: import_zod25.z.number().optional(),
|
|
907
|
+
details: import_zod25.z.record(import_zod25.z.string(), import_zod25.z.unknown()).optional(),
|
|
908
|
+
llmTraceSteps: import_zod25.z.array(LLMTraceStepSchema).optional()
|
|
909
|
+
});
|
|
910
|
+
var EvalRunResultSchema = import_zod25.z.object({
|
|
911
|
+
id: import_zod25.z.string(),
|
|
912
|
+
targetId: import_zod25.z.string(),
|
|
913
|
+
targetName: import_zod25.z.string().optional(),
|
|
914
|
+
scenarioId: import_zod25.z.string(),
|
|
915
|
+
scenarioName: import_zod25.z.string(),
|
|
877
916
|
modelConfig: ModelConfigSchema.optional(),
|
|
878
|
-
assertionResults:
|
|
917
|
+
assertionResults: import_zod25.z.array(AssertionResultSchema),
|
|
879
918
|
metrics: EvalMetricsSchema.optional(),
|
|
880
|
-
passed:
|
|
881
|
-
failed:
|
|
882
|
-
passRate:
|
|
883
|
-
duration:
|
|
884
|
-
outputText:
|
|
885
|
-
files:
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
completedAt: import_zod24.z.string().optional(),
|
|
919
|
+
passed: import_zod25.z.number(),
|
|
920
|
+
failed: import_zod25.z.number(),
|
|
921
|
+
passRate: import_zod25.z.number(),
|
|
922
|
+
duration: import_zod25.z.number(),
|
|
923
|
+
outputText: import_zod25.z.string().optional(),
|
|
924
|
+
files: import_zod25.z.array(ExpectedFileSchema).optional(),
|
|
925
|
+
fileDiffs: import_zod25.z.array(DiffContentSchema).optional(),
|
|
926
|
+
startedAt: import_zod25.z.string().optional(),
|
|
927
|
+
completedAt: import_zod25.z.string().optional(),
|
|
890
928
|
llmTrace: LLMTraceSchema.optional()
|
|
891
929
|
});
|
|
892
|
-
var PromptResultSchema =
|
|
893
|
-
text:
|
|
894
|
-
files:
|
|
895
|
-
finishReason:
|
|
896
|
-
reasoning:
|
|
897
|
-
reasoningDetails:
|
|
898
|
-
toolCalls:
|
|
899
|
-
toolResults:
|
|
900
|
-
warnings:
|
|
901
|
-
sources:
|
|
902
|
-
steps:
|
|
903
|
-
generationTimeMs:
|
|
904
|
-
prompt:
|
|
905
|
-
systemPrompt:
|
|
906
|
-
usage:
|
|
907
|
-
totalTokens:
|
|
908
|
-
totalMicrocentsSpent:
|
|
930
|
+
var PromptResultSchema = import_zod25.z.object({
|
|
931
|
+
text: import_zod25.z.string(),
|
|
932
|
+
files: import_zod25.z.array(import_zod25.z.unknown()).optional(),
|
|
933
|
+
finishReason: import_zod25.z.string().optional(),
|
|
934
|
+
reasoning: import_zod25.z.string().optional(),
|
|
935
|
+
reasoningDetails: import_zod25.z.unknown().optional(),
|
|
936
|
+
toolCalls: import_zod25.z.array(import_zod25.z.unknown()).optional(),
|
|
937
|
+
toolResults: import_zod25.z.array(import_zod25.z.unknown()).optional(),
|
|
938
|
+
warnings: import_zod25.z.array(import_zod25.z.unknown()).optional(),
|
|
939
|
+
sources: import_zod25.z.array(import_zod25.z.unknown()).optional(),
|
|
940
|
+
steps: import_zod25.z.array(import_zod25.z.unknown()),
|
|
941
|
+
generationTimeMs: import_zod25.z.number(),
|
|
942
|
+
prompt: import_zod25.z.string(),
|
|
943
|
+
systemPrompt: import_zod25.z.string(),
|
|
944
|
+
usage: import_zod25.z.object({
|
|
945
|
+
totalTokens: import_zod25.z.number().optional(),
|
|
946
|
+
totalMicrocentsSpent: import_zod25.z.number().optional()
|
|
909
947
|
})
|
|
910
948
|
});
|
|
911
|
-
var EvaluationResultSchema =
|
|
912
|
-
id:
|
|
913
|
-
runId:
|
|
914
|
-
timestamp:
|
|
949
|
+
var EvaluationResultSchema = import_zod25.z.object({
|
|
950
|
+
id: import_zod25.z.string(),
|
|
951
|
+
runId: import_zod25.z.string(),
|
|
952
|
+
timestamp: import_zod25.z.number(),
|
|
915
953
|
promptResult: PromptResultSchema,
|
|
916
|
-
testResults:
|
|
917
|
-
tags:
|
|
918
|
-
feedback:
|
|
919
|
-
score:
|
|
920
|
-
suiteId:
|
|
921
|
-
});
|
|
922
|
-
var LeanEvaluationResultSchema =
|
|
923
|
-
id:
|
|
924
|
-
runId:
|
|
925
|
-
timestamp:
|
|
926
|
-
tags:
|
|
927
|
-
scenarioId:
|
|
928
|
-
scenarioVersion:
|
|
929
|
-
targetId:
|
|
930
|
-
targetVersion:
|
|
931
|
-
suiteId:
|
|
932
|
-
score:
|
|
933
|
-
time:
|
|
934
|
-
microcentsSpent:
|
|
954
|
+
testResults: import_zod25.z.array(import_zod25.z.unknown()),
|
|
955
|
+
tags: import_zod25.z.array(import_zod25.z.string()).optional(),
|
|
956
|
+
feedback: import_zod25.z.string().optional(),
|
|
957
|
+
score: import_zod25.z.number(),
|
|
958
|
+
suiteId: import_zod25.z.string().optional()
|
|
959
|
+
});
|
|
960
|
+
var LeanEvaluationResultSchema = import_zod25.z.object({
|
|
961
|
+
id: import_zod25.z.string(),
|
|
962
|
+
runId: import_zod25.z.string(),
|
|
963
|
+
timestamp: import_zod25.z.number(),
|
|
964
|
+
tags: import_zod25.z.array(import_zod25.z.string()).optional(),
|
|
965
|
+
scenarioId: import_zod25.z.string(),
|
|
966
|
+
scenarioVersion: import_zod25.z.number().optional(),
|
|
967
|
+
targetId: import_zod25.z.string(),
|
|
968
|
+
targetVersion: import_zod25.z.number().optional(),
|
|
969
|
+
suiteId: import_zod25.z.string().optional(),
|
|
970
|
+
score: import_zod25.z.number(),
|
|
971
|
+
time: import_zod25.z.number().optional(),
|
|
972
|
+
microcentsSpent: import_zod25.z.number().optional()
|
|
935
973
|
});
|
|
936
974
|
|
|
937
975
|
// src/project/project.ts
|
|
938
|
-
var
|
|
976
|
+
var import_zod26 = require("zod");
|
|
939
977
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
940
|
-
appId:
|
|
941
|
-
appSecret:
|
|
978
|
+
appId: import_zod26.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
979
|
+
appSecret: import_zod26.z.string().optional().describe("The secret of the app in Dev Center")
|
|
942
980
|
});
|
|
943
981
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
944
982
|
id: true,
|
|
@@ -949,10 +987,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
|
|
|
949
987
|
var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
|
|
950
988
|
|
|
951
989
|
// src/template/template.ts
|
|
952
|
-
var
|
|
990
|
+
var import_zod27 = require("zod");
|
|
953
991
|
var TemplateSchema = TenantEntitySchema.extend({
|
|
954
992
|
/** URL to download the template from */
|
|
955
|
-
downloadUrl:
|
|
993
|
+
downloadUrl: import_zod27.z.url()
|
|
956
994
|
});
|
|
957
995
|
var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
958
996
|
id: true,
|
|
@@ -970,9 +1008,11 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
|
970
1008
|
ApiCallSchema,
|
|
971
1009
|
AssertionResultSchema,
|
|
972
1010
|
AssertionResultStatus,
|
|
1011
|
+
AssertionSchema,
|
|
973
1012
|
BaseEntitySchema,
|
|
974
1013
|
BaseTestSchema,
|
|
975
1014
|
BuildCheckTestSchema,
|
|
1015
|
+
BuildPassedAssertionSchema,
|
|
976
1016
|
CommandExecutionSchema,
|
|
977
1017
|
CommandExecutionTestSchema,
|
|
978
1018
|
CreateAgentInputSchema,
|
|
@@ -1014,6 +1054,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
|
1014
1054
|
LeanEvaluationResultSchema,
|
|
1015
1055
|
LiveTraceEventSchema,
|
|
1016
1056
|
LiveTraceEventType,
|
|
1057
|
+
LlmJudgeAssertionSchema,
|
|
1017
1058
|
LocalProjectConfigSchema,
|
|
1018
1059
|
MCPServerConfigSchema,
|
|
1019
1060
|
MetaSiteConfigSchema,
|
|
@@ -1029,6 +1070,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
|
1029
1070
|
SkillMetadataSchema,
|
|
1030
1071
|
SkillSchema,
|
|
1031
1072
|
SkillVersionSchema,
|
|
1073
|
+
SkillWasCalledAssertionSchema,
|
|
1032
1074
|
SkillsGroupSchema,
|
|
1033
1075
|
TRACE_EVENT_PREFIX,
|
|
1034
1076
|
TargetSchema,
|