@wix/evalforge-types 0.17.0 → 0.18.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/build/index.js +439 -320
- package/build/index.js.map +4 -4
- package/build/index.mjs +431 -320
- package/build/index.mjs.map +4 -4
- package/build/types/assertion/assertion.d.ts +95 -11
- package/build/types/assertion/index.d.ts +2 -1
- package/build/types/assertion/system-assertions.d.ts +42 -0
- package/build/types/scenario/test-scenario.d.ts +12 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -26,6 +26,8 @@ __export(index_exports, {
|
|
|
26
26
|
AllowedCommands: () => AllowedCommands,
|
|
27
27
|
ApiCallSchema: () => ApiCallSchema,
|
|
28
28
|
AssertionConfigSchema: () => AssertionConfigSchema,
|
|
29
|
+
AssertionParameterSchema: () => AssertionParameterSchema,
|
|
30
|
+
AssertionParameterTypeSchema: () => AssertionParameterTypeSchema,
|
|
29
31
|
AssertionResultSchema: () => AssertionResultSchema,
|
|
30
32
|
AssertionResultStatus: () => AssertionResultStatus,
|
|
31
33
|
AssertionSchema: () => AssertionSchema,
|
|
@@ -92,6 +94,9 @@ __export(index_exports, {
|
|
|
92
94
|
ProjectSchema: () => ProjectSchema,
|
|
93
95
|
PromptResultSchema: () => PromptResultSchema,
|
|
94
96
|
SKILL_FOLDER_NAME_REGEX: () => SKILL_FOLDER_NAME_REGEX,
|
|
97
|
+
SYSTEM_ASSERTIONS: () => SYSTEM_ASSERTIONS,
|
|
98
|
+
SYSTEM_ASSERTION_IDS: () => SYSTEM_ASSERTION_IDS,
|
|
99
|
+
ScenarioAssertionLinkSchema: () => ScenarioAssertionLinkSchema,
|
|
95
100
|
SiteConfigTestSchema: () => SiteConfigTestSchema,
|
|
96
101
|
SkillMetadataSchema: () => SkillMetadataSchema,
|
|
97
102
|
SkillSchema: () => SkillSchema,
|
|
@@ -130,6 +135,9 @@ __export(index_exports, {
|
|
|
130
135
|
getBuildPassedConfig: () => getBuildPassedConfig,
|
|
131
136
|
getLlmJudgeConfig: () => getLlmJudgeConfig,
|
|
132
137
|
getSkillWasCalledConfig: () => getSkillWasCalledConfig,
|
|
138
|
+
getSystemAssertion: () => getSystemAssertion,
|
|
139
|
+
getSystemAssertions: () => getSystemAssertions,
|
|
140
|
+
isSystemAssertionId: () => isSystemAssertionId,
|
|
133
141
|
isValidSkillFolderName: () => isValidSkillFolderName,
|
|
134
142
|
parseTraceEventLine: () => parseTraceEventLine,
|
|
135
143
|
validateAssertionConfig: () => validateAssertionConfig
|
|
@@ -592,22 +600,145 @@ var EnvironmentSchema = import_zod19.z.object({
|
|
|
592
600
|
});
|
|
593
601
|
|
|
594
602
|
// src/scenario/test-scenario.ts
|
|
603
|
+
var import_zod21 = require("zod");
|
|
604
|
+
|
|
605
|
+
// src/assertion/assertion.ts
|
|
595
606
|
var import_zod20 = require("zod");
|
|
596
|
-
var
|
|
607
|
+
var AssertionTypeSchema = import_zod20.z.enum([
|
|
608
|
+
"skill_was_called",
|
|
609
|
+
"build_passed",
|
|
610
|
+
"llm_judge"
|
|
611
|
+
]);
|
|
612
|
+
var AssertionParameterTypeSchema = import_zod20.z.enum([
|
|
613
|
+
"string",
|
|
614
|
+
"number",
|
|
615
|
+
"boolean"
|
|
616
|
+
]);
|
|
617
|
+
var AssertionParameterSchema = import_zod20.z.object({
|
|
618
|
+
/** Parameter name (used as key in params object) */
|
|
619
|
+
name: import_zod20.z.string().min(1),
|
|
620
|
+
/** Display label for the parameter */
|
|
621
|
+
label: import_zod20.z.string().min(1),
|
|
622
|
+
/** Parameter type */
|
|
623
|
+
type: AssertionParameterTypeSchema,
|
|
624
|
+
/** Whether this parameter is required */
|
|
625
|
+
required: import_zod20.z.boolean(),
|
|
626
|
+
/** Default value (optional, used when not provided) */
|
|
627
|
+
defaultValue: import_zod20.z.union([import_zod20.z.string(), import_zod20.z.number(), import_zod20.z.boolean()]).optional()
|
|
628
|
+
});
|
|
629
|
+
var ScenarioAssertionLinkSchema = import_zod20.z.object({
|
|
630
|
+
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
631
|
+
assertionId: import_zod20.z.string(),
|
|
632
|
+
/** Parameter values for this assertion in this scenario */
|
|
633
|
+
params: import_zod20.z.record(
|
|
634
|
+
import_zod20.z.string(),
|
|
635
|
+
import_zod20.z.union([import_zod20.z.string(), import_zod20.z.number(), import_zod20.z.boolean(), import_zod20.z.null()])
|
|
636
|
+
).optional()
|
|
637
|
+
});
|
|
638
|
+
var SkillWasCalledConfigSchema = import_zod20.z.object({
|
|
639
|
+
/** Name of the skill that must have been called */
|
|
640
|
+
skillName: import_zod20.z.string().min(1)
|
|
641
|
+
});
|
|
642
|
+
var BuildPassedConfigSchema = import_zod20.z.strictObject({
|
|
643
|
+
/** Command to run (default: "yarn build") */
|
|
644
|
+
command: import_zod20.z.string().optional(),
|
|
645
|
+
/** Expected exit code (default: 0) */
|
|
646
|
+
expectedExitCode: import_zod20.z.number().int().optional()
|
|
647
|
+
});
|
|
648
|
+
var LlmJudgeConfigSchema = import_zod20.z.object({
|
|
649
|
+
/**
|
|
650
|
+
* Prompt template with placeholders:
|
|
651
|
+
* - {{output}}: agent's final output
|
|
652
|
+
* - {{cwd}}: working directory
|
|
653
|
+
* - {{changedFiles}}: all files changed (new, modified)
|
|
654
|
+
* - {{modifiedFiles}}: only existing files that were modified
|
|
655
|
+
* - {{newFiles}}: only new files that were created
|
|
656
|
+
* - {{trace}}: step-by-step trace of tool calls
|
|
657
|
+
* - Custom parameters defined in the parameters array
|
|
658
|
+
*/
|
|
659
|
+
prompt: import_zod20.z.string().min(1),
|
|
660
|
+
/** Optional system prompt for the judge */
|
|
661
|
+
systemPrompt: import_zod20.z.string().optional(),
|
|
662
|
+
/** Minimum score to pass (0-100, default 70) */
|
|
663
|
+
minScore: import_zod20.z.number().int().min(0).max(100).optional(),
|
|
664
|
+
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
665
|
+
model: import_zod20.z.string().optional(),
|
|
666
|
+
/** Max output tokens */
|
|
667
|
+
maxTokens: import_zod20.z.number().int().optional(),
|
|
668
|
+
/** Temperature (0-1) */
|
|
669
|
+
temperature: import_zod20.z.number().min(0).max(1).optional(),
|
|
670
|
+
/** User-defined parameters for this assertion */
|
|
671
|
+
parameters: import_zod20.z.array(AssertionParameterSchema).optional()
|
|
672
|
+
});
|
|
673
|
+
var AssertionConfigSchema = import_zod20.z.union([
|
|
674
|
+
LlmJudgeConfigSchema,
|
|
675
|
+
// requires prompt - check first
|
|
676
|
+
SkillWasCalledConfigSchema,
|
|
677
|
+
// requires skillName
|
|
678
|
+
BuildPassedConfigSchema,
|
|
679
|
+
// all optional, uses strictObject to reject unknown keys
|
|
680
|
+
import_zod20.z.object({})
|
|
681
|
+
// fallback empty config
|
|
682
|
+
]);
|
|
683
|
+
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
684
|
+
/** The assertion type */
|
|
685
|
+
type: AssertionTypeSchema,
|
|
686
|
+
/** Type-specific configuration */
|
|
687
|
+
config: AssertionConfigSchema
|
|
688
|
+
});
|
|
689
|
+
var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
|
|
690
|
+
id: true,
|
|
691
|
+
createdAt: true,
|
|
692
|
+
updatedAt: true,
|
|
693
|
+
deleted: true
|
|
694
|
+
});
|
|
695
|
+
var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
|
|
696
|
+
function validateAssertionConfig(type, config) {
|
|
697
|
+
switch (type) {
|
|
698
|
+
case "skill_was_called":
|
|
699
|
+
return SkillWasCalledConfigSchema.safeParse(config).success;
|
|
700
|
+
case "build_passed":
|
|
701
|
+
return BuildPassedConfigSchema.safeParse(config).success;
|
|
702
|
+
case "llm_judge":
|
|
703
|
+
return LlmJudgeConfigSchema.safeParse(config).success;
|
|
704
|
+
default:
|
|
705
|
+
return false;
|
|
706
|
+
}
|
|
707
|
+
}
|
|
708
|
+
function getSkillWasCalledConfig(assertion) {
|
|
709
|
+
if (assertion.type !== "skill_was_called") return null;
|
|
710
|
+
const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
|
|
711
|
+
return result.success ? result.data : null;
|
|
712
|
+
}
|
|
713
|
+
function getBuildPassedConfig(assertion) {
|
|
714
|
+
if (assertion.type !== "build_passed") return null;
|
|
715
|
+
const result = BuildPassedConfigSchema.safeParse(assertion.config);
|
|
716
|
+
return result.success ? result.data : null;
|
|
717
|
+
}
|
|
718
|
+
function getLlmJudgeConfig(assertion) {
|
|
719
|
+
if (assertion.type !== "llm_judge") return null;
|
|
720
|
+
const result = LlmJudgeConfigSchema.safeParse(assertion.config);
|
|
721
|
+
return result.success ? result.data : null;
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
// src/scenario/test-scenario.ts
|
|
725
|
+
var ExpectedFileSchema = import_zod21.z.object({
|
|
597
726
|
/** Relative path where the file should be created */
|
|
598
|
-
path:
|
|
727
|
+
path: import_zod21.z.string(),
|
|
599
728
|
/** Optional expected content */
|
|
600
|
-
content:
|
|
729
|
+
content: import_zod21.z.string().optional()
|
|
601
730
|
});
|
|
602
731
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
603
732
|
/** The prompt sent to the agent to trigger the task */
|
|
604
|
-
triggerPrompt:
|
|
733
|
+
triggerPrompt: import_zod21.z.string().min(10),
|
|
605
734
|
/** ID of the template to use for this scenario (null = no template) */
|
|
606
|
-
templateId:
|
|
735
|
+
templateId: import_zod21.z.string().nullish(),
|
|
607
736
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
608
|
-
assertions:
|
|
609
|
-
/** IDs of saved assertions to evaluate (from assertions table) */
|
|
610
|
-
assertionIds:
|
|
737
|
+
assertions: import_zod21.z.array(AssertionSchema).optional(),
|
|
738
|
+
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
739
|
+
assertionIds: import_zod21.z.array(import_zod21.z.string()).optional(),
|
|
740
|
+
/** Linked assertions with per-scenario parameter values */
|
|
741
|
+
assertionLinks: import_zod21.z.array(ScenarioAssertionLinkSchema).optional()
|
|
611
742
|
});
|
|
612
743
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
613
744
|
id: true,
|
|
@@ -618,10 +749,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
618
749
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
619
750
|
|
|
620
751
|
// src/suite/test-suite.ts
|
|
621
|
-
var
|
|
752
|
+
var import_zod22 = require("zod");
|
|
622
753
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
623
754
|
/** IDs of test scenarios in this suite */
|
|
624
|
-
scenarioIds:
|
|
755
|
+
scenarioIds: import_zod22.z.array(import_zod22.z.string())
|
|
625
756
|
});
|
|
626
757
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
627
758
|
id: true,
|
|
@@ -632,21 +763,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
632
763
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
633
764
|
|
|
634
765
|
// src/evaluation/metrics.ts
|
|
635
|
-
var
|
|
636
|
-
var TokenUsageSchema =
|
|
637
|
-
prompt:
|
|
638
|
-
completion:
|
|
639
|
-
total:
|
|
640
|
-
});
|
|
641
|
-
var EvalMetricsSchema =
|
|
642
|
-
totalAssertions:
|
|
643
|
-
passed:
|
|
644
|
-
failed:
|
|
645
|
-
skipped:
|
|
646
|
-
errors:
|
|
647
|
-
passRate:
|
|
648
|
-
avgDuration:
|
|
649
|
-
totalDuration:
|
|
766
|
+
var import_zod23 = require("zod");
|
|
767
|
+
var TokenUsageSchema = import_zod23.z.object({
|
|
768
|
+
prompt: import_zod23.z.number(),
|
|
769
|
+
completion: import_zod23.z.number(),
|
|
770
|
+
total: import_zod23.z.number()
|
|
771
|
+
});
|
|
772
|
+
var EvalMetricsSchema = import_zod23.z.object({
|
|
773
|
+
totalAssertions: import_zod23.z.number(),
|
|
774
|
+
passed: import_zod23.z.number(),
|
|
775
|
+
failed: import_zod23.z.number(),
|
|
776
|
+
skipped: import_zod23.z.number(),
|
|
777
|
+
errors: import_zod23.z.number(),
|
|
778
|
+
passRate: import_zod23.z.number(),
|
|
779
|
+
avgDuration: import_zod23.z.number(),
|
|
780
|
+
totalDuration: import_zod23.z.number()
|
|
650
781
|
});
|
|
651
782
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
652
783
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -656,7 +787,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
656
787
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
657
788
|
return EvalStatus2;
|
|
658
789
|
})(EvalStatus || {});
|
|
659
|
-
var EvalStatusSchema =
|
|
790
|
+
var EvalStatusSchema = import_zod23.z.enum(EvalStatus);
|
|
660
791
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
661
792
|
LLMStepType2["COMPLETION"] = "completion";
|
|
662
793
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -664,52 +795,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
664
795
|
LLMStepType2["THINKING"] = "thinking";
|
|
665
796
|
return LLMStepType2;
|
|
666
797
|
})(LLMStepType || {});
|
|
667
|
-
var LLMTraceStepSchema =
|
|
668
|
-
id:
|
|
669
|
-
stepNumber:
|
|
670
|
-
type:
|
|
671
|
-
model:
|
|
672
|
-
provider:
|
|
673
|
-
startedAt:
|
|
674
|
-
durationMs:
|
|
798
|
+
var LLMTraceStepSchema = import_zod23.z.object({
|
|
799
|
+
id: import_zod23.z.string(),
|
|
800
|
+
stepNumber: import_zod23.z.number(),
|
|
801
|
+
type: import_zod23.z.enum(LLMStepType),
|
|
802
|
+
model: import_zod23.z.string(),
|
|
803
|
+
provider: import_zod23.z.string(),
|
|
804
|
+
startedAt: import_zod23.z.string(),
|
|
805
|
+
durationMs: import_zod23.z.number(),
|
|
675
806
|
tokenUsage: TokenUsageSchema,
|
|
676
|
-
costUsd:
|
|
677
|
-
toolName:
|
|
678
|
-
toolArguments:
|
|
679
|
-
inputPreview:
|
|
680
|
-
outputPreview:
|
|
681
|
-
success:
|
|
682
|
-
error:
|
|
683
|
-
});
|
|
684
|
-
var LLMBreakdownStatsSchema =
|
|
685
|
-
count:
|
|
686
|
-
durationMs:
|
|
687
|
-
tokens:
|
|
688
|
-
costUsd:
|
|
689
|
-
});
|
|
690
|
-
var LLMTraceSummarySchema =
|
|
691
|
-
totalSteps:
|
|
692
|
-
totalDurationMs:
|
|
807
|
+
costUsd: import_zod23.z.number(),
|
|
808
|
+
toolName: import_zod23.z.string().optional(),
|
|
809
|
+
toolArguments: import_zod23.z.string().optional(),
|
|
810
|
+
inputPreview: import_zod23.z.string().optional(),
|
|
811
|
+
outputPreview: import_zod23.z.string().optional(),
|
|
812
|
+
success: import_zod23.z.boolean(),
|
|
813
|
+
error: import_zod23.z.string().optional()
|
|
814
|
+
});
|
|
815
|
+
var LLMBreakdownStatsSchema = import_zod23.z.object({
|
|
816
|
+
count: import_zod23.z.number(),
|
|
817
|
+
durationMs: import_zod23.z.number(),
|
|
818
|
+
tokens: import_zod23.z.number(),
|
|
819
|
+
costUsd: import_zod23.z.number()
|
|
820
|
+
});
|
|
821
|
+
var LLMTraceSummarySchema = import_zod23.z.object({
|
|
822
|
+
totalSteps: import_zod23.z.number(),
|
|
823
|
+
totalDurationMs: import_zod23.z.number(),
|
|
693
824
|
totalTokens: TokenUsageSchema,
|
|
694
|
-
totalCostUsd:
|
|
695
|
-
stepTypeBreakdown:
|
|
696
|
-
modelBreakdown:
|
|
697
|
-
modelsUsed:
|
|
698
|
-
});
|
|
699
|
-
var LLMTraceSchema =
|
|
700
|
-
id:
|
|
701
|
-
steps:
|
|
825
|
+
totalCostUsd: import_zod23.z.number(),
|
|
826
|
+
stepTypeBreakdown: import_zod23.z.record(import_zod23.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
827
|
+
modelBreakdown: import_zod23.z.record(import_zod23.z.string(), LLMBreakdownStatsSchema),
|
|
828
|
+
modelsUsed: import_zod23.z.array(import_zod23.z.string())
|
|
829
|
+
});
|
|
830
|
+
var LLMTraceSchema = import_zod23.z.object({
|
|
831
|
+
id: import_zod23.z.string(),
|
|
832
|
+
steps: import_zod23.z.array(LLMTraceStepSchema),
|
|
702
833
|
summary: LLMTraceSummarySchema
|
|
703
834
|
});
|
|
704
835
|
|
|
705
836
|
// src/evaluation/eval-result.ts
|
|
706
|
-
var
|
|
837
|
+
var import_zod26 = require("zod");
|
|
707
838
|
|
|
708
839
|
// src/evaluation/eval-run.ts
|
|
709
|
-
var
|
|
840
|
+
var import_zod25 = require("zod");
|
|
710
841
|
|
|
711
842
|
// src/evaluation/live-trace.ts
|
|
712
|
-
var
|
|
843
|
+
var import_zod24 = require("zod");
|
|
713
844
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
714
845
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
715
846
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -723,37 +854,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
723
854
|
LiveTraceEventType2["USER"] = "user";
|
|
724
855
|
return LiveTraceEventType2;
|
|
725
856
|
})(LiveTraceEventType || {});
|
|
726
|
-
var LiveTraceEventSchema =
|
|
857
|
+
var LiveTraceEventSchema = import_zod24.z.object({
|
|
727
858
|
/** The evaluation run ID */
|
|
728
|
-
evalRunId:
|
|
859
|
+
evalRunId: import_zod24.z.string(),
|
|
729
860
|
/** The scenario ID being executed */
|
|
730
|
-
scenarioId:
|
|
861
|
+
scenarioId: import_zod24.z.string(),
|
|
731
862
|
/** The scenario name for display */
|
|
732
|
-
scenarioName:
|
|
863
|
+
scenarioName: import_zod24.z.string(),
|
|
733
864
|
/** The target ID (skill, agent, etc.) */
|
|
734
|
-
targetId:
|
|
865
|
+
targetId: import_zod24.z.string(),
|
|
735
866
|
/** The target name for display */
|
|
736
|
-
targetName:
|
|
867
|
+
targetName: import_zod24.z.string(),
|
|
737
868
|
/** Step number in the current scenario execution */
|
|
738
|
-
stepNumber:
|
|
869
|
+
stepNumber: import_zod24.z.number(),
|
|
739
870
|
/** Type of trace event */
|
|
740
|
-
type:
|
|
871
|
+
type: import_zod24.z.enum(LiveTraceEventType),
|
|
741
872
|
/** Tool name if this is a tool_use event */
|
|
742
|
-
toolName:
|
|
873
|
+
toolName: import_zod24.z.string().optional(),
|
|
743
874
|
/** Tool arguments preview (truncated JSON) */
|
|
744
|
-
toolArgs:
|
|
875
|
+
toolArgs: import_zod24.z.string().optional(),
|
|
745
876
|
/** Output preview (truncated text) */
|
|
746
|
-
outputPreview:
|
|
877
|
+
outputPreview: import_zod24.z.string().optional(),
|
|
747
878
|
/** File path for file operations */
|
|
748
|
-
filePath:
|
|
879
|
+
filePath: import_zod24.z.string().optional(),
|
|
749
880
|
/** Elapsed time in milliseconds for progress events */
|
|
750
|
-
elapsedMs:
|
|
881
|
+
elapsedMs: import_zod24.z.number().optional(),
|
|
751
882
|
/** Thinking/reasoning text from Claude */
|
|
752
|
-
thinking:
|
|
883
|
+
thinking: import_zod24.z.string().optional(),
|
|
753
884
|
/** Timestamp when this event occurred */
|
|
754
|
-
timestamp:
|
|
885
|
+
timestamp: import_zod24.z.string(),
|
|
755
886
|
/** Whether this is the final event for this scenario */
|
|
756
|
-
isComplete:
|
|
887
|
+
isComplete: import_zod24.z.boolean()
|
|
757
888
|
});
|
|
758
889
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
759
890
|
function parseTraceEventLine(line) {
|
|
@@ -781,14 +912,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
781
912
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
782
913
|
return TriggerType2;
|
|
783
914
|
})(TriggerType || {});
|
|
784
|
-
var TriggerMetadataSchema =
|
|
785
|
-
version:
|
|
786
|
-
resourceUpdated:
|
|
915
|
+
var TriggerMetadataSchema = import_zod25.z.object({
|
|
916
|
+
version: import_zod25.z.string().optional(),
|
|
917
|
+
resourceUpdated: import_zod25.z.array(import_zod25.z.string()).optional()
|
|
787
918
|
});
|
|
788
|
-
var TriggerSchema =
|
|
789
|
-
id:
|
|
919
|
+
var TriggerSchema = import_zod25.z.object({
|
|
920
|
+
id: import_zod25.z.string(),
|
|
790
921
|
metadata: TriggerMetadataSchema.optional(),
|
|
791
|
-
type:
|
|
922
|
+
type: import_zod25.z.enum(TriggerType)
|
|
792
923
|
});
|
|
793
924
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
794
925
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -806,28 +937,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
806
937
|
FailureSeverity2["LOW"] = "low";
|
|
807
938
|
return FailureSeverity2;
|
|
808
939
|
})(FailureSeverity || {});
|
|
809
|
-
var DiffLineTypeSchema =
|
|
810
|
-
var DiffLineSchema =
|
|
940
|
+
var DiffLineTypeSchema = import_zod25.z.enum(["added", "removed", "unchanged"]);
|
|
941
|
+
var DiffLineSchema = import_zod25.z.object({
|
|
811
942
|
type: DiffLineTypeSchema,
|
|
812
|
-
content:
|
|
813
|
-
lineNumber:
|
|
814
|
-
});
|
|
815
|
-
var DiffContentSchema =
|
|
816
|
-
path:
|
|
817
|
-
expected:
|
|
818
|
-
actual:
|
|
819
|
-
diffLines:
|
|
820
|
-
renamedFrom:
|
|
821
|
-
});
|
|
822
|
-
var CommandExecutionSchema =
|
|
823
|
-
command:
|
|
824
|
-
exitCode:
|
|
825
|
-
output:
|
|
826
|
-
duration:
|
|
827
|
-
});
|
|
828
|
-
var FileModificationSchema =
|
|
829
|
-
path:
|
|
830
|
-
action:
|
|
943
|
+
content: import_zod25.z.string(),
|
|
944
|
+
lineNumber: import_zod25.z.number()
|
|
945
|
+
});
|
|
946
|
+
var DiffContentSchema = import_zod25.z.object({
|
|
947
|
+
path: import_zod25.z.string(),
|
|
948
|
+
expected: import_zod25.z.string(),
|
|
949
|
+
actual: import_zod25.z.string(),
|
|
950
|
+
diffLines: import_zod25.z.array(DiffLineSchema),
|
|
951
|
+
renamedFrom: import_zod25.z.string().optional()
|
|
952
|
+
});
|
|
953
|
+
var CommandExecutionSchema = import_zod25.z.object({
|
|
954
|
+
command: import_zod25.z.string(),
|
|
955
|
+
exitCode: import_zod25.z.number(),
|
|
956
|
+
output: import_zod25.z.string().optional(),
|
|
957
|
+
duration: import_zod25.z.number()
|
|
958
|
+
});
|
|
959
|
+
var FileModificationSchema = import_zod25.z.object({
|
|
960
|
+
path: import_zod25.z.string(),
|
|
961
|
+
action: import_zod25.z.enum(["created", "modified", "deleted"])
|
|
831
962
|
});
|
|
832
963
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
833
964
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -835,75 +966,75 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
835
966
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
836
967
|
return TemplateFileStatus2;
|
|
837
968
|
})(TemplateFileStatus || {});
|
|
838
|
-
var TemplateFileSchema =
|
|
969
|
+
var TemplateFileSchema = import_zod25.z.object({
|
|
839
970
|
/** Relative path within the template */
|
|
840
|
-
path:
|
|
971
|
+
path: import_zod25.z.string(),
|
|
841
972
|
/** Full file content after execution */
|
|
842
|
-
content:
|
|
973
|
+
content: import_zod25.z.string(),
|
|
843
974
|
/** File status (new, modified, unchanged) */
|
|
844
|
-
status:
|
|
845
|
-
});
|
|
846
|
-
var ApiCallSchema =
|
|
847
|
-
endpoint:
|
|
848
|
-
tokensUsed:
|
|
849
|
-
duration:
|
|
850
|
-
});
|
|
851
|
-
var ExecutionTraceSchema =
|
|
852
|
-
commands:
|
|
853
|
-
filesModified:
|
|
854
|
-
apiCalls:
|
|
855
|
-
totalDuration:
|
|
856
|
-
});
|
|
857
|
-
var FailureAnalysisSchema =
|
|
858
|
-
category:
|
|
859
|
-
severity:
|
|
860
|
-
summary:
|
|
861
|
-
details:
|
|
862
|
-
rootCause:
|
|
863
|
-
suggestedFix:
|
|
864
|
-
relatedAssertions:
|
|
865
|
-
codeSnippet:
|
|
866
|
-
similarIssues:
|
|
867
|
-
patternId:
|
|
975
|
+
status: import_zod25.z.enum(["new", "modified", "unchanged"])
|
|
976
|
+
});
|
|
977
|
+
var ApiCallSchema = import_zod25.z.object({
|
|
978
|
+
endpoint: import_zod25.z.string(),
|
|
979
|
+
tokensUsed: import_zod25.z.number(),
|
|
980
|
+
duration: import_zod25.z.number()
|
|
981
|
+
});
|
|
982
|
+
var ExecutionTraceSchema = import_zod25.z.object({
|
|
983
|
+
commands: import_zod25.z.array(CommandExecutionSchema),
|
|
984
|
+
filesModified: import_zod25.z.array(FileModificationSchema),
|
|
985
|
+
apiCalls: import_zod25.z.array(ApiCallSchema),
|
|
986
|
+
totalDuration: import_zod25.z.number()
|
|
987
|
+
});
|
|
988
|
+
var FailureAnalysisSchema = import_zod25.z.object({
|
|
989
|
+
category: import_zod25.z.enum(FailureCategory),
|
|
990
|
+
severity: import_zod25.z.enum(FailureSeverity),
|
|
991
|
+
summary: import_zod25.z.string(),
|
|
992
|
+
details: import_zod25.z.string(),
|
|
993
|
+
rootCause: import_zod25.z.string(),
|
|
994
|
+
suggestedFix: import_zod25.z.string(),
|
|
995
|
+
relatedAssertions: import_zod25.z.array(import_zod25.z.string()),
|
|
996
|
+
codeSnippet: import_zod25.z.string().optional(),
|
|
997
|
+
similarIssues: import_zod25.z.array(import_zod25.z.string()).optional(),
|
|
998
|
+
patternId: import_zod25.z.string().optional(),
|
|
868
999
|
// Extended fields for detailed debugging
|
|
869
1000
|
diff: DiffContentSchema.optional(),
|
|
870
1001
|
executionTrace: ExecutionTraceSchema.optional()
|
|
871
1002
|
});
|
|
872
1003
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
873
1004
|
/** Agent ID for this run */
|
|
874
|
-
agentId:
|
|
1005
|
+
agentId: import_zod25.z.string().optional(),
|
|
875
1006
|
/** Skills group ID for this run */
|
|
876
|
-
skillsGroupId:
|
|
1007
|
+
skillsGroupId: import_zod25.z.string().optional(),
|
|
877
1008
|
/** Scenario IDs to run */
|
|
878
|
-
scenarioIds:
|
|
1009
|
+
scenarioIds: import_zod25.z.array(import_zod25.z.string()),
|
|
879
1010
|
/** Current status */
|
|
880
1011
|
status: EvalStatusSchema,
|
|
881
1012
|
/** Progress percentage (0-100) */
|
|
882
|
-
progress:
|
|
1013
|
+
progress: import_zod25.z.number(),
|
|
883
1014
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
884
|
-
results:
|
|
1015
|
+
results: import_zod25.z.array(import_zod25.z.lazy(() => EvalRunResultSchema)),
|
|
885
1016
|
/** Aggregated metrics across all results */
|
|
886
1017
|
aggregateMetrics: EvalMetricsSchema,
|
|
887
1018
|
/** Failure analyses */
|
|
888
|
-
failureAnalyses:
|
|
1019
|
+
failureAnalyses: import_zod25.z.array(FailureAnalysisSchema).optional(),
|
|
889
1020
|
/** Aggregated LLM trace summary */
|
|
890
1021
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
891
1022
|
/** What triggered this run */
|
|
892
1023
|
trigger: TriggerSchema.optional(),
|
|
893
1024
|
/** When the run started (set when evaluation is triggered) */
|
|
894
|
-
startedAt:
|
|
1025
|
+
startedAt: import_zod25.z.string().optional(),
|
|
895
1026
|
/** When the run completed */
|
|
896
|
-
completedAt:
|
|
1027
|
+
completedAt: import_zod25.z.string().optional(),
|
|
897
1028
|
/** Live trace events captured during execution (for playback on results page) */
|
|
898
|
-
liveTraceEvents:
|
|
1029
|
+
liveTraceEvents: import_zod25.z.array(LiveTraceEventSchema).optional(),
|
|
899
1030
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
900
|
-
jobId:
|
|
1031
|
+
jobId: import_zod25.z.string().optional(),
|
|
901
1032
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
902
|
-
jobStatus:
|
|
1033
|
+
jobStatus: import_zod25.z.string().optional(),
|
|
903
1034
|
/** Remote job error message if the job failed */
|
|
904
|
-
jobError:
|
|
1035
|
+
jobError: import_zod25.z.string().optional(),
|
|
905
1036
|
/** Timestamp of the last job status check */
|
|
906
|
-
jobStatusCheckedAt:
|
|
1037
|
+
jobStatusCheckedAt: import_zod25.z.string().optional()
|
|
907
1038
|
});
|
|
908
1039
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
909
1040
|
id: true,
|
|
@@ -916,28 +1047,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
916
1047
|
startedAt: true,
|
|
917
1048
|
completedAt: true
|
|
918
1049
|
});
|
|
919
|
-
var EvaluationProgressSchema =
|
|
920
|
-
runId:
|
|
921
|
-
targetId:
|
|
922
|
-
totalScenarios:
|
|
923
|
-
completedScenarios:
|
|
924
|
-
scenarioProgress:
|
|
925
|
-
|
|
926
|
-
scenarioId:
|
|
927
|
-
currentStep:
|
|
928
|
-
error:
|
|
1050
|
+
var EvaluationProgressSchema = import_zod25.z.object({
|
|
1051
|
+
runId: import_zod25.z.string(),
|
|
1052
|
+
targetId: import_zod25.z.string(),
|
|
1053
|
+
totalScenarios: import_zod25.z.number(),
|
|
1054
|
+
completedScenarios: import_zod25.z.number(),
|
|
1055
|
+
scenarioProgress: import_zod25.z.array(
|
|
1056
|
+
import_zod25.z.object({
|
|
1057
|
+
scenarioId: import_zod25.z.string(),
|
|
1058
|
+
currentStep: import_zod25.z.string(),
|
|
1059
|
+
error: import_zod25.z.string().optional()
|
|
929
1060
|
})
|
|
930
1061
|
),
|
|
931
|
-
createdAt:
|
|
1062
|
+
createdAt: import_zod25.z.number()
|
|
932
1063
|
});
|
|
933
|
-
var EvaluationLogSchema =
|
|
934
|
-
runId:
|
|
935
|
-
scenarioId:
|
|
936
|
-
log:
|
|
937
|
-
level:
|
|
938
|
-
message:
|
|
939
|
-
args:
|
|
940
|
-
error:
|
|
1064
|
+
var EvaluationLogSchema = import_zod25.z.object({
|
|
1065
|
+
runId: import_zod25.z.string(),
|
|
1066
|
+
scenarioId: import_zod25.z.string(),
|
|
1067
|
+
log: import_zod25.z.object({
|
|
1068
|
+
level: import_zod25.z.enum(["info", "error", "debug"]),
|
|
1069
|
+
message: import_zod25.z.string().optional(),
|
|
1070
|
+
args: import_zod25.z.array(import_zod25.z.any()).optional(),
|
|
1071
|
+
error: import_zod25.z.string().optional()
|
|
941
1072
|
})
|
|
942
1073
|
});
|
|
943
1074
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -950,91 +1081,91 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
950
1081
|
AssertionResultStatus2["ERROR"] = "error";
|
|
951
1082
|
return AssertionResultStatus2;
|
|
952
1083
|
})(AssertionResultStatus || {});
|
|
953
|
-
var AssertionResultSchema =
|
|
954
|
-
id:
|
|
955
|
-
assertionId:
|
|
956
|
-
assertionType:
|
|
957
|
-
assertionName:
|
|
958
|
-
status:
|
|
959
|
-
message:
|
|
960
|
-
expected:
|
|
961
|
-
actual:
|
|
962
|
-
duration:
|
|
963
|
-
details:
|
|
964
|
-
llmTraceSteps:
|
|
965
|
-
});
|
|
966
|
-
var EvalRunResultSchema =
|
|
967
|
-
id:
|
|
968
|
-
targetId:
|
|
969
|
-
targetName:
|
|
970
|
-
scenarioId:
|
|
971
|
-
scenarioName:
|
|
1084
|
+
var AssertionResultSchema = import_zod26.z.object({
|
|
1085
|
+
id: import_zod26.z.string(),
|
|
1086
|
+
assertionId: import_zod26.z.string(),
|
|
1087
|
+
assertionType: import_zod26.z.string(),
|
|
1088
|
+
assertionName: import_zod26.z.string(),
|
|
1089
|
+
status: import_zod26.z.enum(AssertionResultStatus),
|
|
1090
|
+
message: import_zod26.z.string().optional(),
|
|
1091
|
+
expected: import_zod26.z.string().optional(),
|
|
1092
|
+
actual: import_zod26.z.string().optional(),
|
|
1093
|
+
duration: import_zod26.z.number().optional(),
|
|
1094
|
+
details: import_zod26.z.record(import_zod26.z.string(), import_zod26.z.unknown()).optional(),
|
|
1095
|
+
llmTraceSteps: import_zod26.z.array(LLMTraceStepSchema).optional()
|
|
1096
|
+
});
|
|
1097
|
+
var EvalRunResultSchema = import_zod26.z.object({
|
|
1098
|
+
id: import_zod26.z.string(),
|
|
1099
|
+
targetId: import_zod26.z.string(),
|
|
1100
|
+
targetName: import_zod26.z.string().optional(),
|
|
1101
|
+
scenarioId: import_zod26.z.string(),
|
|
1102
|
+
scenarioName: import_zod26.z.string(),
|
|
972
1103
|
modelConfig: ModelConfigSchema.optional(),
|
|
973
|
-
assertionResults:
|
|
1104
|
+
assertionResults: import_zod26.z.array(AssertionResultSchema),
|
|
974
1105
|
metrics: EvalMetricsSchema.optional(),
|
|
975
|
-
passed:
|
|
976
|
-
failed:
|
|
977
|
-
passRate:
|
|
978
|
-
duration:
|
|
979
|
-
outputText:
|
|
980
|
-
files:
|
|
981
|
-
fileDiffs:
|
|
1106
|
+
passed: import_zod26.z.number(),
|
|
1107
|
+
failed: import_zod26.z.number(),
|
|
1108
|
+
passRate: import_zod26.z.number(),
|
|
1109
|
+
duration: import_zod26.z.number(),
|
|
1110
|
+
outputText: import_zod26.z.string().optional(),
|
|
1111
|
+
files: import_zod26.z.array(ExpectedFileSchema).optional(),
|
|
1112
|
+
fileDiffs: import_zod26.z.array(DiffContentSchema).optional(),
|
|
982
1113
|
/** Full template files after execution with status indicators */
|
|
983
|
-
templateFiles:
|
|
984
|
-
startedAt:
|
|
985
|
-
completedAt:
|
|
1114
|
+
templateFiles: import_zod26.z.array(TemplateFileSchema).optional(),
|
|
1115
|
+
startedAt: import_zod26.z.string().optional(),
|
|
1116
|
+
completedAt: import_zod26.z.string().optional(),
|
|
986
1117
|
llmTrace: LLMTraceSchema.optional()
|
|
987
1118
|
});
|
|
988
|
-
var PromptResultSchema =
|
|
989
|
-
text:
|
|
990
|
-
files:
|
|
991
|
-
finishReason:
|
|
992
|
-
reasoning:
|
|
993
|
-
reasoningDetails:
|
|
994
|
-
toolCalls:
|
|
995
|
-
toolResults:
|
|
996
|
-
warnings:
|
|
997
|
-
sources:
|
|
998
|
-
steps:
|
|
999
|
-
generationTimeMs:
|
|
1000
|
-
prompt:
|
|
1001
|
-
systemPrompt:
|
|
1002
|
-
usage:
|
|
1003
|
-
totalTokens:
|
|
1004
|
-
totalMicrocentsSpent:
|
|
1119
|
+
var PromptResultSchema = import_zod26.z.object({
|
|
1120
|
+
text: import_zod26.z.string(),
|
|
1121
|
+
files: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1122
|
+
finishReason: import_zod26.z.string().optional(),
|
|
1123
|
+
reasoning: import_zod26.z.string().optional(),
|
|
1124
|
+
reasoningDetails: import_zod26.z.unknown().optional(),
|
|
1125
|
+
toolCalls: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1126
|
+
toolResults: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1127
|
+
warnings: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1128
|
+
sources: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1129
|
+
steps: import_zod26.z.array(import_zod26.z.unknown()),
|
|
1130
|
+
generationTimeMs: import_zod26.z.number(),
|
|
1131
|
+
prompt: import_zod26.z.string(),
|
|
1132
|
+
systemPrompt: import_zod26.z.string(),
|
|
1133
|
+
usage: import_zod26.z.object({
|
|
1134
|
+
totalTokens: import_zod26.z.number().optional(),
|
|
1135
|
+
totalMicrocentsSpent: import_zod26.z.number().optional()
|
|
1005
1136
|
})
|
|
1006
1137
|
});
|
|
1007
|
-
var EvaluationResultSchema =
|
|
1008
|
-
id:
|
|
1009
|
-
runId:
|
|
1010
|
-
timestamp:
|
|
1138
|
+
var EvaluationResultSchema = import_zod26.z.object({
|
|
1139
|
+
id: import_zod26.z.string(),
|
|
1140
|
+
runId: import_zod26.z.string(),
|
|
1141
|
+
timestamp: import_zod26.z.number(),
|
|
1011
1142
|
promptResult: PromptResultSchema,
|
|
1012
|
-
testResults:
|
|
1013
|
-
tags:
|
|
1014
|
-
feedback:
|
|
1015
|
-
score:
|
|
1016
|
-
suiteId:
|
|
1017
|
-
});
|
|
1018
|
-
var LeanEvaluationResultSchema =
|
|
1019
|
-
id:
|
|
1020
|
-
runId:
|
|
1021
|
-
timestamp:
|
|
1022
|
-
tags:
|
|
1023
|
-
scenarioId:
|
|
1024
|
-
scenarioVersion:
|
|
1025
|
-
targetId:
|
|
1026
|
-
targetVersion:
|
|
1027
|
-
suiteId:
|
|
1028
|
-
score:
|
|
1029
|
-
time:
|
|
1030
|
-
microcentsSpent:
|
|
1143
|
+
testResults: import_zod26.z.array(import_zod26.z.unknown()),
|
|
1144
|
+
tags: import_zod26.z.array(import_zod26.z.string()).optional(),
|
|
1145
|
+
feedback: import_zod26.z.string().optional(),
|
|
1146
|
+
score: import_zod26.z.number(),
|
|
1147
|
+
suiteId: import_zod26.z.string().optional()
|
|
1148
|
+
});
|
|
1149
|
+
var LeanEvaluationResultSchema = import_zod26.z.object({
|
|
1150
|
+
id: import_zod26.z.string(),
|
|
1151
|
+
runId: import_zod26.z.string(),
|
|
1152
|
+
timestamp: import_zod26.z.number(),
|
|
1153
|
+
tags: import_zod26.z.array(import_zod26.z.string()).optional(),
|
|
1154
|
+
scenarioId: import_zod26.z.string(),
|
|
1155
|
+
scenarioVersion: import_zod26.z.number().optional(),
|
|
1156
|
+
targetId: import_zod26.z.string(),
|
|
1157
|
+
targetVersion: import_zod26.z.number().optional(),
|
|
1158
|
+
suiteId: import_zod26.z.string().optional(),
|
|
1159
|
+
score: import_zod26.z.number(),
|
|
1160
|
+
time: import_zod26.z.number().optional(),
|
|
1161
|
+
microcentsSpent: import_zod26.z.number().optional()
|
|
1031
1162
|
});
|
|
1032
1163
|
|
|
1033
1164
|
// src/project/project.ts
|
|
1034
|
-
var
|
|
1165
|
+
var import_zod27 = require("zod");
|
|
1035
1166
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1036
|
-
appId:
|
|
1037
|
-
appSecret:
|
|
1167
|
+
appId: import_zod27.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
1168
|
+
appSecret: import_zod27.z.string().optional().describe("The secret of the app in Dev Center")
|
|
1038
1169
|
});
|
|
1039
1170
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1040
1171
|
id: true,
|
|
@@ -1045,10 +1176,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
|
|
|
1045
1176
|
var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
|
|
1046
1177
|
|
|
1047
1178
|
// src/template/template.ts
|
|
1048
|
-
var
|
|
1179
|
+
var import_zod28 = require("zod");
|
|
1049
1180
|
var TemplateSchema = TenantEntitySchema.extend({
|
|
1050
1181
|
/** URL to download the template from */
|
|
1051
|
-
downloadUrl:
|
|
1182
|
+
downloadUrl: import_zod28.z.url()
|
|
1052
1183
|
});
|
|
1053
1184
|
var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
1054
1185
|
id: true,
|
|
@@ -1058,89 +1189,69 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
|
1058
1189
|
});
|
|
1059
1190
|
var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
1060
1191
|
|
|
1061
|
-
// src/assertion/
|
|
1062
|
-
var
|
|
1063
|
-
|
|
1064
|
-
"
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
"
|
|
1068
|
-
]);
|
|
1069
|
-
var SkillWasCalledConfigSchema = import_zod28.z.object({
|
|
1070
|
-
/** Name of the skill that must have been called */
|
|
1071
|
-
skillName: import_zod28.z.string().min(1)
|
|
1072
|
-
});
|
|
1073
|
-
var BuildPassedConfigSchema = import_zod28.z.strictObject({
|
|
1074
|
-
/** Command to run (default: "yarn build") */
|
|
1075
|
-
command: import_zod28.z.string().optional(),
|
|
1076
|
-
/** Expected exit code (default: 0) */
|
|
1077
|
-
expectedExitCode: import_zod28.z.number().int().optional()
|
|
1078
|
-
});
|
|
1079
|
-
var LlmJudgeConfigSchema = import_zod28.z.object({
|
|
1080
|
-
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1081
|
-
prompt: import_zod28.z.string().min(1),
|
|
1082
|
-
/** Optional system prompt for the judge */
|
|
1083
|
-
systemPrompt: import_zod28.z.string().optional(),
|
|
1084
|
-
/** Minimum score to pass (0-100, default 70) */
|
|
1085
|
-
minScore: import_zod28.z.number().int().min(0).max(100).optional(),
|
|
1086
|
-
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
1087
|
-
model: import_zod28.z.string().optional(),
|
|
1088
|
-
/** Max output tokens */
|
|
1089
|
-
maxTokens: import_zod28.z.number().int().optional(),
|
|
1090
|
-
/** Temperature (0-1) */
|
|
1091
|
-
temperature: import_zod28.z.number().min(0).max(1).optional()
|
|
1092
|
-
});
|
|
1093
|
-
var AssertionConfigSchema = import_zod28.z.union([
|
|
1094
|
-
LlmJudgeConfigSchema,
|
|
1095
|
-
// requires prompt - check first
|
|
1096
|
-
SkillWasCalledConfigSchema,
|
|
1097
|
-
// requires skillName
|
|
1098
|
-
BuildPassedConfigSchema,
|
|
1099
|
-
// all optional, uses strictObject to reject unknown keys
|
|
1100
|
-
import_zod28.z.object({})
|
|
1101
|
-
// fallback empty config
|
|
1102
|
-
]);
|
|
1103
|
-
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
1104
|
-
/** The assertion type */
|
|
1105
|
-
type: AssertionTypeSchema,
|
|
1106
|
-
/** Type-specific configuration */
|
|
1107
|
-
config: AssertionConfigSchema
|
|
1108
|
-
});
|
|
1109
|
-
var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
|
|
1110
|
-
id: true,
|
|
1111
|
-
createdAt: true,
|
|
1112
|
-
updatedAt: true,
|
|
1113
|
-
deleted: true
|
|
1114
|
-
});
|
|
1115
|
-
var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
|
|
1116
|
-
function validateAssertionConfig(type, config) {
|
|
1117
|
-
switch (type) {
|
|
1118
|
-
case "skill_was_called":
|
|
1119
|
-
return SkillWasCalledConfigSchema.safeParse(config).success;
|
|
1120
|
-
case "build_passed":
|
|
1121
|
-
return BuildPassedConfigSchema.safeParse(config).success;
|
|
1122
|
-
case "llm_judge":
|
|
1123
|
-
case "custom":
|
|
1124
|
-
return LlmJudgeConfigSchema.safeParse(config).success;
|
|
1125
|
-
default:
|
|
1126
|
-
return false;
|
|
1127
|
-
}
|
|
1128
|
-
}
|
|
1129
|
-
function getSkillWasCalledConfig(assertion) {
|
|
1130
|
-
if (assertion.type !== "skill_was_called") return null;
|
|
1131
|
-
const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
|
|
1132
|
-
return result.success ? result.data : null;
|
|
1192
|
+
// src/assertion/system-assertions.ts
|
|
1193
|
+
var SYSTEM_ASSERTION_IDS = {
|
|
1194
|
+
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
1195
|
+
BUILD_PASSED: "system:build_passed"
|
|
1196
|
+
};
|
|
1197
|
+
function isSystemAssertionId(id) {
|
|
1198
|
+
return id.startsWith("system:");
|
|
1133
1199
|
}
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1200
|
+
var SYSTEM_ASSERTIONS = {
|
|
1201
|
+
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
1202
|
+
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
1203
|
+
name: "Skill Was Called",
|
|
1204
|
+
description: "Check if a specific skill was invoked during the agent run",
|
|
1205
|
+
type: "skill_was_called",
|
|
1206
|
+
parameters: [
|
|
1207
|
+
{
|
|
1208
|
+
name: "skillName",
|
|
1209
|
+
label: "Skill Name",
|
|
1210
|
+
type: "string",
|
|
1211
|
+
required: true
|
|
1212
|
+
}
|
|
1213
|
+
]
|
|
1214
|
+
},
|
|
1215
|
+
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
1216
|
+
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
1217
|
+
name: "Build Passed",
|
|
1218
|
+
description: "Run a build command and verify it exits with expected code",
|
|
1219
|
+
type: "build_passed",
|
|
1220
|
+
parameters: [
|
|
1221
|
+
{
|
|
1222
|
+
name: "command",
|
|
1223
|
+
label: "Build Command",
|
|
1224
|
+
type: "string",
|
|
1225
|
+
required: false,
|
|
1226
|
+
defaultValue: "yarn build"
|
|
1227
|
+
},
|
|
1228
|
+
{
|
|
1229
|
+
name: "expectedExitCode",
|
|
1230
|
+
label: "Expected Exit Code",
|
|
1231
|
+
type: "number",
|
|
1232
|
+
required: false,
|
|
1233
|
+
defaultValue: 0
|
|
1234
|
+
},
|
|
1235
|
+
{
|
|
1236
|
+
name: "maxBuildTime",
|
|
1237
|
+
label: "Max Build Time (ms)",
|
|
1238
|
+
type: "number",
|
|
1239
|
+
required: false
|
|
1240
|
+
},
|
|
1241
|
+
{
|
|
1242
|
+
name: "maxMemory",
|
|
1243
|
+
label: "Max Memory (MB)",
|
|
1244
|
+
type: "number",
|
|
1245
|
+
required: false
|
|
1246
|
+
}
|
|
1247
|
+
]
|
|
1248
|
+
}
|
|
1249
|
+
};
|
|
1250
|
+
function getSystemAssertions() {
|
|
1251
|
+
return Object.values(SYSTEM_ASSERTIONS);
|
|
1138
1252
|
}
|
|
1139
|
-
function
|
|
1140
|
-
|
|
1141
|
-
return null;
|
|
1142
|
-
const result = LlmJudgeConfigSchema.safeParse(assertion.config);
|
|
1143
|
-
return result.success ? result.data : null;
|
|
1253
|
+
function getSystemAssertion(id) {
|
|
1254
|
+
return SYSTEM_ASSERTIONS[id];
|
|
1144
1255
|
}
|
|
1145
1256
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1146
1257
|
0 && (module.exports = {
|
|
@@ -1150,6 +1261,8 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1150
1261
|
AllowedCommands,
|
|
1151
1262
|
ApiCallSchema,
|
|
1152
1263
|
AssertionConfigSchema,
|
|
1264
|
+
AssertionParameterSchema,
|
|
1265
|
+
AssertionParameterTypeSchema,
|
|
1153
1266
|
AssertionResultSchema,
|
|
1154
1267
|
AssertionResultStatus,
|
|
1155
1268
|
AssertionSchema,
|
|
@@ -1216,6 +1329,9 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1216
1329
|
ProjectSchema,
|
|
1217
1330
|
PromptResultSchema,
|
|
1218
1331
|
SKILL_FOLDER_NAME_REGEX,
|
|
1332
|
+
SYSTEM_ASSERTIONS,
|
|
1333
|
+
SYSTEM_ASSERTION_IDS,
|
|
1334
|
+
ScenarioAssertionLinkSchema,
|
|
1219
1335
|
SiteConfigTestSchema,
|
|
1220
1336
|
SkillMetadataSchema,
|
|
1221
1337
|
SkillSchema,
|
|
@@ -1254,6 +1370,9 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1254
1370
|
getBuildPassedConfig,
|
|
1255
1371
|
getLlmJudgeConfig,
|
|
1256
1372
|
getSkillWasCalledConfig,
|
|
1373
|
+
getSystemAssertion,
|
|
1374
|
+
getSystemAssertions,
|
|
1375
|
+
isSystemAssertionId,
|
|
1257
1376
|
isValidSkillFolderName,
|
|
1258
1377
|
parseTraceEventLine,
|
|
1259
1378
|
validateAssertionConfig
|