@wix/evalforge-types 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +479 -320
- package/build/index.js.map +4 -4
- package/build/index.mjs +471 -320
- package/build/index.mjs.map +4 -4
- package/build/types/assertion/assertion.d.ts +101 -11
- package/build/types/assertion/index.d.ts +2 -1
- package/build/types/assertion/system-assertions.d.ts +43 -0
- package/build/types/scenario/test-scenario.d.ts +12 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -26,6 +26,8 @@ __export(index_exports, {
|
|
|
26
26
|
AllowedCommands: () => AllowedCommands,
|
|
27
27
|
ApiCallSchema: () => ApiCallSchema,
|
|
28
28
|
AssertionConfigSchema: () => AssertionConfigSchema,
|
|
29
|
+
AssertionParameterSchema: () => AssertionParameterSchema,
|
|
30
|
+
AssertionParameterTypeSchema: () => AssertionParameterTypeSchema,
|
|
29
31
|
AssertionResultSchema: () => AssertionResultSchema,
|
|
30
32
|
AssertionResultStatus: () => AssertionResultStatus,
|
|
31
33
|
AssertionSchema: () => AssertionSchema,
|
|
@@ -92,6 +94,9 @@ __export(index_exports, {
|
|
|
92
94
|
ProjectSchema: () => ProjectSchema,
|
|
93
95
|
PromptResultSchema: () => PromptResultSchema,
|
|
94
96
|
SKILL_FOLDER_NAME_REGEX: () => SKILL_FOLDER_NAME_REGEX,
|
|
97
|
+
SYSTEM_ASSERTIONS: () => SYSTEM_ASSERTIONS,
|
|
98
|
+
SYSTEM_ASSERTION_IDS: () => SYSTEM_ASSERTION_IDS,
|
|
99
|
+
ScenarioAssertionLinkSchema: () => ScenarioAssertionLinkSchema,
|
|
95
100
|
SiteConfigTestSchema: () => SiteConfigTestSchema,
|
|
96
101
|
SkillMetadataSchema: () => SkillMetadataSchema,
|
|
97
102
|
SkillSchema: () => SkillSchema,
|
|
@@ -130,6 +135,9 @@ __export(index_exports, {
|
|
|
130
135
|
getBuildPassedConfig: () => getBuildPassedConfig,
|
|
131
136
|
getLlmJudgeConfig: () => getLlmJudgeConfig,
|
|
132
137
|
getSkillWasCalledConfig: () => getSkillWasCalledConfig,
|
|
138
|
+
getSystemAssertion: () => getSystemAssertion,
|
|
139
|
+
getSystemAssertions: () => getSystemAssertions,
|
|
140
|
+
isSystemAssertionId: () => isSystemAssertionId,
|
|
133
141
|
isValidSkillFolderName: () => isValidSkillFolderName,
|
|
134
142
|
parseTraceEventLine: () => parseTraceEventLine,
|
|
135
143
|
validateAssertionConfig: () => validateAssertionConfig
|
|
@@ -592,22 +600,147 @@ var EnvironmentSchema = import_zod19.z.object({
|
|
|
592
600
|
});
|
|
593
601
|
|
|
594
602
|
// src/scenario/test-scenario.ts
|
|
603
|
+
var import_zod21 = require("zod");
|
|
604
|
+
|
|
605
|
+
// src/assertion/assertion.ts
|
|
595
606
|
var import_zod20 = require("zod");
|
|
596
|
-
var
|
|
607
|
+
var AssertionTypeSchema = import_zod20.z.enum([
|
|
608
|
+
"skill_was_called",
|
|
609
|
+
"build_passed",
|
|
610
|
+
"llm_judge"
|
|
611
|
+
]);
|
|
612
|
+
var AssertionParameterTypeSchema = import_zod20.z.enum([
|
|
613
|
+
"string",
|
|
614
|
+
"number",
|
|
615
|
+
"boolean"
|
|
616
|
+
]);
|
|
617
|
+
var AssertionParameterSchema = import_zod20.z.object({
|
|
618
|
+
/** Parameter name (used as key in params object) */
|
|
619
|
+
name: import_zod20.z.string().min(1),
|
|
620
|
+
/** Display label for the parameter */
|
|
621
|
+
label: import_zod20.z.string().min(1),
|
|
622
|
+
/** Parameter type */
|
|
623
|
+
type: AssertionParameterTypeSchema,
|
|
624
|
+
/** Whether this parameter is required */
|
|
625
|
+
required: import_zod20.z.boolean(),
|
|
626
|
+
/** Default value (optional, used when not provided) */
|
|
627
|
+
defaultValue: import_zod20.z.union([import_zod20.z.string(), import_zod20.z.number(), import_zod20.z.boolean()]).optional(),
|
|
628
|
+
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
629
|
+
advanced: import_zod20.z.boolean().optional()
|
|
630
|
+
});
|
|
631
|
+
var ScenarioAssertionLinkSchema = import_zod20.z.object({
|
|
632
|
+
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
633
|
+
assertionId: import_zod20.z.string(),
|
|
634
|
+
/** Parameter values for this assertion in this scenario */
|
|
635
|
+
params: import_zod20.z.record(
|
|
636
|
+
import_zod20.z.string(),
|
|
637
|
+
import_zod20.z.union([import_zod20.z.string(), import_zod20.z.number(), import_zod20.z.boolean(), import_zod20.z.null()])
|
|
638
|
+
).optional()
|
|
639
|
+
});
|
|
640
|
+
var SkillWasCalledConfigSchema = import_zod20.z.object({
|
|
641
|
+
/** Name of the skill that must have been called */
|
|
642
|
+
skillName: import_zod20.z.string().min(1)
|
|
643
|
+
});
|
|
644
|
+
var BuildPassedConfigSchema = import_zod20.z.strictObject({
|
|
645
|
+
/** Command to run (default: "yarn build") */
|
|
646
|
+
command: import_zod20.z.string().optional(),
|
|
647
|
+
/** Expected exit code (default: 0) */
|
|
648
|
+
expectedExitCode: import_zod20.z.number().int().optional()
|
|
649
|
+
});
|
|
650
|
+
var LlmJudgeConfigSchema = import_zod20.z.object({
|
|
651
|
+
/**
|
|
652
|
+
* Prompt template with placeholders:
|
|
653
|
+
* - {{output}}: agent's final output
|
|
654
|
+
* - {{cwd}}: working directory
|
|
655
|
+
* - {{changedFiles}}: all files changed (new, modified)
|
|
656
|
+
* - {{modifiedFiles}}: only existing files that were modified
|
|
657
|
+
* - {{newFiles}}: only new files that were created
|
|
658
|
+
* - {{trace}}: step-by-step trace of tool calls
|
|
659
|
+
* - Custom parameters defined in the parameters array
|
|
660
|
+
*/
|
|
661
|
+
prompt: import_zod20.z.string().min(1),
|
|
662
|
+
/** Optional system prompt for the judge */
|
|
663
|
+
systemPrompt: import_zod20.z.string().optional(),
|
|
664
|
+
/** Minimum score to pass (0-100, default 70) */
|
|
665
|
+
minScore: import_zod20.z.number().int().min(0).max(100).optional(),
|
|
666
|
+
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
667
|
+
model: import_zod20.z.string().optional(),
|
|
668
|
+
/** Max output tokens */
|
|
669
|
+
maxTokens: import_zod20.z.number().int().optional(),
|
|
670
|
+
/** Temperature (0-1) */
|
|
671
|
+
temperature: import_zod20.z.number().min(0).max(1).optional(),
|
|
672
|
+
/** User-defined parameters for this assertion */
|
|
673
|
+
parameters: import_zod20.z.array(AssertionParameterSchema).optional()
|
|
674
|
+
});
|
|
675
|
+
var AssertionConfigSchema = import_zod20.z.union([
|
|
676
|
+
LlmJudgeConfigSchema,
|
|
677
|
+
// requires prompt - check first
|
|
678
|
+
SkillWasCalledConfigSchema,
|
|
679
|
+
// requires skillName
|
|
680
|
+
BuildPassedConfigSchema,
|
|
681
|
+
// all optional, uses strictObject to reject unknown keys
|
|
682
|
+
import_zod20.z.object({})
|
|
683
|
+
// fallback empty config
|
|
684
|
+
]);
|
|
685
|
+
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
686
|
+
/** The assertion type */
|
|
687
|
+
type: AssertionTypeSchema,
|
|
688
|
+
/** Type-specific configuration */
|
|
689
|
+
config: AssertionConfigSchema
|
|
690
|
+
});
|
|
691
|
+
var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
|
|
692
|
+
id: true,
|
|
693
|
+
createdAt: true,
|
|
694
|
+
updatedAt: true,
|
|
695
|
+
deleted: true
|
|
696
|
+
});
|
|
697
|
+
var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
|
|
698
|
+
function validateAssertionConfig(type, config) {
|
|
699
|
+
switch (type) {
|
|
700
|
+
case "skill_was_called":
|
|
701
|
+
return SkillWasCalledConfigSchema.safeParse(config).success;
|
|
702
|
+
case "build_passed":
|
|
703
|
+
return BuildPassedConfigSchema.safeParse(config).success;
|
|
704
|
+
case "llm_judge":
|
|
705
|
+
return LlmJudgeConfigSchema.safeParse(config).success;
|
|
706
|
+
default:
|
|
707
|
+
return false;
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
function getSkillWasCalledConfig(assertion) {
|
|
711
|
+
if (assertion.type !== "skill_was_called") return null;
|
|
712
|
+
const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
|
|
713
|
+
return result.success ? result.data : null;
|
|
714
|
+
}
|
|
715
|
+
function getBuildPassedConfig(assertion) {
|
|
716
|
+
if (assertion.type !== "build_passed") return null;
|
|
717
|
+
const result = BuildPassedConfigSchema.safeParse(assertion.config);
|
|
718
|
+
return result.success ? result.data : null;
|
|
719
|
+
}
|
|
720
|
+
function getLlmJudgeConfig(assertion) {
|
|
721
|
+
if (assertion.type !== "llm_judge") return null;
|
|
722
|
+
const result = LlmJudgeConfigSchema.safeParse(assertion.config);
|
|
723
|
+
return result.success ? result.data : null;
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
// src/scenario/test-scenario.ts
|
|
727
|
+
var ExpectedFileSchema = import_zod21.z.object({
|
|
597
728
|
/** Relative path where the file should be created */
|
|
598
|
-
path:
|
|
729
|
+
path: import_zod21.z.string(),
|
|
599
730
|
/** Optional expected content */
|
|
600
|
-
content:
|
|
731
|
+
content: import_zod21.z.string().optional()
|
|
601
732
|
});
|
|
602
733
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
603
734
|
/** The prompt sent to the agent to trigger the task */
|
|
604
|
-
triggerPrompt:
|
|
735
|
+
triggerPrompt: import_zod21.z.string().min(10),
|
|
605
736
|
/** ID of the template to use for this scenario (null = no template) */
|
|
606
|
-
templateId:
|
|
737
|
+
templateId: import_zod21.z.string().nullish(),
|
|
607
738
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
608
|
-
assertions:
|
|
609
|
-
/** IDs of saved assertions to evaluate (from assertions table) */
|
|
610
|
-
assertionIds:
|
|
739
|
+
assertions: import_zod21.z.array(AssertionSchema).optional(),
|
|
740
|
+
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
741
|
+
assertionIds: import_zod21.z.array(import_zod21.z.string()).optional(),
|
|
742
|
+
/** Linked assertions with per-scenario parameter values */
|
|
743
|
+
assertionLinks: import_zod21.z.array(ScenarioAssertionLinkSchema).optional()
|
|
611
744
|
});
|
|
612
745
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
613
746
|
id: true,
|
|
@@ -618,10 +751,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
618
751
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
619
752
|
|
|
620
753
|
// src/suite/test-suite.ts
|
|
621
|
-
var
|
|
754
|
+
var import_zod22 = require("zod");
|
|
622
755
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
623
756
|
/** IDs of test scenarios in this suite */
|
|
624
|
-
scenarioIds:
|
|
757
|
+
scenarioIds: import_zod22.z.array(import_zod22.z.string())
|
|
625
758
|
});
|
|
626
759
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
627
760
|
id: true,
|
|
@@ -632,21 +765,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
632
765
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
633
766
|
|
|
634
767
|
// src/evaluation/metrics.ts
|
|
635
|
-
var
|
|
636
|
-
var TokenUsageSchema =
|
|
637
|
-
prompt:
|
|
638
|
-
completion:
|
|
639
|
-
total:
|
|
640
|
-
});
|
|
641
|
-
var EvalMetricsSchema =
|
|
642
|
-
totalAssertions:
|
|
643
|
-
passed:
|
|
644
|
-
failed:
|
|
645
|
-
skipped:
|
|
646
|
-
errors:
|
|
647
|
-
passRate:
|
|
648
|
-
avgDuration:
|
|
649
|
-
totalDuration:
|
|
768
|
+
var import_zod23 = require("zod");
|
|
769
|
+
var TokenUsageSchema = import_zod23.z.object({
|
|
770
|
+
prompt: import_zod23.z.number(),
|
|
771
|
+
completion: import_zod23.z.number(),
|
|
772
|
+
total: import_zod23.z.number()
|
|
773
|
+
});
|
|
774
|
+
var EvalMetricsSchema = import_zod23.z.object({
|
|
775
|
+
totalAssertions: import_zod23.z.number(),
|
|
776
|
+
passed: import_zod23.z.number(),
|
|
777
|
+
failed: import_zod23.z.number(),
|
|
778
|
+
skipped: import_zod23.z.number(),
|
|
779
|
+
errors: import_zod23.z.number(),
|
|
780
|
+
passRate: import_zod23.z.number(),
|
|
781
|
+
avgDuration: import_zod23.z.number(),
|
|
782
|
+
totalDuration: import_zod23.z.number()
|
|
650
783
|
});
|
|
651
784
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
652
785
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -656,7 +789,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
656
789
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
657
790
|
return EvalStatus2;
|
|
658
791
|
})(EvalStatus || {});
|
|
659
|
-
var EvalStatusSchema =
|
|
792
|
+
var EvalStatusSchema = import_zod23.z.enum(EvalStatus);
|
|
660
793
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
661
794
|
LLMStepType2["COMPLETION"] = "completion";
|
|
662
795
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -664,52 +797,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
664
797
|
LLMStepType2["THINKING"] = "thinking";
|
|
665
798
|
return LLMStepType2;
|
|
666
799
|
})(LLMStepType || {});
|
|
667
|
-
var LLMTraceStepSchema =
|
|
668
|
-
id:
|
|
669
|
-
stepNumber:
|
|
670
|
-
type:
|
|
671
|
-
model:
|
|
672
|
-
provider:
|
|
673
|
-
startedAt:
|
|
674
|
-
durationMs:
|
|
800
|
+
var LLMTraceStepSchema = import_zod23.z.object({
|
|
801
|
+
id: import_zod23.z.string(),
|
|
802
|
+
stepNumber: import_zod23.z.number(),
|
|
803
|
+
type: import_zod23.z.enum(LLMStepType),
|
|
804
|
+
model: import_zod23.z.string(),
|
|
805
|
+
provider: import_zod23.z.string(),
|
|
806
|
+
startedAt: import_zod23.z.string(),
|
|
807
|
+
durationMs: import_zod23.z.number(),
|
|
675
808
|
tokenUsage: TokenUsageSchema,
|
|
676
|
-
costUsd:
|
|
677
|
-
toolName:
|
|
678
|
-
toolArguments:
|
|
679
|
-
inputPreview:
|
|
680
|
-
outputPreview:
|
|
681
|
-
success:
|
|
682
|
-
error:
|
|
683
|
-
});
|
|
684
|
-
var LLMBreakdownStatsSchema =
|
|
685
|
-
count:
|
|
686
|
-
durationMs:
|
|
687
|
-
tokens:
|
|
688
|
-
costUsd:
|
|
689
|
-
});
|
|
690
|
-
var LLMTraceSummarySchema =
|
|
691
|
-
totalSteps:
|
|
692
|
-
totalDurationMs:
|
|
809
|
+
costUsd: import_zod23.z.number(),
|
|
810
|
+
toolName: import_zod23.z.string().optional(),
|
|
811
|
+
toolArguments: import_zod23.z.string().optional(),
|
|
812
|
+
inputPreview: import_zod23.z.string().optional(),
|
|
813
|
+
outputPreview: import_zod23.z.string().optional(),
|
|
814
|
+
success: import_zod23.z.boolean(),
|
|
815
|
+
error: import_zod23.z.string().optional()
|
|
816
|
+
});
|
|
817
|
+
var LLMBreakdownStatsSchema = import_zod23.z.object({
|
|
818
|
+
count: import_zod23.z.number(),
|
|
819
|
+
durationMs: import_zod23.z.number(),
|
|
820
|
+
tokens: import_zod23.z.number(),
|
|
821
|
+
costUsd: import_zod23.z.number()
|
|
822
|
+
});
|
|
823
|
+
var LLMTraceSummarySchema = import_zod23.z.object({
|
|
824
|
+
totalSteps: import_zod23.z.number(),
|
|
825
|
+
totalDurationMs: import_zod23.z.number(),
|
|
693
826
|
totalTokens: TokenUsageSchema,
|
|
694
|
-
totalCostUsd:
|
|
695
|
-
stepTypeBreakdown:
|
|
696
|
-
modelBreakdown:
|
|
697
|
-
modelsUsed:
|
|
698
|
-
});
|
|
699
|
-
var LLMTraceSchema =
|
|
700
|
-
id:
|
|
701
|
-
steps:
|
|
827
|
+
totalCostUsd: import_zod23.z.number(),
|
|
828
|
+
stepTypeBreakdown: import_zod23.z.record(import_zod23.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
829
|
+
modelBreakdown: import_zod23.z.record(import_zod23.z.string(), LLMBreakdownStatsSchema),
|
|
830
|
+
modelsUsed: import_zod23.z.array(import_zod23.z.string())
|
|
831
|
+
});
|
|
832
|
+
var LLMTraceSchema = import_zod23.z.object({
|
|
833
|
+
id: import_zod23.z.string(),
|
|
834
|
+
steps: import_zod23.z.array(LLMTraceStepSchema),
|
|
702
835
|
summary: LLMTraceSummarySchema
|
|
703
836
|
});
|
|
704
837
|
|
|
705
838
|
// src/evaluation/eval-result.ts
|
|
706
|
-
var
|
|
839
|
+
var import_zod26 = require("zod");
|
|
707
840
|
|
|
708
841
|
// src/evaluation/eval-run.ts
|
|
709
|
-
var
|
|
842
|
+
var import_zod25 = require("zod");
|
|
710
843
|
|
|
711
844
|
// src/evaluation/live-trace.ts
|
|
712
|
-
var
|
|
845
|
+
var import_zod24 = require("zod");
|
|
713
846
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
714
847
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
715
848
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -723,37 +856,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
723
856
|
LiveTraceEventType2["USER"] = "user";
|
|
724
857
|
return LiveTraceEventType2;
|
|
725
858
|
})(LiveTraceEventType || {});
|
|
726
|
-
var LiveTraceEventSchema =
|
|
859
|
+
var LiveTraceEventSchema = import_zod24.z.object({
|
|
727
860
|
/** The evaluation run ID */
|
|
728
|
-
evalRunId:
|
|
861
|
+
evalRunId: import_zod24.z.string(),
|
|
729
862
|
/** The scenario ID being executed */
|
|
730
|
-
scenarioId:
|
|
863
|
+
scenarioId: import_zod24.z.string(),
|
|
731
864
|
/** The scenario name for display */
|
|
732
|
-
scenarioName:
|
|
865
|
+
scenarioName: import_zod24.z.string(),
|
|
733
866
|
/** The target ID (skill, agent, etc.) */
|
|
734
|
-
targetId:
|
|
867
|
+
targetId: import_zod24.z.string(),
|
|
735
868
|
/** The target name for display */
|
|
736
|
-
targetName:
|
|
869
|
+
targetName: import_zod24.z.string(),
|
|
737
870
|
/** Step number in the current scenario execution */
|
|
738
|
-
stepNumber:
|
|
871
|
+
stepNumber: import_zod24.z.number(),
|
|
739
872
|
/** Type of trace event */
|
|
740
|
-
type:
|
|
873
|
+
type: import_zod24.z.enum(LiveTraceEventType),
|
|
741
874
|
/** Tool name if this is a tool_use event */
|
|
742
|
-
toolName:
|
|
875
|
+
toolName: import_zod24.z.string().optional(),
|
|
743
876
|
/** Tool arguments preview (truncated JSON) */
|
|
744
|
-
toolArgs:
|
|
877
|
+
toolArgs: import_zod24.z.string().optional(),
|
|
745
878
|
/** Output preview (truncated text) */
|
|
746
|
-
outputPreview:
|
|
879
|
+
outputPreview: import_zod24.z.string().optional(),
|
|
747
880
|
/** File path for file operations */
|
|
748
|
-
filePath:
|
|
881
|
+
filePath: import_zod24.z.string().optional(),
|
|
749
882
|
/** Elapsed time in milliseconds for progress events */
|
|
750
|
-
elapsedMs:
|
|
883
|
+
elapsedMs: import_zod24.z.number().optional(),
|
|
751
884
|
/** Thinking/reasoning text from Claude */
|
|
752
|
-
thinking:
|
|
885
|
+
thinking: import_zod24.z.string().optional(),
|
|
753
886
|
/** Timestamp when this event occurred */
|
|
754
|
-
timestamp:
|
|
887
|
+
timestamp: import_zod24.z.string(),
|
|
755
888
|
/** Whether this is the final event for this scenario */
|
|
756
|
-
isComplete:
|
|
889
|
+
isComplete: import_zod24.z.boolean()
|
|
757
890
|
});
|
|
758
891
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
759
892
|
function parseTraceEventLine(line) {
|
|
@@ -781,14 +914,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
781
914
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
782
915
|
return TriggerType2;
|
|
783
916
|
})(TriggerType || {});
|
|
784
|
-
var TriggerMetadataSchema =
|
|
785
|
-
version:
|
|
786
|
-
resourceUpdated:
|
|
917
|
+
var TriggerMetadataSchema = import_zod25.z.object({
|
|
918
|
+
version: import_zod25.z.string().optional(),
|
|
919
|
+
resourceUpdated: import_zod25.z.array(import_zod25.z.string()).optional()
|
|
787
920
|
});
|
|
788
|
-
var TriggerSchema =
|
|
789
|
-
id:
|
|
921
|
+
var TriggerSchema = import_zod25.z.object({
|
|
922
|
+
id: import_zod25.z.string(),
|
|
790
923
|
metadata: TriggerMetadataSchema.optional(),
|
|
791
|
-
type:
|
|
924
|
+
type: import_zod25.z.enum(TriggerType)
|
|
792
925
|
});
|
|
793
926
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
794
927
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -806,28 +939,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
806
939
|
FailureSeverity2["LOW"] = "low";
|
|
807
940
|
return FailureSeverity2;
|
|
808
941
|
})(FailureSeverity || {});
|
|
809
|
-
var DiffLineTypeSchema =
|
|
810
|
-
var DiffLineSchema =
|
|
942
|
+
var DiffLineTypeSchema = import_zod25.z.enum(["added", "removed", "unchanged"]);
|
|
943
|
+
var DiffLineSchema = import_zod25.z.object({
|
|
811
944
|
type: DiffLineTypeSchema,
|
|
812
|
-
content:
|
|
813
|
-
lineNumber:
|
|
814
|
-
});
|
|
815
|
-
var DiffContentSchema =
|
|
816
|
-
path:
|
|
817
|
-
expected:
|
|
818
|
-
actual:
|
|
819
|
-
diffLines:
|
|
820
|
-
renamedFrom:
|
|
821
|
-
});
|
|
822
|
-
var CommandExecutionSchema =
|
|
823
|
-
command:
|
|
824
|
-
exitCode:
|
|
825
|
-
output:
|
|
826
|
-
duration:
|
|
827
|
-
});
|
|
828
|
-
var FileModificationSchema =
|
|
829
|
-
path:
|
|
830
|
-
action:
|
|
945
|
+
content: import_zod25.z.string(),
|
|
946
|
+
lineNumber: import_zod25.z.number()
|
|
947
|
+
});
|
|
948
|
+
var DiffContentSchema = import_zod25.z.object({
|
|
949
|
+
path: import_zod25.z.string(),
|
|
950
|
+
expected: import_zod25.z.string(),
|
|
951
|
+
actual: import_zod25.z.string(),
|
|
952
|
+
diffLines: import_zod25.z.array(DiffLineSchema),
|
|
953
|
+
renamedFrom: import_zod25.z.string().optional()
|
|
954
|
+
});
|
|
955
|
+
var CommandExecutionSchema = import_zod25.z.object({
|
|
956
|
+
command: import_zod25.z.string(),
|
|
957
|
+
exitCode: import_zod25.z.number(),
|
|
958
|
+
output: import_zod25.z.string().optional(),
|
|
959
|
+
duration: import_zod25.z.number()
|
|
960
|
+
});
|
|
961
|
+
var FileModificationSchema = import_zod25.z.object({
|
|
962
|
+
path: import_zod25.z.string(),
|
|
963
|
+
action: import_zod25.z.enum(["created", "modified", "deleted"])
|
|
831
964
|
});
|
|
832
965
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
833
966
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -835,75 +968,75 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
835
968
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
836
969
|
return TemplateFileStatus2;
|
|
837
970
|
})(TemplateFileStatus || {});
|
|
838
|
-
var TemplateFileSchema =
|
|
971
|
+
var TemplateFileSchema = import_zod25.z.object({
|
|
839
972
|
/** Relative path within the template */
|
|
840
|
-
path:
|
|
973
|
+
path: import_zod25.z.string(),
|
|
841
974
|
/** Full file content after execution */
|
|
842
|
-
content:
|
|
975
|
+
content: import_zod25.z.string(),
|
|
843
976
|
/** File status (new, modified, unchanged) */
|
|
844
|
-
status:
|
|
845
|
-
});
|
|
846
|
-
var ApiCallSchema =
|
|
847
|
-
endpoint:
|
|
848
|
-
tokensUsed:
|
|
849
|
-
duration:
|
|
850
|
-
});
|
|
851
|
-
var ExecutionTraceSchema =
|
|
852
|
-
commands:
|
|
853
|
-
filesModified:
|
|
854
|
-
apiCalls:
|
|
855
|
-
totalDuration:
|
|
856
|
-
});
|
|
857
|
-
var FailureAnalysisSchema =
|
|
858
|
-
category:
|
|
859
|
-
severity:
|
|
860
|
-
summary:
|
|
861
|
-
details:
|
|
862
|
-
rootCause:
|
|
863
|
-
suggestedFix:
|
|
864
|
-
relatedAssertions:
|
|
865
|
-
codeSnippet:
|
|
866
|
-
similarIssues:
|
|
867
|
-
patternId:
|
|
977
|
+
status: import_zod25.z.enum(["new", "modified", "unchanged"])
|
|
978
|
+
});
|
|
979
|
+
var ApiCallSchema = import_zod25.z.object({
|
|
980
|
+
endpoint: import_zod25.z.string(),
|
|
981
|
+
tokensUsed: import_zod25.z.number(),
|
|
982
|
+
duration: import_zod25.z.number()
|
|
983
|
+
});
|
|
984
|
+
var ExecutionTraceSchema = import_zod25.z.object({
|
|
985
|
+
commands: import_zod25.z.array(CommandExecutionSchema),
|
|
986
|
+
filesModified: import_zod25.z.array(FileModificationSchema),
|
|
987
|
+
apiCalls: import_zod25.z.array(ApiCallSchema),
|
|
988
|
+
totalDuration: import_zod25.z.number()
|
|
989
|
+
});
|
|
990
|
+
var FailureAnalysisSchema = import_zod25.z.object({
|
|
991
|
+
category: import_zod25.z.enum(FailureCategory),
|
|
992
|
+
severity: import_zod25.z.enum(FailureSeverity),
|
|
993
|
+
summary: import_zod25.z.string(),
|
|
994
|
+
details: import_zod25.z.string(),
|
|
995
|
+
rootCause: import_zod25.z.string(),
|
|
996
|
+
suggestedFix: import_zod25.z.string(),
|
|
997
|
+
relatedAssertions: import_zod25.z.array(import_zod25.z.string()),
|
|
998
|
+
codeSnippet: import_zod25.z.string().optional(),
|
|
999
|
+
similarIssues: import_zod25.z.array(import_zod25.z.string()).optional(),
|
|
1000
|
+
patternId: import_zod25.z.string().optional(),
|
|
868
1001
|
// Extended fields for detailed debugging
|
|
869
1002
|
diff: DiffContentSchema.optional(),
|
|
870
1003
|
executionTrace: ExecutionTraceSchema.optional()
|
|
871
1004
|
});
|
|
872
1005
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
873
1006
|
/** Agent ID for this run */
|
|
874
|
-
agentId:
|
|
1007
|
+
agentId: import_zod25.z.string().optional(),
|
|
875
1008
|
/** Skills group ID for this run */
|
|
876
|
-
skillsGroupId:
|
|
1009
|
+
skillsGroupId: import_zod25.z.string().optional(),
|
|
877
1010
|
/** Scenario IDs to run */
|
|
878
|
-
scenarioIds:
|
|
1011
|
+
scenarioIds: import_zod25.z.array(import_zod25.z.string()),
|
|
879
1012
|
/** Current status */
|
|
880
1013
|
status: EvalStatusSchema,
|
|
881
1014
|
/** Progress percentage (0-100) */
|
|
882
|
-
progress:
|
|
1015
|
+
progress: import_zod25.z.number(),
|
|
883
1016
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
884
|
-
results:
|
|
1017
|
+
results: import_zod25.z.array(import_zod25.z.lazy(() => EvalRunResultSchema)),
|
|
885
1018
|
/** Aggregated metrics across all results */
|
|
886
1019
|
aggregateMetrics: EvalMetricsSchema,
|
|
887
1020
|
/** Failure analyses */
|
|
888
|
-
failureAnalyses:
|
|
1021
|
+
failureAnalyses: import_zod25.z.array(FailureAnalysisSchema).optional(),
|
|
889
1022
|
/** Aggregated LLM trace summary */
|
|
890
1023
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
891
1024
|
/** What triggered this run */
|
|
892
1025
|
trigger: TriggerSchema.optional(),
|
|
893
1026
|
/** When the run started (set when evaluation is triggered) */
|
|
894
|
-
startedAt:
|
|
1027
|
+
startedAt: import_zod25.z.string().optional(),
|
|
895
1028
|
/** When the run completed */
|
|
896
|
-
completedAt:
|
|
1029
|
+
completedAt: import_zod25.z.string().optional(),
|
|
897
1030
|
/** Live trace events captured during execution (for playback on results page) */
|
|
898
|
-
liveTraceEvents:
|
|
1031
|
+
liveTraceEvents: import_zod25.z.array(LiveTraceEventSchema).optional(),
|
|
899
1032
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
900
|
-
jobId:
|
|
1033
|
+
jobId: import_zod25.z.string().optional(),
|
|
901
1034
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
902
|
-
jobStatus:
|
|
1035
|
+
jobStatus: import_zod25.z.string().optional(),
|
|
903
1036
|
/** Remote job error message if the job failed */
|
|
904
|
-
jobError:
|
|
1037
|
+
jobError: import_zod25.z.string().optional(),
|
|
905
1038
|
/** Timestamp of the last job status check */
|
|
906
|
-
jobStatusCheckedAt:
|
|
1039
|
+
jobStatusCheckedAt: import_zod25.z.string().optional()
|
|
907
1040
|
});
|
|
908
1041
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
909
1042
|
id: true,
|
|
@@ -916,28 +1049,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
916
1049
|
startedAt: true,
|
|
917
1050
|
completedAt: true
|
|
918
1051
|
});
|
|
919
|
-
var EvaluationProgressSchema =
|
|
920
|
-
runId:
|
|
921
|
-
targetId:
|
|
922
|
-
totalScenarios:
|
|
923
|
-
completedScenarios:
|
|
924
|
-
scenarioProgress:
|
|
925
|
-
|
|
926
|
-
scenarioId:
|
|
927
|
-
currentStep:
|
|
928
|
-
error:
|
|
1052
|
+
var EvaluationProgressSchema = import_zod25.z.object({
|
|
1053
|
+
runId: import_zod25.z.string(),
|
|
1054
|
+
targetId: import_zod25.z.string(),
|
|
1055
|
+
totalScenarios: import_zod25.z.number(),
|
|
1056
|
+
completedScenarios: import_zod25.z.number(),
|
|
1057
|
+
scenarioProgress: import_zod25.z.array(
|
|
1058
|
+
import_zod25.z.object({
|
|
1059
|
+
scenarioId: import_zod25.z.string(),
|
|
1060
|
+
currentStep: import_zod25.z.string(),
|
|
1061
|
+
error: import_zod25.z.string().optional()
|
|
929
1062
|
})
|
|
930
1063
|
),
|
|
931
|
-
createdAt:
|
|
1064
|
+
createdAt: import_zod25.z.number()
|
|
932
1065
|
});
|
|
933
|
-
var EvaluationLogSchema =
|
|
934
|
-
runId:
|
|
935
|
-
scenarioId:
|
|
936
|
-
log:
|
|
937
|
-
level:
|
|
938
|
-
message:
|
|
939
|
-
args:
|
|
940
|
-
error:
|
|
1066
|
+
var EvaluationLogSchema = import_zod25.z.object({
|
|
1067
|
+
runId: import_zod25.z.string(),
|
|
1068
|
+
scenarioId: import_zod25.z.string(),
|
|
1069
|
+
log: import_zod25.z.object({
|
|
1070
|
+
level: import_zod25.z.enum(["info", "error", "debug"]),
|
|
1071
|
+
message: import_zod25.z.string().optional(),
|
|
1072
|
+
args: import_zod25.z.array(import_zod25.z.any()).optional(),
|
|
1073
|
+
error: import_zod25.z.string().optional()
|
|
941
1074
|
})
|
|
942
1075
|
});
|
|
943
1076
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -950,91 +1083,91 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
950
1083
|
AssertionResultStatus2["ERROR"] = "error";
|
|
951
1084
|
return AssertionResultStatus2;
|
|
952
1085
|
})(AssertionResultStatus || {});
|
|
953
|
-
var AssertionResultSchema =
|
|
954
|
-
id:
|
|
955
|
-
assertionId:
|
|
956
|
-
assertionType:
|
|
957
|
-
assertionName:
|
|
958
|
-
status:
|
|
959
|
-
message:
|
|
960
|
-
expected:
|
|
961
|
-
actual:
|
|
962
|
-
duration:
|
|
963
|
-
details:
|
|
964
|
-
llmTraceSteps:
|
|
965
|
-
});
|
|
966
|
-
var EvalRunResultSchema =
|
|
967
|
-
id:
|
|
968
|
-
targetId:
|
|
969
|
-
targetName:
|
|
970
|
-
scenarioId:
|
|
971
|
-
scenarioName:
|
|
1086
|
+
var AssertionResultSchema = import_zod26.z.object({
|
|
1087
|
+
id: import_zod26.z.string(),
|
|
1088
|
+
assertionId: import_zod26.z.string(),
|
|
1089
|
+
assertionType: import_zod26.z.string(),
|
|
1090
|
+
assertionName: import_zod26.z.string(),
|
|
1091
|
+
status: import_zod26.z.enum(AssertionResultStatus),
|
|
1092
|
+
message: import_zod26.z.string().optional(),
|
|
1093
|
+
expected: import_zod26.z.string().optional(),
|
|
1094
|
+
actual: import_zod26.z.string().optional(),
|
|
1095
|
+
duration: import_zod26.z.number().optional(),
|
|
1096
|
+
details: import_zod26.z.record(import_zod26.z.string(), import_zod26.z.unknown()).optional(),
|
|
1097
|
+
llmTraceSteps: import_zod26.z.array(LLMTraceStepSchema).optional()
|
|
1098
|
+
});
|
|
1099
|
+
var EvalRunResultSchema = import_zod26.z.object({
|
|
1100
|
+
id: import_zod26.z.string(),
|
|
1101
|
+
targetId: import_zod26.z.string(),
|
|
1102
|
+
targetName: import_zod26.z.string().optional(),
|
|
1103
|
+
scenarioId: import_zod26.z.string(),
|
|
1104
|
+
scenarioName: import_zod26.z.string(),
|
|
972
1105
|
modelConfig: ModelConfigSchema.optional(),
|
|
973
|
-
assertionResults:
|
|
1106
|
+
assertionResults: import_zod26.z.array(AssertionResultSchema),
|
|
974
1107
|
metrics: EvalMetricsSchema.optional(),
|
|
975
|
-
passed:
|
|
976
|
-
failed:
|
|
977
|
-
passRate:
|
|
978
|
-
duration:
|
|
979
|
-
outputText:
|
|
980
|
-
files:
|
|
981
|
-
fileDiffs:
|
|
1108
|
+
passed: import_zod26.z.number(),
|
|
1109
|
+
failed: import_zod26.z.number(),
|
|
1110
|
+
passRate: import_zod26.z.number(),
|
|
1111
|
+
duration: import_zod26.z.number(),
|
|
1112
|
+
outputText: import_zod26.z.string().optional(),
|
|
1113
|
+
files: import_zod26.z.array(ExpectedFileSchema).optional(),
|
|
1114
|
+
fileDiffs: import_zod26.z.array(DiffContentSchema).optional(),
|
|
982
1115
|
/** Full template files after execution with status indicators */
|
|
983
|
-
templateFiles:
|
|
984
|
-
startedAt:
|
|
985
|
-
completedAt:
|
|
1116
|
+
templateFiles: import_zod26.z.array(TemplateFileSchema).optional(),
|
|
1117
|
+
startedAt: import_zod26.z.string().optional(),
|
|
1118
|
+
completedAt: import_zod26.z.string().optional(),
|
|
986
1119
|
llmTrace: LLMTraceSchema.optional()
|
|
987
1120
|
});
|
|
988
|
-
var PromptResultSchema =
|
|
989
|
-
text:
|
|
990
|
-
files:
|
|
991
|
-
finishReason:
|
|
992
|
-
reasoning:
|
|
993
|
-
reasoningDetails:
|
|
994
|
-
toolCalls:
|
|
995
|
-
toolResults:
|
|
996
|
-
warnings:
|
|
997
|
-
sources:
|
|
998
|
-
steps:
|
|
999
|
-
generationTimeMs:
|
|
1000
|
-
prompt:
|
|
1001
|
-
systemPrompt:
|
|
1002
|
-
usage:
|
|
1003
|
-
totalTokens:
|
|
1004
|
-
totalMicrocentsSpent:
|
|
1121
|
+
var PromptResultSchema = import_zod26.z.object({
|
|
1122
|
+
text: import_zod26.z.string(),
|
|
1123
|
+
files: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1124
|
+
finishReason: import_zod26.z.string().optional(),
|
|
1125
|
+
reasoning: import_zod26.z.string().optional(),
|
|
1126
|
+
reasoningDetails: import_zod26.z.unknown().optional(),
|
|
1127
|
+
toolCalls: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1128
|
+
toolResults: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1129
|
+
warnings: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1130
|
+
sources: import_zod26.z.array(import_zod26.z.unknown()).optional(),
|
|
1131
|
+
steps: import_zod26.z.array(import_zod26.z.unknown()),
|
|
1132
|
+
generationTimeMs: import_zod26.z.number(),
|
|
1133
|
+
prompt: import_zod26.z.string(),
|
|
1134
|
+
systemPrompt: import_zod26.z.string(),
|
|
1135
|
+
usage: import_zod26.z.object({
|
|
1136
|
+
totalTokens: import_zod26.z.number().optional(),
|
|
1137
|
+
totalMicrocentsSpent: import_zod26.z.number().optional()
|
|
1005
1138
|
})
|
|
1006
1139
|
});
|
|
1007
|
-
var EvaluationResultSchema =
|
|
1008
|
-
id:
|
|
1009
|
-
runId:
|
|
1010
|
-
timestamp:
|
|
1140
|
+
var EvaluationResultSchema = import_zod26.z.object({
|
|
1141
|
+
id: import_zod26.z.string(),
|
|
1142
|
+
runId: import_zod26.z.string(),
|
|
1143
|
+
timestamp: import_zod26.z.number(),
|
|
1011
1144
|
promptResult: PromptResultSchema,
|
|
1012
|
-
testResults:
|
|
1013
|
-
tags:
|
|
1014
|
-
feedback:
|
|
1015
|
-
score:
|
|
1016
|
-
suiteId:
|
|
1017
|
-
});
|
|
1018
|
-
var LeanEvaluationResultSchema =
|
|
1019
|
-
id:
|
|
1020
|
-
runId:
|
|
1021
|
-
timestamp:
|
|
1022
|
-
tags:
|
|
1023
|
-
scenarioId:
|
|
1024
|
-
scenarioVersion:
|
|
1025
|
-
targetId:
|
|
1026
|
-
targetVersion:
|
|
1027
|
-
suiteId:
|
|
1028
|
-
score:
|
|
1029
|
-
time:
|
|
1030
|
-
microcentsSpent:
|
|
1145
|
+
testResults: import_zod26.z.array(import_zod26.z.unknown()),
|
|
1146
|
+
tags: import_zod26.z.array(import_zod26.z.string()).optional(),
|
|
1147
|
+
feedback: import_zod26.z.string().optional(),
|
|
1148
|
+
score: import_zod26.z.number(),
|
|
1149
|
+
suiteId: import_zod26.z.string().optional()
|
|
1150
|
+
});
|
|
1151
|
+
var LeanEvaluationResultSchema = import_zod26.z.object({
|
|
1152
|
+
id: import_zod26.z.string(),
|
|
1153
|
+
runId: import_zod26.z.string(),
|
|
1154
|
+
timestamp: import_zod26.z.number(),
|
|
1155
|
+
tags: import_zod26.z.array(import_zod26.z.string()).optional(),
|
|
1156
|
+
scenarioId: import_zod26.z.string(),
|
|
1157
|
+
scenarioVersion: import_zod26.z.number().optional(),
|
|
1158
|
+
targetId: import_zod26.z.string(),
|
|
1159
|
+
targetVersion: import_zod26.z.number().optional(),
|
|
1160
|
+
suiteId: import_zod26.z.string().optional(),
|
|
1161
|
+
score: import_zod26.z.number(),
|
|
1162
|
+
time: import_zod26.z.number().optional(),
|
|
1163
|
+
microcentsSpent: import_zod26.z.number().optional()
|
|
1031
1164
|
});
|
|
1032
1165
|
|
|
1033
1166
|
// src/project/project.ts
|
|
1034
|
-
var
|
|
1167
|
+
var import_zod27 = require("zod");
|
|
1035
1168
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1036
|
-
appId:
|
|
1037
|
-
appSecret:
|
|
1169
|
+
appId: import_zod27.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
1170
|
+
appSecret: import_zod27.z.string().optional().describe("The secret of the app in Dev Center")
|
|
1038
1171
|
});
|
|
1039
1172
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1040
1173
|
id: true,
|
|
@@ -1045,10 +1178,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
|
|
|
1045
1178
|
var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
|
|
1046
1179
|
|
|
1047
1180
|
// src/template/template.ts
|
|
1048
|
-
var
|
|
1181
|
+
var import_zod28 = require("zod");
|
|
1049
1182
|
var TemplateSchema = TenantEntitySchema.extend({
|
|
1050
1183
|
/** URL to download the template from */
|
|
1051
|
-
downloadUrl:
|
|
1184
|
+
downloadUrl: import_zod28.z.url()
|
|
1052
1185
|
});
|
|
1053
1186
|
var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
1054
1187
|
id: true,
|
|
@@ -1058,89 +1191,107 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
|
1058
1191
|
});
|
|
1059
1192
|
var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
1060
1193
|
|
|
1061
|
-
// src/assertion/
|
|
1062
|
-
var
|
|
1063
|
-
|
|
1064
|
-
"
|
|
1065
|
-
"
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
var SkillWasCalledConfigSchema = import_zod28.z.object({
|
|
1070
|
-
/** Name of the skill that must have been called */
|
|
1071
|
-
skillName: import_zod28.z.string().min(1)
|
|
1072
|
-
});
|
|
1073
|
-
var BuildPassedConfigSchema = import_zod28.z.strictObject({
|
|
1074
|
-
/** Command to run (default: "yarn build") */
|
|
1075
|
-
command: import_zod28.z.string().optional(),
|
|
1076
|
-
/** Expected exit code (default: 0) */
|
|
1077
|
-
expectedExitCode: import_zod28.z.number().int().optional()
|
|
1078
|
-
});
|
|
1079
|
-
var LlmJudgeConfigSchema = import_zod28.z.object({
|
|
1080
|
-
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1081
|
-
prompt: import_zod28.z.string().min(1),
|
|
1082
|
-
/** Optional system prompt for the judge */
|
|
1083
|
-
systemPrompt: import_zod28.z.string().optional(),
|
|
1084
|
-
/** Minimum score to pass (0-100, default 70) */
|
|
1085
|
-
minScore: import_zod28.z.number().int().min(0).max(100).optional(),
|
|
1086
|
-
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
1087
|
-
model: import_zod28.z.string().optional(),
|
|
1088
|
-
/** Max output tokens */
|
|
1089
|
-
maxTokens: import_zod28.z.number().int().optional(),
|
|
1090
|
-
/** Temperature (0-1) */
|
|
1091
|
-
temperature: import_zod28.z.number().min(0).max(1).optional()
|
|
1092
|
-
});
|
|
1093
|
-
var AssertionConfigSchema = import_zod28.z.union([
|
|
1094
|
-
LlmJudgeConfigSchema,
|
|
1095
|
-
// requires prompt - check first
|
|
1096
|
-
SkillWasCalledConfigSchema,
|
|
1097
|
-
// requires skillName
|
|
1098
|
-
BuildPassedConfigSchema,
|
|
1099
|
-
// all optional, uses strictObject to reject unknown keys
|
|
1100
|
-
import_zod28.z.object({})
|
|
1101
|
-
// fallback empty config
|
|
1102
|
-
]);
|
|
1103
|
-
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
1104
|
-
/** The assertion type */
|
|
1105
|
-
type: AssertionTypeSchema,
|
|
1106
|
-
/** Type-specific configuration */
|
|
1107
|
-
config: AssertionConfigSchema
|
|
1108
|
-
});
|
|
1109
|
-
var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
|
|
1110
|
-
id: true,
|
|
1111
|
-
createdAt: true,
|
|
1112
|
-
updatedAt: true,
|
|
1113
|
-
deleted: true
|
|
1114
|
-
});
|
|
1115
|
-
var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
|
|
1116
|
-
function validateAssertionConfig(type, config) {
|
|
1117
|
-
switch (type) {
|
|
1118
|
-
case "skill_was_called":
|
|
1119
|
-
return SkillWasCalledConfigSchema.safeParse(config).success;
|
|
1120
|
-
case "build_passed":
|
|
1121
|
-
return BuildPassedConfigSchema.safeParse(config).success;
|
|
1122
|
-
case "llm_judge":
|
|
1123
|
-
case "custom":
|
|
1124
|
-
return LlmJudgeConfigSchema.safeParse(config).success;
|
|
1125
|
-
default:
|
|
1126
|
-
return false;
|
|
1127
|
-
}
|
|
1128
|
-
}
|
|
1129
|
-
function getSkillWasCalledConfig(assertion) {
|
|
1130
|
-
if (assertion.type !== "skill_was_called") return null;
|
|
1131
|
-
const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
|
|
1132
|
-
return result.success ? result.data : null;
|
|
1194
|
+
// src/assertion/system-assertions.ts
|
|
1195
|
+
var SYSTEM_ASSERTION_IDS = {
|
|
1196
|
+
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
1197
|
+
BUILD_PASSED: "system:build_passed",
|
|
1198
|
+
LLM_JUDGE: "system:llm_judge"
|
|
1199
|
+
};
|
|
1200
|
+
function isSystemAssertionId(id) {
|
|
1201
|
+
return id.startsWith("system:");
|
|
1133
1202
|
}
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1203
|
+
var SYSTEM_ASSERTIONS = {
|
|
1204
|
+
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
1205
|
+
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
1206
|
+
name: "Skill Was Called",
|
|
1207
|
+
description: "Check if a specific skill was invoked during the agent run",
|
|
1208
|
+
type: "skill_was_called",
|
|
1209
|
+
parameters: [
|
|
1210
|
+
{
|
|
1211
|
+
name: "skillName",
|
|
1212
|
+
label: "Skill Name",
|
|
1213
|
+
type: "string",
|
|
1214
|
+
required: true
|
|
1215
|
+
}
|
|
1216
|
+
]
|
|
1217
|
+
},
|
|
1218
|
+
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
1219
|
+
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
1220
|
+
name: "Build Passed",
|
|
1221
|
+
description: "Run a build command and verify it exits with expected code",
|
|
1222
|
+
type: "build_passed",
|
|
1223
|
+
parameters: [
|
|
1224
|
+
{
|
|
1225
|
+
name: "command",
|
|
1226
|
+
label: "Build Command",
|
|
1227
|
+
type: "string",
|
|
1228
|
+
required: false,
|
|
1229
|
+
defaultValue: "yarn build"
|
|
1230
|
+
},
|
|
1231
|
+
{
|
|
1232
|
+
name: "expectedExitCode",
|
|
1233
|
+
label: "Expected Exit Code",
|
|
1234
|
+
type: "number",
|
|
1235
|
+
required: false,
|
|
1236
|
+
defaultValue: 0
|
|
1237
|
+
},
|
|
1238
|
+
{
|
|
1239
|
+
name: "maxBuildTime",
|
|
1240
|
+
label: "Max Build Time (ms)",
|
|
1241
|
+
type: "number",
|
|
1242
|
+
required: false,
|
|
1243
|
+
advanced: true
|
|
1244
|
+
},
|
|
1245
|
+
{
|
|
1246
|
+
name: "maxMemory",
|
|
1247
|
+
label: "Max Memory (MB)",
|
|
1248
|
+
type: "number",
|
|
1249
|
+
required: false,
|
|
1250
|
+
advanced: true
|
|
1251
|
+
}
|
|
1252
|
+
]
|
|
1253
|
+
},
|
|
1254
|
+
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
1255
|
+
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
1256
|
+
name: "LLM Judge",
|
|
1257
|
+
description: "LLM evaluates the output and assigns a score (0-100)",
|
|
1258
|
+
type: "llm_judge",
|
|
1259
|
+
parameters: [
|
|
1260
|
+
{
|
|
1261
|
+
name: "prompt",
|
|
1262
|
+
label: "Judge Prompt",
|
|
1263
|
+
type: "string",
|
|
1264
|
+
required: true,
|
|
1265
|
+
defaultValue: "Verify the output meets the acceptance criteria."
|
|
1266
|
+
},
|
|
1267
|
+
{
|
|
1268
|
+
name: "systemPrompt",
|
|
1269
|
+
label: "System Prompt (optional)",
|
|
1270
|
+
type: "string",
|
|
1271
|
+
required: false,
|
|
1272
|
+
defaultValue: `You are judging a scenario run. Use these values:
|
|
1273
|
+
- {{output}}: the agent's final output
|
|
1274
|
+
- {{cwd}}: working directory
|
|
1275
|
+
- {{changedFiles}}: list of files changed (or "No files were changed")
|
|
1276
|
+
- {{trace}}: step-by-step trace (tool calls, completions) to check e.g. which tools were called and how many times
|
|
1277
|
+
|
|
1278
|
+
Judge how well the output meets the acceptance criteria stated in the user prompt.`
|
|
1279
|
+
},
|
|
1280
|
+
{
|
|
1281
|
+
name: "minScore",
|
|
1282
|
+
label: "Minimum Score (0-100)",
|
|
1283
|
+
type: "number",
|
|
1284
|
+
required: false,
|
|
1285
|
+
defaultValue: 70
|
|
1286
|
+
}
|
|
1287
|
+
]
|
|
1288
|
+
}
|
|
1289
|
+
};
|
|
1290
|
+
function getSystemAssertions() {
|
|
1291
|
+
return Object.values(SYSTEM_ASSERTIONS);
|
|
1138
1292
|
}
|
|
1139
|
-
function
|
|
1140
|
-
|
|
1141
|
-
return null;
|
|
1142
|
-
const result = LlmJudgeConfigSchema.safeParse(assertion.config);
|
|
1143
|
-
return result.success ? result.data : null;
|
|
1293
|
+
function getSystemAssertion(id) {
|
|
1294
|
+
return SYSTEM_ASSERTIONS[id];
|
|
1144
1295
|
}
|
|
1145
1296
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1146
1297
|
0 && (module.exports = {
|
|
@@ -1150,6 +1301,8 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1150
1301
|
AllowedCommands,
|
|
1151
1302
|
ApiCallSchema,
|
|
1152
1303
|
AssertionConfigSchema,
|
|
1304
|
+
AssertionParameterSchema,
|
|
1305
|
+
AssertionParameterTypeSchema,
|
|
1153
1306
|
AssertionResultSchema,
|
|
1154
1307
|
AssertionResultStatus,
|
|
1155
1308
|
AssertionSchema,
|
|
@@ -1216,6 +1369,9 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1216
1369
|
ProjectSchema,
|
|
1217
1370
|
PromptResultSchema,
|
|
1218
1371
|
SKILL_FOLDER_NAME_REGEX,
|
|
1372
|
+
SYSTEM_ASSERTIONS,
|
|
1373
|
+
SYSTEM_ASSERTION_IDS,
|
|
1374
|
+
ScenarioAssertionLinkSchema,
|
|
1219
1375
|
SiteConfigTestSchema,
|
|
1220
1376
|
SkillMetadataSchema,
|
|
1221
1377
|
SkillSchema,
|
|
@@ -1254,6 +1410,9 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1254
1410
|
getBuildPassedConfig,
|
|
1255
1411
|
getLlmJudgeConfig,
|
|
1256
1412
|
getSkillWasCalledConfig,
|
|
1413
|
+
getSystemAssertion,
|
|
1414
|
+
getSystemAssertions,
|
|
1415
|
+
isSystemAssertionId,
|
|
1257
1416
|
isValidSkillFolderName,
|
|
1258
1417
|
parseTraceEventLine,
|
|
1259
1418
|
validateAssertionConfig
|