@wix/evalforge-types 0.71.0 → 0.73.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/index.js +715 -620
- package/build/index.js.map +4 -4
- package/build/index.mjs +707 -619
- package/build/index.mjs.map +4 -4
- package/build/types/agent/adapter.d.ts +3 -1
- package/build/types/assertion/assertion.d.ts +26 -6
- package/build/types/assertion/build-passed-command.d.ts +25 -0
- package/build/types/assertion/index.d.ts +1 -0
- package/build/types/scenario/test-scenario.d.ts +64 -3
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -584,11 +584,42 @@ var EnvironmentSchema = z21.object({
|
|
|
584
584
|
});
|
|
585
585
|
|
|
586
586
|
// src/scenario/test-scenario.ts
|
|
587
|
-
import { z as
|
|
587
|
+
import { z as z24 } from "zod";
|
|
588
588
|
|
|
589
589
|
// src/assertion/assertion.ts
|
|
590
|
+
import { z as z23 } from "zod";
|
|
591
|
+
|
|
592
|
+
// src/assertion/build-passed-command.ts
|
|
590
593
|
import { z as z22 } from "zod";
|
|
591
|
-
var
|
|
594
|
+
var ALLOWED_BUILD_COMMANDS = [
|
|
595
|
+
"yarn build",
|
|
596
|
+
"npm run build",
|
|
597
|
+
"pnpm run build",
|
|
598
|
+
"pnpm build"
|
|
599
|
+
];
|
|
600
|
+
var DEFAULT_BUILD_PASSED_COMMAND = "yarn build";
|
|
601
|
+
var BUILD_COMMAND_ARGV = {
|
|
602
|
+
"yarn build": ["yarn", "build"],
|
|
603
|
+
"npm run build": ["npm", "run", "build"],
|
|
604
|
+
"pnpm run build": ["pnpm", "run", "build"],
|
|
605
|
+
"pnpm build": ["pnpm", "build"]
|
|
606
|
+
};
|
|
607
|
+
function isAllowedBuildCommandString(command) {
|
|
608
|
+
const trimmed = command.trim();
|
|
609
|
+
return ALLOWED_BUILD_COMMANDS.includes(trimmed);
|
|
610
|
+
}
|
|
611
|
+
function parseBuildCommandToArgv(command) {
|
|
612
|
+
const trimmed = command.trim();
|
|
613
|
+
if (!(trimmed in BUILD_COMMAND_ARGV)) {
|
|
614
|
+
return null;
|
|
615
|
+
}
|
|
616
|
+
return BUILD_COMMAND_ARGV[trimmed];
|
|
617
|
+
}
|
|
618
|
+
var enumTuple = ALLOWED_BUILD_COMMANDS;
|
|
619
|
+
var BuildPassedCommandStringSchema = z22.enum(enumTuple);
|
|
620
|
+
|
|
621
|
+
// src/assertion/assertion.ts
|
|
622
|
+
var AssertionTypeSchema = z23.enum([
|
|
592
623
|
"skill_was_called",
|
|
593
624
|
"tool_called_with_param",
|
|
594
625
|
"build_passed",
|
|
@@ -597,61 +628,61 @@ var AssertionTypeSchema = z22.enum([
|
|
|
597
628
|
"llm_judge",
|
|
598
629
|
"api_call"
|
|
599
630
|
]);
|
|
600
|
-
var AssertionParameterTypeSchema =
|
|
631
|
+
var AssertionParameterTypeSchema = z23.enum([
|
|
601
632
|
"string",
|
|
602
633
|
"number",
|
|
603
634
|
"boolean"
|
|
604
635
|
]);
|
|
605
|
-
var AssertionParameterSchema =
|
|
636
|
+
var AssertionParameterSchema = z23.object({
|
|
606
637
|
/** Parameter name (used as key in params object) */
|
|
607
|
-
name:
|
|
638
|
+
name: z23.string().min(1),
|
|
608
639
|
/** Display label for the parameter */
|
|
609
|
-
label:
|
|
640
|
+
label: z23.string().min(1),
|
|
610
641
|
/** Parameter type */
|
|
611
642
|
type: AssertionParameterTypeSchema,
|
|
612
643
|
/** Whether this parameter is required */
|
|
613
|
-
required:
|
|
644
|
+
required: z23.boolean(),
|
|
614
645
|
/** Default value (optional, used when not provided) */
|
|
615
|
-
defaultValue:
|
|
646
|
+
defaultValue: z23.union([z23.string(), z23.number(), z23.boolean()]).optional(),
|
|
616
647
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
617
|
-
advanced:
|
|
648
|
+
advanced: z23.boolean().optional()
|
|
618
649
|
});
|
|
619
|
-
var ScenarioAssertionLinkSchema =
|
|
650
|
+
var ScenarioAssertionLinkSchema = z23.object({
|
|
620
651
|
/** ID of the system assertion (e.g., 'system:skill_was_called') */
|
|
621
|
-
assertionId:
|
|
652
|
+
assertionId: z23.string(),
|
|
622
653
|
/** Parameter values for this assertion in this scenario */
|
|
623
|
-
params:
|
|
624
|
-
|
|
625
|
-
|
|
654
|
+
params: z23.record(
|
|
655
|
+
z23.string(),
|
|
656
|
+
z23.union([z23.string(), z23.number(), z23.boolean(), z23.null()])
|
|
626
657
|
).optional()
|
|
627
658
|
});
|
|
628
|
-
var SkillWasCalledConfigSchema =
|
|
659
|
+
var SkillWasCalledConfigSchema = z23.object({
|
|
629
660
|
/** Names of the skills that must have been called */
|
|
630
|
-
skillNames:
|
|
661
|
+
skillNames: z23.array(z23.string().min(1)).min(1)
|
|
631
662
|
});
|
|
632
|
-
var CostConfigSchema =
|
|
663
|
+
var CostConfigSchema = z23.strictObject({
|
|
633
664
|
/** Maximum allowed cost in USD */
|
|
634
|
-
maxCostUsd:
|
|
665
|
+
maxCostUsd: z23.number().positive()
|
|
635
666
|
});
|
|
636
|
-
var ToolCalledWithParamConfigSchema =
|
|
667
|
+
var ToolCalledWithParamConfigSchema = z23.strictObject({
|
|
637
668
|
/** Name of the tool that must have been called */
|
|
638
|
-
toolName:
|
|
669
|
+
toolName: z23.string().min(1),
|
|
639
670
|
/** JSON string of key-value pairs for expected parameters (substring match). Optional — when omitted, only checks tool presence. */
|
|
640
|
-
expectedParams:
|
|
671
|
+
expectedParams: z23.string().min(1).optional(),
|
|
641
672
|
/** If true, the matching tool call must also have succeeded (step.success === true) */
|
|
642
|
-
requireSuccess:
|
|
673
|
+
requireSuccess: z23.boolean().optional()
|
|
643
674
|
});
|
|
644
|
-
var BuildPassedConfigSchema =
|
|
645
|
-
/**
|
|
646
|
-
command:
|
|
675
|
+
var BuildPassedConfigSchema = z23.strictObject({
|
|
676
|
+
/** Allowlisted command only (default at runtime: "yarn build") */
|
|
677
|
+
command: BuildPassedCommandStringSchema.optional(),
|
|
647
678
|
/** Expected exit code (default: 0) */
|
|
648
|
-
expectedExitCode:
|
|
679
|
+
expectedExitCode: z23.number().int().optional()
|
|
649
680
|
});
|
|
650
|
-
var TimeConfigSchema =
|
|
681
|
+
var TimeConfigSchema = z23.strictObject({
|
|
651
682
|
/** Maximum allowed duration in milliseconds */
|
|
652
|
-
maxDurationMs:
|
|
683
|
+
maxDurationMs: z23.number().int().positive()
|
|
653
684
|
});
|
|
654
|
-
var LlmJudgeConfigSchema =
|
|
685
|
+
var LlmJudgeConfigSchema = z23.object({
|
|
655
686
|
/**
|
|
656
687
|
* Prompt template with placeholders:
|
|
657
688
|
* - {{output}}: agent's final output
|
|
@@ -662,65 +693,65 @@ var LlmJudgeConfigSchema = z22.object({
|
|
|
662
693
|
* - {{trace}}: step-by-step trace of tool calls
|
|
663
694
|
* - Custom parameters defined in the parameters array
|
|
664
695
|
*/
|
|
665
|
-
prompt:
|
|
696
|
+
prompt: z23.string().min(1),
|
|
666
697
|
/** Minimum score to pass (0-10, default 7) */
|
|
667
|
-
minScore:
|
|
698
|
+
minScore: z23.number().int().min(0).max(10).optional(),
|
|
668
699
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
669
|
-
model:
|
|
700
|
+
model: z23.string().optional(),
|
|
670
701
|
/** Max output tokens */
|
|
671
|
-
maxTokens:
|
|
702
|
+
maxTokens: z23.number().int().optional(),
|
|
672
703
|
/** Temperature (0-1) */
|
|
673
|
-
temperature:
|
|
704
|
+
temperature: z23.number().min(0).max(1).optional(),
|
|
674
705
|
/** User-defined parameters for this assertion */
|
|
675
|
-
parameters:
|
|
706
|
+
parameters: z23.array(AssertionParameterSchema).optional()
|
|
676
707
|
});
|
|
677
|
-
var ApiCallConfigSchema =
|
|
708
|
+
var ApiCallConfigSchema = z23.strictObject({
|
|
678
709
|
/** URL to call */
|
|
679
|
-
url:
|
|
710
|
+
url: z23.string().min(1),
|
|
680
711
|
/** HTTP method (default GET) */
|
|
681
|
-
method:
|
|
712
|
+
method: z23.enum(["GET", "POST"]).optional(),
|
|
682
713
|
/** Request body (JSON string, for POST requests) */
|
|
683
|
-
requestBody:
|
|
714
|
+
requestBody: z23.string().optional(),
|
|
684
715
|
/** Expected JSON response to validate against (subset match — extra fields in actual are OK) */
|
|
685
|
-
expectedResponse:
|
|
716
|
+
expectedResponse: z23.string().min(1),
|
|
686
717
|
/** Request headers as JSON string of key-value pairs */
|
|
687
|
-
requestHeaders:
|
|
718
|
+
requestHeaders: z23.string().optional(),
|
|
688
719
|
/** Request timeout in milliseconds (default 30000) */
|
|
689
|
-
timeoutMs:
|
|
720
|
+
timeoutMs: z23.number().int().positive().optional()
|
|
690
721
|
});
|
|
691
722
|
var AssertionBaseFields = {
|
|
692
723
|
/** When true, the assertion's pass/fail logic is inverted (NOT operator). */
|
|
693
|
-
negate:
|
|
724
|
+
negate: z23.boolean().optional()
|
|
694
725
|
};
|
|
695
726
|
var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
|
|
696
|
-
type:
|
|
727
|
+
type: z23.literal("skill_was_called"),
|
|
697
728
|
...AssertionBaseFields
|
|
698
729
|
});
|
|
699
730
|
var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
|
|
700
|
-
type:
|
|
731
|
+
type: z23.literal("tool_called_with_param"),
|
|
701
732
|
...AssertionBaseFields
|
|
702
733
|
});
|
|
703
734
|
var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
|
|
704
|
-
type:
|
|
735
|
+
type: z23.literal("build_passed"),
|
|
705
736
|
...AssertionBaseFields
|
|
706
737
|
});
|
|
707
738
|
var CostAssertionSchema = CostConfigSchema.extend({
|
|
708
|
-
type:
|
|
739
|
+
type: z23.literal("cost"),
|
|
709
740
|
...AssertionBaseFields
|
|
710
741
|
});
|
|
711
742
|
var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
|
|
712
|
-
type:
|
|
743
|
+
type: z23.literal("llm_judge"),
|
|
713
744
|
...AssertionBaseFields
|
|
714
745
|
});
|
|
715
746
|
var ApiCallAssertionSchema = ApiCallConfigSchema.extend({
|
|
716
|
-
type:
|
|
747
|
+
type: z23.literal("api_call"),
|
|
717
748
|
...AssertionBaseFields
|
|
718
749
|
});
|
|
719
750
|
var TimeAssertionSchema = TimeConfigSchema.extend({
|
|
720
|
-
type:
|
|
751
|
+
type: z23.literal("time_limit"),
|
|
721
752
|
...AssertionBaseFields
|
|
722
753
|
});
|
|
723
|
-
var AssertionSchema =
|
|
754
|
+
var AssertionSchema = z23.union([
|
|
724
755
|
SkillWasCalledAssertionSchema,
|
|
725
756
|
ToolCalledWithParamAssertionSchema,
|
|
726
757
|
BuildPassedAssertionSchema,
|
|
@@ -729,7 +760,7 @@ var AssertionSchema = z22.union([
|
|
|
729
760
|
LlmJudgeAssertionSchema,
|
|
730
761
|
ApiCallAssertionSchema
|
|
731
762
|
]);
|
|
732
|
-
var AssertionConfigSchema =
|
|
763
|
+
var AssertionConfigSchema = z23.union([
|
|
733
764
|
LlmJudgeConfigSchema,
|
|
734
765
|
// requires prompt - check first
|
|
735
766
|
SkillWasCalledConfigSchema,
|
|
@@ -744,7 +775,7 @@ var AssertionConfigSchema = z22.union([
|
|
|
744
775
|
// requires maxCostUsd, uses strictObject
|
|
745
776
|
BuildPassedConfigSchema,
|
|
746
777
|
// all optional, uses strictObject to reject unknown keys
|
|
747
|
-
|
|
778
|
+
z23.object({})
|
|
748
779
|
// fallback empty config
|
|
749
780
|
]);
|
|
750
781
|
function validateAssertionConfig(type, config) {
|
|
@@ -768,52 +799,322 @@ function validateAssertionConfig(type, config) {
|
|
|
768
799
|
}
|
|
769
800
|
}
|
|
770
801
|
|
|
802
|
+
// src/assertion/system-assertions.ts
|
|
803
|
+
var SYSTEM_ASSERTION_IDS = {
|
|
804
|
+
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
805
|
+
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
806
|
+
BUILD_PASSED: "system:build_passed",
|
|
807
|
+
TIME_LIMIT: "system:time_limit",
|
|
808
|
+
COST: "system:cost",
|
|
809
|
+
LLM_JUDGE: "system:llm_judge",
|
|
810
|
+
API_CALL: "system:api_call"
|
|
811
|
+
};
|
|
812
|
+
function isSystemAssertionId(id) {
|
|
813
|
+
return id.startsWith("system:");
|
|
814
|
+
}
|
|
815
|
+
var SYSTEM_ASSERTIONS = {
|
|
816
|
+
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
817
|
+
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
818
|
+
name: "Skill Was Called",
|
|
819
|
+
description: "Check that one or more skills were invoked during the agent run",
|
|
820
|
+
type: "skill_was_called",
|
|
821
|
+
parameters: [
|
|
822
|
+
{
|
|
823
|
+
name: "skillNames",
|
|
824
|
+
label: "Skills",
|
|
825
|
+
type: "string",
|
|
826
|
+
required: true
|
|
827
|
+
},
|
|
828
|
+
{
|
|
829
|
+
name: "negate",
|
|
830
|
+
label: "Negate (NOT operator)",
|
|
831
|
+
type: "boolean",
|
|
832
|
+
required: false,
|
|
833
|
+
defaultValue: false
|
|
834
|
+
}
|
|
835
|
+
]
|
|
836
|
+
},
|
|
837
|
+
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
838
|
+
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
839
|
+
name: "Tool Called With Param",
|
|
840
|
+
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
841
|
+
type: "tool_called_with_param",
|
|
842
|
+
parameters: [
|
|
843
|
+
{
|
|
844
|
+
name: "toolName",
|
|
845
|
+
label: "Tool Name",
|
|
846
|
+
type: "string",
|
|
847
|
+
required: true
|
|
848
|
+
},
|
|
849
|
+
{
|
|
850
|
+
name: "expectedParams",
|
|
851
|
+
label: "Expected Parameters (JSON, substring match)",
|
|
852
|
+
type: "string",
|
|
853
|
+
required: false
|
|
854
|
+
},
|
|
855
|
+
{
|
|
856
|
+
name: "requireSuccess",
|
|
857
|
+
label: "Require Successful Call",
|
|
858
|
+
type: "boolean",
|
|
859
|
+
required: false,
|
|
860
|
+
defaultValue: false,
|
|
861
|
+
advanced: true
|
|
862
|
+
},
|
|
863
|
+
{
|
|
864
|
+
name: "negate",
|
|
865
|
+
label: "Negate (NOT operator)",
|
|
866
|
+
type: "boolean",
|
|
867
|
+
required: false,
|
|
868
|
+
defaultValue: false
|
|
869
|
+
}
|
|
870
|
+
]
|
|
871
|
+
},
|
|
872
|
+
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
873
|
+
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
874
|
+
name: "Build Passed",
|
|
875
|
+
description: "Run a build command and verify it exits with expected code",
|
|
876
|
+
type: "build_passed",
|
|
877
|
+
parameters: [
|
|
878
|
+
{
|
|
879
|
+
name: "command",
|
|
880
|
+
label: "Build Command",
|
|
881
|
+
type: "string",
|
|
882
|
+
required: false,
|
|
883
|
+
defaultValue: "yarn build"
|
|
884
|
+
},
|
|
885
|
+
{
|
|
886
|
+
name: "expectedExitCode",
|
|
887
|
+
label: "Expected Exit Code",
|
|
888
|
+
type: "number",
|
|
889
|
+
required: false,
|
|
890
|
+
defaultValue: 0
|
|
891
|
+
},
|
|
892
|
+
{
|
|
893
|
+
name: "maxBuildTime",
|
|
894
|
+
label: "Max Build Time (ms)",
|
|
895
|
+
type: "number",
|
|
896
|
+
required: false,
|
|
897
|
+
advanced: true
|
|
898
|
+
},
|
|
899
|
+
{
|
|
900
|
+
name: "maxMemory",
|
|
901
|
+
label: "Max Memory (MB)",
|
|
902
|
+
type: "number",
|
|
903
|
+
required: false,
|
|
904
|
+
advanced: true
|
|
905
|
+
}
|
|
906
|
+
]
|
|
907
|
+
},
|
|
908
|
+
[SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
|
|
909
|
+
id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
|
|
910
|
+
name: "Time Limit",
|
|
911
|
+
description: "Check that the scenario completed within a maximum duration",
|
|
912
|
+
type: "time_limit",
|
|
913
|
+
parameters: [
|
|
914
|
+
{
|
|
915
|
+
name: "maxDurationMs",
|
|
916
|
+
label: "Max Duration (ms)",
|
|
917
|
+
type: "number",
|
|
918
|
+
required: true,
|
|
919
|
+
defaultValue: 3e5
|
|
920
|
+
}
|
|
921
|
+
]
|
|
922
|
+
},
|
|
923
|
+
[SYSTEM_ASSERTION_IDS.COST]: {
|
|
924
|
+
id: SYSTEM_ASSERTION_IDS.COST,
|
|
925
|
+
name: "Cost",
|
|
926
|
+
description: "Check that the scenario LLM execution cost stays within a USD threshold",
|
|
927
|
+
type: "cost",
|
|
928
|
+
parameters: [
|
|
929
|
+
{
|
|
930
|
+
name: "maxCostUsd",
|
|
931
|
+
label: "Max Cost (USD)",
|
|
932
|
+
type: "number",
|
|
933
|
+
required: true,
|
|
934
|
+
defaultValue: 1
|
|
935
|
+
}
|
|
936
|
+
]
|
|
937
|
+
},
|
|
938
|
+
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
939
|
+
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
940
|
+
name: "LLM Judge",
|
|
941
|
+
description: "LLM evaluates the output and assigns a score (0-10)",
|
|
942
|
+
type: "llm_judge",
|
|
943
|
+
parameters: [
|
|
944
|
+
{
|
|
945
|
+
name: "prompt",
|
|
946
|
+
label: "Judge Prompt",
|
|
947
|
+
type: "string",
|
|
948
|
+
required: true,
|
|
949
|
+
defaultValue: "Verify the output meets the acceptance criteria."
|
|
950
|
+
},
|
|
951
|
+
{
|
|
952
|
+
name: "minScore",
|
|
953
|
+
label: "Minimum Score (0-10)",
|
|
954
|
+
type: "number",
|
|
955
|
+
required: false,
|
|
956
|
+
defaultValue: 7
|
|
957
|
+
},
|
|
958
|
+
{
|
|
959
|
+
name: "model",
|
|
960
|
+
label: "Model",
|
|
961
|
+
type: "string",
|
|
962
|
+
required: false
|
|
963
|
+
}
|
|
964
|
+
]
|
|
965
|
+
},
|
|
966
|
+
[SYSTEM_ASSERTION_IDS.API_CALL]: {
|
|
967
|
+
id: SYSTEM_ASSERTION_IDS.API_CALL,
|
|
968
|
+
name: "API Call",
|
|
969
|
+
description: "Call an API endpoint and verify the response contains expected data",
|
|
970
|
+
type: "api_call",
|
|
971
|
+
parameters: [
|
|
972
|
+
{
|
|
973
|
+
name: "url",
|
|
974
|
+
label: "URL",
|
|
975
|
+
type: "string",
|
|
976
|
+
required: true
|
|
977
|
+
},
|
|
978
|
+
{
|
|
979
|
+
name: "method",
|
|
980
|
+
label: "HTTP Method",
|
|
981
|
+
type: "string",
|
|
982
|
+
required: false,
|
|
983
|
+
defaultValue: "GET"
|
|
984
|
+
},
|
|
985
|
+
{
|
|
986
|
+
name: "requestBody",
|
|
987
|
+
label: "Request Body (JSON)",
|
|
988
|
+
type: "string",
|
|
989
|
+
required: false
|
|
990
|
+
},
|
|
991
|
+
{
|
|
992
|
+
name: "expectedResponse",
|
|
993
|
+
label: "Expected Response (JSON)",
|
|
994
|
+
type: "string",
|
|
995
|
+
required: true
|
|
996
|
+
},
|
|
997
|
+
{
|
|
998
|
+
name: "requestHeaders",
|
|
999
|
+
label: "Headers (JSON)",
|
|
1000
|
+
type: "string",
|
|
1001
|
+
required: false,
|
|
1002
|
+
advanced: true
|
|
1003
|
+
},
|
|
1004
|
+
{
|
|
1005
|
+
name: "timeoutMs",
|
|
1006
|
+
label: "Timeout (ms)",
|
|
1007
|
+
type: "number",
|
|
1008
|
+
required: false,
|
|
1009
|
+
defaultValue: 3e4,
|
|
1010
|
+
advanced: true
|
|
1011
|
+
}
|
|
1012
|
+
]
|
|
1013
|
+
}
|
|
1014
|
+
};
|
|
1015
|
+
function getSystemAssertions() {
|
|
1016
|
+
return Object.values(SYSTEM_ASSERTIONS);
|
|
1017
|
+
}
|
|
1018
|
+
function getSystemAssertion(id) {
|
|
1019
|
+
return SYSTEM_ASSERTIONS[id];
|
|
1020
|
+
}
|
|
1021
|
+
|
|
771
1022
|
// src/scenario/test-scenario.ts
|
|
772
|
-
var
|
|
1023
|
+
var MAX_IMAGE_BASE64_LENGTH = 4 * Math.ceil(2 * 1024 * 1024 / 3);
|
|
1024
|
+
var TriggerPromptImageSchema = z24.object({
|
|
1025
|
+
/** Base64-encoded image data (no data URL prefix) */
|
|
1026
|
+
base64: z24.string().max(MAX_IMAGE_BASE64_LENGTH, "Image exceeds 2 MB size limit"),
|
|
1027
|
+
/** MIME type of the image */
|
|
1028
|
+
mediaType: z24.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
|
|
1029
|
+
/** Original filename of the image */
|
|
1030
|
+
name: z24.string()
|
|
1031
|
+
});
|
|
1032
|
+
var ExpectedFileSchema = z24.object({
|
|
773
1033
|
/** Relative path where the file should be created */
|
|
774
|
-
path:
|
|
1034
|
+
path: z24.string(),
|
|
775
1035
|
/** Optional expected content */
|
|
776
|
-
content:
|
|
1036
|
+
content: z24.string().optional()
|
|
777
1037
|
});
|
|
778
1038
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
779
1039
|
/** The prompt sent to the agent to trigger the task */
|
|
780
|
-
triggerPrompt:
|
|
1040
|
+
triggerPrompt: z24.string().min(10),
|
|
781
1041
|
/** ID of the template to use for this scenario (null = no template) */
|
|
782
|
-
templateId:
|
|
1042
|
+
templateId: z24.string().nullish(),
|
|
783
1043
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
784
|
-
assertions:
|
|
1044
|
+
assertions: z24.array(AssertionSchema).optional(),
|
|
785
1045
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
786
|
-
assertionIds:
|
|
1046
|
+
assertionIds: z24.array(z24.string()).optional(),
|
|
787
1047
|
/** Linked assertions with per-scenario parameter values */
|
|
788
|
-
assertionLinks:
|
|
1048
|
+
assertionLinks: z24.array(ScenarioAssertionLinkSchema).optional(),
|
|
789
1049
|
/** Tags for categorisation and filtering */
|
|
790
|
-
tags:
|
|
791
|
-
|
|
792
|
-
|
|
1050
|
+
tags: z24.array(z24.string()).optional(),
|
|
1051
|
+
/** Base64-encoded images attached to the trigger prompt (max 3) */
|
|
1052
|
+
triggerPromptImages: z24.array(TriggerPromptImageSchema).max(3).optional()
|
|
1053
|
+
});
|
|
1054
|
+
function validateBuildPassedParamsInAssertionLinks(links, ctx) {
|
|
1055
|
+
if (!links) return;
|
|
1056
|
+
for (let i = 0; i < links.length; i++) {
|
|
1057
|
+
const link = links[i];
|
|
1058
|
+
if (link.assertionId !== SYSTEM_ASSERTION_IDS.BUILD_PASSED) continue;
|
|
1059
|
+
const cmd = link.params?.command;
|
|
1060
|
+
if (cmd === void 0 || cmd === null) continue;
|
|
1061
|
+
if (typeof cmd !== "string") {
|
|
1062
|
+
ctx.addIssue({
|
|
1063
|
+
code: z24.ZodIssueCode.custom,
|
|
1064
|
+
message: "build_passed command must be a string",
|
|
1065
|
+
path: ["assertionLinks", i, "params", "command"]
|
|
1066
|
+
});
|
|
1067
|
+
continue;
|
|
1068
|
+
}
|
|
1069
|
+
if (!isAllowedBuildCommandString(cmd)) {
|
|
1070
|
+
ctx.addIssue({
|
|
1071
|
+
code: z24.ZodIssueCode.custom,
|
|
1072
|
+
message: "Invalid build_passed command. Allowed: yarn build, npm run build, pnpm run build, pnpm build",
|
|
1073
|
+
path: ["assertionLinks", i, "params", "command"]
|
|
1074
|
+
});
|
|
1075
|
+
}
|
|
1076
|
+
}
|
|
1077
|
+
}
|
|
1078
|
+
var TestScenarioCreateBaseSchema = TestScenarioSchema.omit({
|
|
793
1079
|
id: true,
|
|
794
1080
|
createdAt: true,
|
|
795
1081
|
updatedAt: true,
|
|
796
1082
|
deleted: true
|
|
797
1083
|
});
|
|
798
|
-
var
|
|
1084
|
+
var CreateTestScenarioInputSchema = TestScenarioCreateBaseSchema.superRefine((data, ctx) => {
|
|
1085
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1086
|
+
});
|
|
1087
|
+
var UpdateTestScenarioInputSchema = TestScenarioCreateBaseSchema.partial().superRefine((data, ctx) => {
|
|
1088
|
+
if (data.assertionLinks !== void 0) {
|
|
1089
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1090
|
+
}
|
|
1091
|
+
});
|
|
799
1092
|
|
|
800
1093
|
// src/scenario/batch-import.ts
|
|
801
|
-
import { z as
|
|
1094
|
+
import { z as z25 } from "zod";
|
|
802
1095
|
var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
|
803
|
-
var BatchAssertionLinkSchema =
|
|
804
|
-
|
|
1096
|
+
var BatchAssertionLinkSchema = z25.union([
|
|
1097
|
+
z25.string().min(1),
|
|
805
1098
|
ScenarioAssertionLinkSchema
|
|
806
1099
|
]);
|
|
807
|
-
var BatchScenarioEntrySchema =
|
|
808
|
-
name:
|
|
809
|
-
description:
|
|
810
|
-
triggerPrompt:
|
|
811
|
-
templateId:
|
|
812
|
-
tags:
|
|
813
|
-
assertionLinks:
|
|
1100
|
+
var BatchScenarioEntrySchema = z25.object({
|
|
1101
|
+
name: z25.string().min(1, "name: Required"),
|
|
1102
|
+
description: z25.string().optional().default(""),
|
|
1103
|
+
triggerPrompt: z25.string().min(10, "triggerPrompt: Must be at least 10 characters"),
|
|
1104
|
+
templateId: z25.string().nullish(),
|
|
1105
|
+
tags: z25.array(z25.string()).optional(),
|
|
1106
|
+
assertionLinks: z25.array(BatchAssertionLinkSchema).optional()
|
|
1107
|
+
}).superRefine((data, ctx) => {
|
|
1108
|
+
if (!data.assertionLinks) return;
|
|
1109
|
+
const objectLinks = data.assertionLinks.filter(
|
|
1110
|
+
(link) => typeof link !== "string"
|
|
1111
|
+
);
|
|
1112
|
+
if (objectLinks.length > 0) {
|
|
1113
|
+
validateBuildPassedParamsInAssertionLinks(objectLinks, ctx);
|
|
1114
|
+
}
|
|
814
1115
|
});
|
|
815
|
-
var BatchImportPayloadSchema =
|
|
816
|
-
scenarios:
|
|
1116
|
+
var BatchImportPayloadSchema = z25.object({
|
|
1117
|
+
scenarios: z25.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
|
|
817
1118
|
});
|
|
818
1119
|
var BATCH_IMPORT_LIMITS = {
|
|
819
1120
|
MAX_SCENARIOS: 100,
|
|
@@ -835,29 +1136,29 @@ function normalizeBatchAssertionLink(link) {
|
|
|
835
1136
|
}
|
|
836
1137
|
return link;
|
|
837
1138
|
}
|
|
838
|
-
var BatchResultItemSchema =
|
|
839
|
-
index:
|
|
840
|
-
name:
|
|
841
|
-
status:
|
|
842
|
-
id:
|
|
843
|
-
errors:
|
|
844
|
-
});
|
|
845
|
-
var BatchSummarySchema =
|
|
846
|
-
total:
|
|
847
|
-
valid:
|
|
848
|
-
invalid:
|
|
849
|
-
created:
|
|
850
|
-
});
|
|
851
|
-
var BatchImportResponseSchema =
|
|
1139
|
+
var BatchResultItemSchema = z25.object({
|
|
1140
|
+
index: z25.number(),
|
|
1141
|
+
name: z25.string(),
|
|
1142
|
+
status: z25.enum(["valid", "invalid"]),
|
|
1143
|
+
id: z25.string().nullable().optional(),
|
|
1144
|
+
errors: z25.array(z25.string()).optional()
|
|
1145
|
+
});
|
|
1146
|
+
var BatchSummarySchema = z25.object({
|
|
1147
|
+
total: z25.number(),
|
|
1148
|
+
valid: z25.number(),
|
|
1149
|
+
invalid: z25.number(),
|
|
1150
|
+
created: z25.number()
|
|
1151
|
+
});
|
|
1152
|
+
var BatchImportResponseSchema = z25.object({
|
|
852
1153
|
summary: BatchSummarySchema,
|
|
853
|
-
results:
|
|
1154
|
+
results: z25.array(BatchResultItemSchema)
|
|
854
1155
|
});
|
|
855
1156
|
|
|
856
1157
|
// src/suite/test-suite.ts
|
|
857
|
-
import { z as
|
|
1158
|
+
import { z as z26 } from "zod";
|
|
858
1159
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
859
1160
|
/** IDs of test scenarios in this suite */
|
|
860
|
-
scenarioIds:
|
|
1161
|
+
scenarioIds: z26.array(z26.string())
|
|
861
1162
|
});
|
|
862
1163
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
863
1164
|
id: true,
|
|
@@ -868,21 +1169,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
868
1169
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
869
1170
|
|
|
870
1171
|
// src/evaluation/metrics.ts
|
|
871
|
-
import { z as
|
|
872
|
-
var TokenUsageSchema =
|
|
873
|
-
prompt:
|
|
874
|
-
completion:
|
|
875
|
-
total:
|
|
876
|
-
});
|
|
877
|
-
var EvalMetricsSchema =
|
|
878
|
-
totalAssertions:
|
|
879
|
-
passed:
|
|
880
|
-
failed:
|
|
881
|
-
skipped:
|
|
882
|
-
errors:
|
|
883
|
-
passRate:
|
|
884
|
-
avgDuration:
|
|
885
|
-
totalDuration:
|
|
1172
|
+
import { z as z27 } from "zod";
|
|
1173
|
+
var TokenUsageSchema = z27.object({
|
|
1174
|
+
prompt: z27.number(),
|
|
1175
|
+
completion: z27.number(),
|
|
1176
|
+
total: z27.number()
|
|
1177
|
+
});
|
|
1178
|
+
var EvalMetricsSchema = z27.object({
|
|
1179
|
+
totalAssertions: z27.number(),
|
|
1180
|
+
passed: z27.number(),
|
|
1181
|
+
failed: z27.number(),
|
|
1182
|
+
skipped: z27.number(),
|
|
1183
|
+
errors: z27.number(),
|
|
1184
|
+
passRate: z27.number(),
|
|
1185
|
+
avgDuration: z27.number(),
|
|
1186
|
+
totalDuration: z27.number()
|
|
886
1187
|
});
|
|
887
1188
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
888
1189
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -892,7 +1193,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
892
1193
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
893
1194
|
return EvalStatus2;
|
|
894
1195
|
})(EvalStatus || {});
|
|
895
|
-
var EvalStatusSchema =
|
|
1196
|
+
var EvalStatusSchema = z27.enum(EvalStatus);
|
|
896
1197
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
897
1198
|
LLMStepType2["COMPLETION"] = "completion";
|
|
898
1199
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -900,54 +1201,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
900
1201
|
LLMStepType2["THINKING"] = "thinking";
|
|
901
1202
|
return LLMStepType2;
|
|
902
1203
|
})(LLMStepType || {});
|
|
903
|
-
var LLMTraceStepSchema =
|
|
904
|
-
id:
|
|
905
|
-
stepNumber:
|
|
906
|
-
type:
|
|
907
|
-
model:
|
|
908
|
-
provider:
|
|
909
|
-
startedAt:
|
|
910
|
-
durationMs:
|
|
1204
|
+
var LLMTraceStepSchema = z27.object({
|
|
1205
|
+
id: z27.string(),
|
|
1206
|
+
stepNumber: z27.number(),
|
|
1207
|
+
type: z27.enum(LLMStepType),
|
|
1208
|
+
model: z27.string(),
|
|
1209
|
+
provider: z27.string(),
|
|
1210
|
+
startedAt: z27.string(),
|
|
1211
|
+
durationMs: z27.number(),
|
|
911
1212
|
tokenUsage: TokenUsageSchema,
|
|
912
|
-
costUsd:
|
|
913
|
-
toolName:
|
|
914
|
-
toolArguments:
|
|
915
|
-
inputPreview:
|
|
916
|
-
outputPreview:
|
|
917
|
-
success:
|
|
918
|
-
error:
|
|
919
|
-
turnIndex:
|
|
920
|
-
});
|
|
921
|
-
var LLMBreakdownStatsSchema =
|
|
922
|
-
count:
|
|
923
|
-
durationMs:
|
|
924
|
-
tokens:
|
|
925
|
-
costUsd:
|
|
926
|
-
});
|
|
927
|
-
var LLMTraceSummarySchema =
|
|
928
|
-
totalSteps:
|
|
929
|
-
totalTurns:
|
|
930
|
-
totalDurationMs:
|
|
1213
|
+
costUsd: z27.number(),
|
|
1214
|
+
toolName: z27.string().optional(),
|
|
1215
|
+
toolArguments: z27.string().optional(),
|
|
1216
|
+
inputPreview: z27.string().optional(),
|
|
1217
|
+
outputPreview: z27.string().optional(),
|
|
1218
|
+
success: z27.boolean(),
|
|
1219
|
+
error: z27.string().optional(),
|
|
1220
|
+
turnIndex: z27.number().optional()
|
|
1221
|
+
});
|
|
1222
|
+
var LLMBreakdownStatsSchema = z27.object({
|
|
1223
|
+
count: z27.number(),
|
|
1224
|
+
durationMs: z27.number(),
|
|
1225
|
+
tokens: z27.number(),
|
|
1226
|
+
costUsd: z27.number()
|
|
1227
|
+
});
|
|
1228
|
+
var LLMTraceSummarySchema = z27.object({
|
|
1229
|
+
totalSteps: z27.number(),
|
|
1230
|
+
totalTurns: z27.number().optional(),
|
|
1231
|
+
totalDurationMs: z27.number(),
|
|
931
1232
|
totalTokens: TokenUsageSchema,
|
|
932
|
-
totalCostUsd:
|
|
933
|
-
stepTypeBreakdown:
|
|
934
|
-
modelBreakdown:
|
|
935
|
-
modelsUsed:
|
|
936
|
-
});
|
|
937
|
-
var LLMTraceSchema =
|
|
938
|
-
id:
|
|
939
|
-
steps:
|
|
1233
|
+
totalCostUsd: z27.number(),
|
|
1234
|
+
stepTypeBreakdown: z27.record(z27.string(), LLMBreakdownStatsSchema).optional(),
|
|
1235
|
+
modelBreakdown: z27.record(z27.string(), LLMBreakdownStatsSchema),
|
|
1236
|
+
modelsUsed: z27.array(z27.string())
|
|
1237
|
+
});
|
|
1238
|
+
var LLMTraceSchema = z27.object({
|
|
1239
|
+
id: z27.string(),
|
|
1240
|
+
steps: z27.array(LLMTraceStepSchema),
|
|
940
1241
|
summary: LLMTraceSummarySchema
|
|
941
1242
|
});
|
|
942
1243
|
|
|
943
1244
|
// src/evaluation/eval-result.ts
|
|
944
|
-
import { z as
|
|
1245
|
+
import { z as z31 } from "zod";
|
|
945
1246
|
|
|
946
1247
|
// src/evaluation/eval-run.ts
|
|
947
|
-
import { z as
|
|
1248
|
+
import { z as z29 } from "zod";
|
|
948
1249
|
|
|
949
1250
|
// src/evaluation/live-trace.ts
|
|
950
|
-
import { z as
|
|
1251
|
+
import { z as z28 } from "zod";
|
|
951
1252
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
952
1253
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
953
1254
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -961,37 +1262,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
961
1262
|
LiveTraceEventType2["USER"] = "user";
|
|
962
1263
|
return LiveTraceEventType2;
|
|
963
1264
|
})(LiveTraceEventType || {});
|
|
964
|
-
var LiveTraceEventSchema =
|
|
1265
|
+
var LiveTraceEventSchema = z28.object({
|
|
965
1266
|
/** The evaluation run ID */
|
|
966
|
-
evalRunId:
|
|
1267
|
+
evalRunId: z28.string(),
|
|
967
1268
|
/** The scenario ID being executed */
|
|
968
|
-
scenarioId:
|
|
1269
|
+
scenarioId: z28.string(),
|
|
969
1270
|
/** The scenario name for display */
|
|
970
|
-
scenarioName:
|
|
1271
|
+
scenarioName: z28.string(),
|
|
971
1272
|
/** The target ID (skill, agent, etc.) */
|
|
972
|
-
targetId:
|
|
1273
|
+
targetId: z28.string(),
|
|
973
1274
|
/** The target name for display */
|
|
974
|
-
targetName:
|
|
1275
|
+
targetName: z28.string(),
|
|
975
1276
|
/** Step number in the current scenario execution */
|
|
976
|
-
stepNumber:
|
|
1277
|
+
stepNumber: z28.number(),
|
|
977
1278
|
/** Type of trace event */
|
|
978
|
-
type:
|
|
1279
|
+
type: z28.enum(LiveTraceEventType),
|
|
979
1280
|
/** Tool name if this is a tool_use event */
|
|
980
|
-
toolName:
|
|
1281
|
+
toolName: z28.string().optional(),
|
|
981
1282
|
/** Tool arguments preview (truncated JSON) */
|
|
982
|
-
toolArgs:
|
|
1283
|
+
toolArgs: z28.string().optional(),
|
|
983
1284
|
/** Output preview (truncated text) */
|
|
984
|
-
outputPreview:
|
|
1285
|
+
outputPreview: z28.string().optional(),
|
|
985
1286
|
/** File path for file operations */
|
|
986
|
-
filePath:
|
|
1287
|
+
filePath: z28.string().optional(),
|
|
987
1288
|
/** Elapsed time in milliseconds for progress events */
|
|
988
|
-
elapsedMs:
|
|
1289
|
+
elapsedMs: z28.number().optional(),
|
|
989
1290
|
/** Thinking/reasoning text from Claude */
|
|
990
|
-
thinking:
|
|
1291
|
+
thinking: z28.string().optional(),
|
|
991
1292
|
/** Timestamp when this event occurred */
|
|
992
|
-
timestamp:
|
|
1293
|
+
timestamp: z28.string(),
|
|
993
1294
|
/** Whether this is the final event for this scenario */
|
|
994
|
-
isComplete:
|
|
1295
|
+
isComplete: z28.boolean()
|
|
995
1296
|
});
|
|
996
1297
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
997
1298
|
function parseTraceEventLine(line) {
|
|
@@ -1020,40 +1321,40 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
1020
1321
|
TriggerType2["SCHEDULED"] = "SCHEDULED";
|
|
1021
1322
|
return TriggerType2;
|
|
1022
1323
|
})(TriggerType || {});
|
|
1023
|
-
var TriggerMetadataSchema =
|
|
1024
|
-
version:
|
|
1025
|
-
resourceUpdated:
|
|
1026
|
-
scheduleId:
|
|
1324
|
+
var TriggerMetadataSchema = z29.object({
|
|
1325
|
+
version: z29.string().optional(),
|
|
1326
|
+
resourceUpdated: z29.array(z29.string()).optional(),
|
|
1327
|
+
scheduleId: z29.string().optional()
|
|
1027
1328
|
});
|
|
1028
|
-
var TriggerSchema =
|
|
1029
|
-
id:
|
|
1329
|
+
var TriggerSchema = z29.object({
|
|
1330
|
+
id: z29.string(),
|
|
1030
1331
|
metadata: TriggerMetadataSchema.optional(),
|
|
1031
|
-
type:
|
|
1332
|
+
type: z29.nativeEnum(TriggerType)
|
|
1032
1333
|
});
|
|
1033
|
-
var DiffLineTypeSchema =
|
|
1034
|
-
var DiffLineSchema =
|
|
1334
|
+
var DiffLineTypeSchema = z29.enum(["added", "removed", "unchanged"]);
|
|
1335
|
+
var DiffLineSchema = z29.object({
|
|
1035
1336
|
type: DiffLineTypeSchema,
|
|
1036
|
-
content:
|
|
1037
|
-
lineNumber:
|
|
1038
|
-
});
|
|
1039
|
-
var DiffContentSchema =
|
|
1040
|
-
path:
|
|
1041
|
-
expected:
|
|
1042
|
-
actual:
|
|
1043
|
-
diffLines:
|
|
1044
|
-
renamedFrom:
|
|
1337
|
+
content: z29.string(),
|
|
1338
|
+
lineNumber: z29.number()
|
|
1339
|
+
});
|
|
1340
|
+
var DiffContentSchema = z29.object({
|
|
1341
|
+
path: z29.string(),
|
|
1342
|
+
expected: z29.string(),
|
|
1343
|
+
actual: z29.string(),
|
|
1344
|
+
diffLines: z29.array(DiffLineSchema),
|
|
1345
|
+
renamedFrom: z29.string().optional(),
|
|
1045
1346
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1046
|
-
isInfrastructure:
|
|
1347
|
+
isInfrastructure: z29.boolean().optional()
|
|
1047
1348
|
});
|
|
1048
|
-
var CommandExecutionSchema =
|
|
1049
|
-
command:
|
|
1050
|
-
exitCode:
|
|
1051
|
-
output:
|
|
1052
|
-
duration:
|
|
1349
|
+
var CommandExecutionSchema = z29.object({
|
|
1350
|
+
command: z29.string(),
|
|
1351
|
+
exitCode: z29.number(),
|
|
1352
|
+
output: z29.string().optional(),
|
|
1353
|
+
duration: z29.number()
|
|
1053
1354
|
});
|
|
1054
|
-
var FileModificationSchema =
|
|
1055
|
-
path:
|
|
1056
|
-
action:
|
|
1355
|
+
var FileModificationSchema = z29.object({
|
|
1356
|
+
path: z29.string(),
|
|
1357
|
+
action: z29.enum(["created", "modified", "deleted"])
|
|
1057
1358
|
});
|
|
1058
1359
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
1059
1360
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -1061,62 +1362,62 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
1061
1362
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
1062
1363
|
return TemplateFileStatus2;
|
|
1063
1364
|
})(TemplateFileStatus || {});
|
|
1064
|
-
var TemplateFileSchema =
|
|
1365
|
+
var TemplateFileSchema = z29.object({
|
|
1065
1366
|
/** Relative path within the template */
|
|
1066
|
-
path:
|
|
1367
|
+
path: z29.string(),
|
|
1067
1368
|
/** Full file content after execution */
|
|
1068
|
-
content:
|
|
1369
|
+
content: z29.string(),
|
|
1069
1370
|
/** File status (new, modified, unchanged) */
|
|
1070
|
-
status:
|
|
1371
|
+
status: z29.enum(["new", "modified", "unchanged"]),
|
|
1071
1372
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1072
|
-
isInfrastructure:
|
|
1373
|
+
isInfrastructure: z29.boolean().optional()
|
|
1073
1374
|
});
|
|
1074
|
-
var ApiCallSchema =
|
|
1075
|
-
endpoint:
|
|
1076
|
-
tokensUsed:
|
|
1077
|
-
duration:
|
|
1375
|
+
var ApiCallSchema = z29.object({
|
|
1376
|
+
endpoint: z29.string(),
|
|
1377
|
+
tokensUsed: z29.number(),
|
|
1378
|
+
duration: z29.number()
|
|
1078
1379
|
});
|
|
1079
|
-
var ExecutionTraceSchema =
|
|
1080
|
-
commands:
|
|
1081
|
-
filesModified:
|
|
1082
|
-
apiCalls:
|
|
1083
|
-
totalDuration:
|
|
1380
|
+
var ExecutionTraceSchema = z29.object({
|
|
1381
|
+
commands: z29.array(CommandExecutionSchema),
|
|
1382
|
+
filesModified: z29.array(FileModificationSchema),
|
|
1383
|
+
apiCalls: z29.array(ApiCallSchema),
|
|
1384
|
+
totalDuration: z29.number()
|
|
1084
1385
|
});
|
|
1085
|
-
var RunAnalysisFindingSchema =
|
|
1086
|
-
category:
|
|
1386
|
+
var RunAnalysisFindingSchema = z29.object({
|
|
1387
|
+
category: z29.enum([
|
|
1087
1388
|
"failure_pattern",
|
|
1088
1389
|
"cost_waste",
|
|
1089
1390
|
"flakiness",
|
|
1090
1391
|
"inefficiency",
|
|
1091
1392
|
"positive"
|
|
1092
1393
|
]),
|
|
1093
|
-
severity:
|
|
1094
|
-
description:
|
|
1095
|
-
affectedScenarios:
|
|
1096
|
-
recommendation:
|
|
1394
|
+
severity: z29.enum(["high", "medium", "low"]),
|
|
1395
|
+
description: z29.string(),
|
|
1396
|
+
affectedScenarios: z29.array(z29.string()),
|
|
1397
|
+
recommendation: z29.string().optional()
|
|
1097
1398
|
});
|
|
1098
|
-
var RunAnalysisSchema =
|
|
1099
|
-
generatedAt:
|
|
1100
|
-
summary:
|
|
1101
|
-
findings:
|
|
1399
|
+
var RunAnalysisSchema = z29.object({
|
|
1400
|
+
generatedAt: z29.string(),
|
|
1401
|
+
summary: z29.string(),
|
|
1402
|
+
findings: z29.array(RunAnalysisFindingSchema)
|
|
1102
1403
|
});
|
|
1103
1404
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1104
1405
|
/** Agent ID for this run */
|
|
1105
|
-
agentId:
|
|
1406
|
+
agentId: z29.string().optional(),
|
|
1106
1407
|
/** Preset ID that originated this run (optional) */
|
|
1107
|
-
presetId:
|
|
1408
|
+
presetId: z29.string().optional(),
|
|
1108
1409
|
/** Skill IDs for this run */
|
|
1109
|
-
skillIds:
|
|
1410
|
+
skillIds: z29.array(z29.string()).optional(),
|
|
1110
1411
|
/** Map of skillId to skillVersionId for this run */
|
|
1111
|
-
skillVersions:
|
|
1412
|
+
skillVersions: z29.record(z29.string(), z29.string()).optional(),
|
|
1112
1413
|
/** Scenario IDs to run (always present — resolved server-side from tags when needed) */
|
|
1113
|
-
scenarioIds:
|
|
1414
|
+
scenarioIds: z29.array(z29.string()),
|
|
1114
1415
|
/** Current status */
|
|
1115
1416
|
status: EvalStatusSchema,
|
|
1116
1417
|
/** Progress percentage (0-100) */
|
|
1117
|
-
progress:
|
|
1418
|
+
progress: z29.number(),
|
|
1118
1419
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1119
|
-
results:
|
|
1420
|
+
results: z29.array(z29.lazy(() => EvalRunResultSchema)),
|
|
1120
1421
|
/** Aggregated metrics across all results */
|
|
1121
1422
|
aggregateMetrics: EvalMetricsSchema,
|
|
1122
1423
|
/** Aggregated LLM trace summary */
|
|
@@ -1124,41 +1425,41 @@ var EvalRunSchema = TenantEntitySchema.extend({
|
|
|
1124
1425
|
/** What triggered this run */
|
|
1125
1426
|
trigger: TriggerSchema.optional(),
|
|
1126
1427
|
/** When the run started (set when evaluation is triggered) */
|
|
1127
|
-
startedAt:
|
|
1428
|
+
startedAt: z29.string().optional(),
|
|
1128
1429
|
/** When the run completed */
|
|
1129
|
-
completedAt:
|
|
1430
|
+
completedAt: z29.string().optional(),
|
|
1130
1431
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1131
|
-
liveTraceEvents:
|
|
1432
|
+
liveTraceEvents: z29.array(LiveTraceEventSchema).optional(),
|
|
1132
1433
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1133
|
-
jobId:
|
|
1434
|
+
jobId: z29.string().optional(),
|
|
1134
1435
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1135
|
-
jobStatus:
|
|
1436
|
+
jobStatus: z29.string().optional(),
|
|
1136
1437
|
/** Remote job error message if the job failed */
|
|
1137
|
-
jobError:
|
|
1438
|
+
jobError: z29.string().optional(),
|
|
1138
1439
|
/** Timestamp of the last job status check */
|
|
1139
|
-
jobStatusCheckedAt:
|
|
1440
|
+
jobStatusCheckedAt: z29.string().optional(),
|
|
1140
1441
|
/** MCP server IDs to enable for this run (optional) */
|
|
1141
|
-
mcpIds:
|
|
1442
|
+
mcpIds: z29.array(z29.string()).optional(),
|
|
1142
1443
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
1143
|
-
subAgentIds:
|
|
1444
|
+
subAgentIds: z29.array(z29.string()).optional(),
|
|
1144
1445
|
/** Rule IDs to enable for this run (optional) */
|
|
1145
|
-
ruleIds:
|
|
1446
|
+
ruleIds: z29.array(z29.string()).optional(),
|
|
1146
1447
|
/** Tags used to select scenarios for this run (for traceability) */
|
|
1147
|
-
tags:
|
|
1448
|
+
tags: z29.array(z29.string()).optional(),
|
|
1148
1449
|
/** How many times each scenario is executed within this eval run. Default: 1. Max: 20. */
|
|
1149
|
-
runsPerScenario:
|
|
1450
|
+
runsPerScenario: z29.number().int().min(1).max(20).optional(),
|
|
1150
1451
|
/** Snapshot of agent configuration captured at run creation time */
|
|
1151
|
-
agentSnapshot:
|
|
1152
|
-
name:
|
|
1452
|
+
agentSnapshot: z29.object({
|
|
1453
|
+
name: z29.string().optional(),
|
|
1153
1454
|
agentType: AgentTypeSchema.optional(),
|
|
1154
1455
|
runCommand: AgentRunCommandSchema.optional(),
|
|
1155
|
-
systemPrompt:
|
|
1456
|
+
systemPrompt: z29.string().nullable().optional(),
|
|
1156
1457
|
modelConfig: ModelConfigSchema.optional()
|
|
1157
1458
|
}).optional(),
|
|
1158
1459
|
/** UUID linking all runs in a comparison group */
|
|
1159
|
-
comparisonGroupId:
|
|
1460
|
+
comparisonGroupId: z29.string().optional(),
|
|
1160
1461
|
/** Human-readable label for this variant (e.g., "MCP: Wix Stores") */
|
|
1161
|
-
comparisonLabel:
|
|
1462
|
+
comparisonLabel: z29.string().optional(),
|
|
1162
1463
|
/** LLM-generated analysis of the completed run */
|
|
1163
1464
|
runAnalysis: RunAnalysisSchema.optional()
|
|
1164
1465
|
});
|
|
@@ -1176,60 +1477,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
1176
1477
|
agentSnapshot: true
|
|
1177
1478
|
}).extend({
|
|
1178
1479
|
/** Optional on input — backend resolves from tags when not provided */
|
|
1179
|
-
scenarioIds:
|
|
1480
|
+
scenarioIds: z29.array(z29.string()).optional()
|
|
1180
1481
|
}).refine(
|
|
1181
1482
|
(data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
|
|
1182
1483
|
{ message: "Either scenarioIds or tags must be provided" }
|
|
1183
1484
|
);
|
|
1184
|
-
var EvaluationProgressSchema =
|
|
1185
|
-
runId:
|
|
1186
|
-
targetId:
|
|
1187
|
-
totalScenarios:
|
|
1188
|
-
completedScenarios:
|
|
1189
|
-
scenarioProgress:
|
|
1190
|
-
|
|
1191
|
-
scenarioId:
|
|
1192
|
-
currentStep:
|
|
1193
|
-
error:
|
|
1485
|
+
var EvaluationProgressSchema = z29.object({
|
|
1486
|
+
runId: z29.string(),
|
|
1487
|
+
targetId: z29.string(),
|
|
1488
|
+
totalScenarios: z29.number(),
|
|
1489
|
+
completedScenarios: z29.number(),
|
|
1490
|
+
scenarioProgress: z29.array(
|
|
1491
|
+
z29.object({
|
|
1492
|
+
scenarioId: z29.string(),
|
|
1493
|
+
currentStep: z29.string(),
|
|
1494
|
+
error: z29.string().optional()
|
|
1194
1495
|
})
|
|
1195
1496
|
),
|
|
1196
|
-
createdAt:
|
|
1197
|
-
});
|
|
1198
|
-
var EvaluationLogSchema =
|
|
1199
|
-
runId:
|
|
1200
|
-
scenarioId:
|
|
1201
|
-
log:
|
|
1202
|
-
level:
|
|
1203
|
-
message:
|
|
1204
|
-
args:
|
|
1205
|
-
error:
|
|
1497
|
+
createdAt: z29.number()
|
|
1498
|
+
});
|
|
1499
|
+
var EvaluationLogSchema = z29.object({
|
|
1500
|
+
runId: z29.string(),
|
|
1501
|
+
scenarioId: z29.string(),
|
|
1502
|
+
log: z29.object({
|
|
1503
|
+
level: z29.enum(["info", "error", "debug"]),
|
|
1504
|
+
message: z29.string().optional(),
|
|
1505
|
+
args: z29.array(z29.any()).optional(),
|
|
1506
|
+
error: z29.string().optional()
|
|
1206
1507
|
})
|
|
1207
1508
|
});
|
|
1208
1509
|
var LLM_TIMEOUT = 12e4;
|
|
1209
1510
|
|
|
1210
1511
|
// src/evaluation/conversation.ts
|
|
1211
|
-
import { z as
|
|
1212
|
-
var TextBlockSchema =
|
|
1213
|
-
type:
|
|
1214
|
-
text:
|
|
1215
|
-
});
|
|
1216
|
-
var ThinkingBlockSchema =
|
|
1217
|
-
type:
|
|
1218
|
-
thinking:
|
|
1219
|
-
});
|
|
1220
|
-
var ToolUseBlockSchema =
|
|
1221
|
-
type:
|
|
1222
|
-
toolName:
|
|
1223
|
-
toolId:
|
|
1224
|
-
input:
|
|
1225
|
-
});
|
|
1226
|
-
var ToolResultBlockSchema =
|
|
1227
|
-
type:
|
|
1228
|
-
toolUseId:
|
|
1229
|
-
content:
|
|
1230
|
-
isError:
|
|
1231
|
-
});
|
|
1232
|
-
var ConversationBlockSchema =
|
|
1512
|
+
import { z as z30 } from "zod";
|
|
1513
|
+
var TextBlockSchema = z30.object({
|
|
1514
|
+
type: z30.literal("text"),
|
|
1515
|
+
text: z30.string()
|
|
1516
|
+
});
|
|
1517
|
+
var ThinkingBlockSchema = z30.object({
|
|
1518
|
+
type: z30.literal("thinking"),
|
|
1519
|
+
thinking: z30.string()
|
|
1520
|
+
});
|
|
1521
|
+
var ToolUseBlockSchema = z30.object({
|
|
1522
|
+
type: z30.literal("tool_use"),
|
|
1523
|
+
toolName: z30.string(),
|
|
1524
|
+
toolId: z30.string(),
|
|
1525
|
+
input: z30.unknown()
|
|
1526
|
+
});
|
|
1527
|
+
var ToolResultBlockSchema = z30.object({
|
|
1528
|
+
type: z30.literal("tool_result"),
|
|
1529
|
+
toolUseId: z30.string(),
|
|
1530
|
+
content: z30.string(),
|
|
1531
|
+
isError: z30.boolean().optional()
|
|
1532
|
+
});
|
|
1533
|
+
var ConversationBlockSchema = z30.discriminatedUnion("type", [
|
|
1233
1534
|
TextBlockSchema,
|
|
1234
1535
|
ThinkingBlockSchema,
|
|
1235
1536
|
ToolUseBlockSchema,
|
|
@@ -1240,18 +1541,18 @@ var ConversationMessageRoles = [
|
|
|
1240
1541
|
"user",
|
|
1241
1542
|
"system"
|
|
1242
1543
|
];
|
|
1243
|
-
var ConversationMessageSchema =
|
|
1244
|
-
role:
|
|
1245
|
-
content:
|
|
1246
|
-
timestamp:
|
|
1544
|
+
var ConversationMessageSchema = z30.object({
|
|
1545
|
+
role: z30.enum(ConversationMessageRoles),
|
|
1546
|
+
content: z30.array(ConversationBlockSchema),
|
|
1547
|
+
timestamp: z30.string()
|
|
1247
1548
|
});
|
|
1248
|
-
var ScenarioConversationSchema =
|
|
1249
|
-
id:
|
|
1250
|
-
projectId:
|
|
1251
|
-
evalRunId:
|
|
1252
|
-
resultId:
|
|
1253
|
-
messages:
|
|
1254
|
-
createdAt:
|
|
1549
|
+
var ScenarioConversationSchema = z30.object({
|
|
1550
|
+
id: z30.string(),
|
|
1551
|
+
projectId: z30.string(),
|
|
1552
|
+
evalRunId: z30.string(),
|
|
1553
|
+
resultId: z30.string(),
|
|
1554
|
+
messages: z30.array(ConversationMessageSchema),
|
|
1555
|
+
createdAt: z30.string()
|
|
1255
1556
|
});
|
|
1256
1557
|
|
|
1257
1558
|
// src/evaluation/eval-result.ts
|
|
@@ -1262,98 +1563,98 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
1262
1563
|
AssertionResultStatus2["ERROR"] = "error";
|
|
1263
1564
|
return AssertionResultStatus2;
|
|
1264
1565
|
})(AssertionResultStatus || {});
|
|
1265
|
-
var AssertionResultSchema =
|
|
1266
|
-
id:
|
|
1267
|
-
assertionId:
|
|
1268
|
-
assertionType:
|
|
1269
|
-
assertionName:
|
|
1270
|
-
status:
|
|
1271
|
-
message:
|
|
1272
|
-
expected:
|
|
1273
|
-
actual:
|
|
1274
|
-
duration:
|
|
1275
|
-
details:
|
|
1276
|
-
llmTraceSteps:
|
|
1277
|
-
});
|
|
1278
|
-
var EvalRunResultSchema =
|
|
1279
|
-
id:
|
|
1280
|
-
targetId:
|
|
1281
|
-
targetName:
|
|
1566
|
+
var AssertionResultSchema = z31.object({
|
|
1567
|
+
id: z31.string(),
|
|
1568
|
+
assertionId: z31.string(),
|
|
1569
|
+
assertionType: z31.string(),
|
|
1570
|
+
assertionName: z31.string(),
|
|
1571
|
+
status: z31.enum(AssertionResultStatus),
|
|
1572
|
+
message: z31.string().optional(),
|
|
1573
|
+
expected: z31.string().optional(),
|
|
1574
|
+
actual: z31.string().optional(),
|
|
1575
|
+
duration: z31.number().optional(),
|
|
1576
|
+
details: z31.record(z31.string(), z31.unknown()).optional(),
|
|
1577
|
+
llmTraceSteps: z31.array(LLMTraceStepSchema).optional()
|
|
1578
|
+
});
|
|
1579
|
+
var EvalRunResultSchema = z31.object({
|
|
1580
|
+
id: z31.string(),
|
|
1581
|
+
targetId: z31.string(),
|
|
1582
|
+
targetName: z31.string().optional(),
|
|
1282
1583
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
1283
|
-
skillVersionId:
|
|
1584
|
+
skillVersionId: z31.string().optional(),
|
|
1284
1585
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
1285
|
-
skillVersion:
|
|
1286
|
-
scenarioId:
|
|
1287
|
-
scenarioName:
|
|
1586
|
+
skillVersion: z31.string().optional(),
|
|
1587
|
+
scenarioId: z31.string(),
|
|
1588
|
+
scenarioName: z31.string(),
|
|
1288
1589
|
/** Snapshot of the trigger prompt used during the run (prevents stale display after edits) */
|
|
1289
|
-
triggerPrompt:
|
|
1590
|
+
triggerPrompt: z31.string().optional(),
|
|
1290
1591
|
modelConfig: ModelConfigSchema.optional(),
|
|
1291
|
-
assertionResults:
|
|
1592
|
+
assertionResults: z31.array(AssertionResultSchema),
|
|
1292
1593
|
metrics: EvalMetricsSchema.optional(),
|
|
1293
|
-
passed:
|
|
1294
|
-
failed:
|
|
1295
|
-
passRate:
|
|
1296
|
-
duration:
|
|
1297
|
-
outputText:
|
|
1298
|
-
files:
|
|
1299
|
-
fileDiffs:
|
|
1594
|
+
passed: z31.number(),
|
|
1595
|
+
failed: z31.number(),
|
|
1596
|
+
passRate: z31.number(),
|
|
1597
|
+
duration: z31.number(),
|
|
1598
|
+
outputText: z31.string().optional(),
|
|
1599
|
+
files: z31.array(ExpectedFileSchema).optional(),
|
|
1600
|
+
fileDiffs: z31.array(DiffContentSchema).optional(),
|
|
1300
1601
|
/** Full template files after execution with status indicators */
|
|
1301
|
-
templateFiles:
|
|
1302
|
-
startedAt:
|
|
1303
|
-
completedAt:
|
|
1602
|
+
templateFiles: z31.array(TemplateFileSchema).optional(),
|
|
1603
|
+
startedAt: z31.string().optional(),
|
|
1604
|
+
completedAt: z31.string().optional(),
|
|
1304
1605
|
llmTrace: LLMTraceSchema.optional(),
|
|
1305
1606
|
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
1306
|
-
conversation:
|
|
1607
|
+
conversation: z31.array(ConversationMessageSchema).optional(),
|
|
1307
1608
|
/** 0-based iteration index when a scenario is run multiple times within a single eval run */
|
|
1308
|
-
iterationIndex:
|
|
1309
|
-
});
|
|
1310
|
-
var PromptResultSchema =
|
|
1311
|
-
text:
|
|
1312
|
-
files:
|
|
1313
|
-
finishReason:
|
|
1314
|
-
reasoning:
|
|
1315
|
-
reasoningDetails:
|
|
1316
|
-
toolCalls:
|
|
1317
|
-
toolResults:
|
|
1318
|
-
warnings:
|
|
1319
|
-
sources:
|
|
1320
|
-
steps:
|
|
1321
|
-
generationTimeMs:
|
|
1322
|
-
prompt:
|
|
1323
|
-
systemPrompt:
|
|
1324
|
-
usage:
|
|
1325
|
-
totalTokens:
|
|
1326
|
-
totalMicrocentsSpent:
|
|
1609
|
+
iterationIndex: z31.number().int().min(0).optional()
|
|
1610
|
+
});
|
|
1611
|
+
var PromptResultSchema = z31.object({
|
|
1612
|
+
text: z31.string(),
|
|
1613
|
+
files: z31.array(z31.unknown()).optional(),
|
|
1614
|
+
finishReason: z31.string().optional(),
|
|
1615
|
+
reasoning: z31.string().optional(),
|
|
1616
|
+
reasoningDetails: z31.unknown().optional(),
|
|
1617
|
+
toolCalls: z31.array(z31.unknown()).optional(),
|
|
1618
|
+
toolResults: z31.array(z31.unknown()).optional(),
|
|
1619
|
+
warnings: z31.array(z31.unknown()).optional(),
|
|
1620
|
+
sources: z31.array(z31.unknown()).optional(),
|
|
1621
|
+
steps: z31.array(z31.unknown()),
|
|
1622
|
+
generationTimeMs: z31.number(),
|
|
1623
|
+
prompt: z31.string(),
|
|
1624
|
+
systemPrompt: z31.string(),
|
|
1625
|
+
usage: z31.object({
|
|
1626
|
+
totalTokens: z31.number().optional(),
|
|
1627
|
+
totalMicrocentsSpent: z31.number().optional()
|
|
1327
1628
|
})
|
|
1328
1629
|
});
|
|
1329
|
-
var EvaluationResultSchema =
|
|
1330
|
-
id:
|
|
1331
|
-
runId:
|
|
1332
|
-
timestamp:
|
|
1630
|
+
var EvaluationResultSchema = z31.object({
|
|
1631
|
+
id: z31.string(),
|
|
1632
|
+
runId: z31.string(),
|
|
1633
|
+
timestamp: z31.number(),
|
|
1333
1634
|
promptResult: PromptResultSchema,
|
|
1334
|
-
testResults:
|
|
1335
|
-
tags:
|
|
1336
|
-
feedback:
|
|
1337
|
-
score:
|
|
1338
|
-
suiteId:
|
|
1339
|
-
});
|
|
1340
|
-
var LeanEvaluationResultSchema =
|
|
1341
|
-
id:
|
|
1342
|
-
runId:
|
|
1343
|
-
timestamp:
|
|
1344
|
-
tags:
|
|
1345
|
-
scenarioId:
|
|
1346
|
-
scenarioVersion:
|
|
1347
|
-
targetId:
|
|
1348
|
-
targetVersion:
|
|
1349
|
-
suiteId:
|
|
1350
|
-
score:
|
|
1351
|
-
time:
|
|
1352
|
-
microcentsSpent:
|
|
1635
|
+
testResults: z31.array(z31.unknown()),
|
|
1636
|
+
tags: z31.array(z31.string()).optional(),
|
|
1637
|
+
feedback: z31.string().optional(),
|
|
1638
|
+
score: z31.number(),
|
|
1639
|
+
suiteId: z31.string().optional()
|
|
1640
|
+
});
|
|
1641
|
+
var LeanEvaluationResultSchema = z31.object({
|
|
1642
|
+
id: z31.string(),
|
|
1643
|
+
runId: z31.string(),
|
|
1644
|
+
timestamp: z31.number(),
|
|
1645
|
+
tags: z31.array(z31.string()).optional(),
|
|
1646
|
+
scenarioId: z31.string(),
|
|
1647
|
+
scenarioVersion: z31.number().optional(),
|
|
1648
|
+
targetId: z31.string(),
|
|
1649
|
+
targetVersion: z31.number().optional(),
|
|
1650
|
+
suiteId: z31.string().optional(),
|
|
1651
|
+
score: z31.number(),
|
|
1652
|
+
time: z31.number().optional(),
|
|
1653
|
+
microcentsSpent: z31.number().optional()
|
|
1353
1654
|
});
|
|
1354
1655
|
|
|
1355
1656
|
// src/evaluation/eval-run-folder.ts
|
|
1356
|
-
import { z as
|
|
1657
|
+
import { z as z32 } from "zod";
|
|
1357
1658
|
var EvalRunFolderSchema = TenantEntitySchema.extend({});
|
|
1358
1659
|
var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
1359
1660
|
id: true,
|
|
@@ -1367,26 +1668,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
|
1367
1668
|
updatedAt: true,
|
|
1368
1669
|
deleted: true
|
|
1369
1670
|
}).partial();
|
|
1370
|
-
var EvalRunFolderMembershipSchema =
|
|
1371
|
-
folderId:
|
|
1372
|
-
evalRunId:
|
|
1373
|
-
projectId:
|
|
1374
|
-
createdAt:
|
|
1671
|
+
var EvalRunFolderMembershipSchema = z32.object({
|
|
1672
|
+
folderId: z32.string(),
|
|
1673
|
+
evalRunId: z32.string(),
|
|
1674
|
+
projectId: z32.string(),
|
|
1675
|
+
createdAt: z32.string()
|
|
1375
1676
|
});
|
|
1376
1677
|
|
|
1377
1678
|
// src/project/project.ts
|
|
1378
|
-
import { z as
|
|
1679
|
+
import { z as z33 } from "zod";
|
|
1379
1680
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1380
|
-
appId:
|
|
1381
|
-
scenarioTags:
|
|
1681
|
+
appId: z33.string().optional().describe("The ID of the app in Dev Center"),
|
|
1682
|
+
scenarioTags: z33.array(z33.string()).optional().describe("Project-level tag vocabulary for scenarios"),
|
|
1382
1683
|
/** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
|
|
1383
|
-
wixAuthToken:
|
|
1684
|
+
wixAuthToken: z33.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
|
|
1384
1685
|
/** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
|
|
1385
|
-
base44AuthFile:
|
|
1686
|
+
base44AuthFile: z33.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
|
|
1386
1687
|
/** Resolved at runtime from the encrypted Wix auth token */
|
|
1387
|
-
wixAuthEmail:
|
|
1688
|
+
wixAuthEmail: z33.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
|
|
1388
1689
|
/** Resolved at runtime from the encrypted Base44 auth file */
|
|
1389
|
-
base44AuthEmail:
|
|
1690
|
+
base44AuthEmail: z33.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
|
|
1390
1691
|
});
|
|
1391
1692
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1392
1693
|
id: true,
|
|
@@ -1412,7 +1713,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
|
1412
1713
|
var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
1413
1714
|
|
|
1414
1715
|
// src/schedule/eval-schedule.ts
|
|
1415
|
-
import { z as
|
|
1716
|
+
import { z as z34 } from "zod";
|
|
1416
1717
|
var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
1417
1718
|
FrequencyType2["DAILY"] = "daily";
|
|
1418
1719
|
FrequencyType2["WEEKDAY"] = "weekday";
|
|
@@ -1422,29 +1723,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
|
1422
1723
|
})(FrequencyType || {});
|
|
1423
1724
|
var EvalScheduleSchema = TenantEntitySchema.extend({
|
|
1424
1725
|
/** Whether the schedule is active */
|
|
1425
|
-
enabled:
|
|
1726
|
+
enabled: z34.boolean(),
|
|
1426
1727
|
/** Test suite to run */
|
|
1427
|
-
suiteId:
|
|
1728
|
+
suiteId: z34.string(),
|
|
1428
1729
|
/** Preset that provides agent + entities for this schedule */
|
|
1429
|
-
presetId:
|
|
1730
|
+
presetId: z34.string(),
|
|
1430
1731
|
/** How often to run */
|
|
1431
|
-
frequencyType:
|
|
1732
|
+
frequencyType: z34.nativeEnum(FrequencyType),
|
|
1432
1733
|
/** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
|
|
1433
|
-
timeOfDay:
|
|
1734
|
+
timeOfDay: z34.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
|
|
1434
1735
|
/** Day of week (0=Sun, 6=Sat) for weekly schedules */
|
|
1435
|
-
dayOfWeek:
|
|
1736
|
+
dayOfWeek: z34.number().min(0).max(6).optional(),
|
|
1436
1737
|
/** Day of month (1-31) for monthly schedules */
|
|
1437
|
-
dayOfMonth:
|
|
1738
|
+
dayOfMonth: z34.number().min(1).max(31).optional(),
|
|
1438
1739
|
/** IANA timezone (e.g., 'America/New_York') */
|
|
1439
|
-
timezone:
|
|
1740
|
+
timezone: z34.string(),
|
|
1440
1741
|
/** ID of the last eval run created by this schedule */
|
|
1441
|
-
lastRunId:
|
|
1742
|
+
lastRunId: z34.string().optional(),
|
|
1442
1743
|
/** Denormalized status of the last run */
|
|
1443
|
-
lastRunStatus:
|
|
1744
|
+
lastRunStatus: z34.string().optional(),
|
|
1444
1745
|
/** ISO timestamp of the last run */
|
|
1445
|
-
lastRunAt:
|
|
1746
|
+
lastRunAt: z34.string().optional(),
|
|
1446
1747
|
/** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
|
|
1447
|
-
nextRunAt:
|
|
1748
|
+
nextRunAt: z34.string().optional()
|
|
1448
1749
|
});
|
|
1449
1750
|
function isValidTimezone(tz) {
|
|
1450
1751
|
try {
|
|
@@ -1457,14 +1758,14 @@ function isValidTimezone(tz) {
|
|
|
1457
1758
|
function validateScheduleFields(data, ctx, options) {
|
|
1458
1759
|
if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
|
|
1459
1760
|
ctx.addIssue({
|
|
1460
|
-
code:
|
|
1761
|
+
code: z34.ZodIssueCode.custom,
|
|
1461
1762
|
message: "dayOfWeek is required for weekly schedules",
|
|
1462
1763
|
path: ["dayOfWeek"]
|
|
1463
1764
|
});
|
|
1464
1765
|
}
|
|
1465
1766
|
if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
|
|
1466
1767
|
ctx.addIssue({
|
|
1467
|
-
code:
|
|
1768
|
+
code: z34.ZodIssueCode.custom,
|
|
1468
1769
|
message: "dayOfMonth is required for monthly schedules",
|
|
1469
1770
|
path: ["dayOfMonth"]
|
|
1470
1771
|
});
|
|
@@ -1472,7 +1773,7 @@ function validateScheduleFields(data, ctx, options) {
|
|
|
1472
1773
|
const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
|
|
1473
1774
|
if (shouldValidateTz && !isValidTimezone(data.timezone)) {
|
|
1474
1775
|
ctx.addIssue({
|
|
1475
|
-
code:
|
|
1776
|
+
code: z34.ZodIssueCode.custom,
|
|
1476
1777
|
message: "Invalid IANA timezone",
|
|
1477
1778
|
path: ["timezone"]
|
|
1478
1779
|
});
|
|
@@ -1495,228 +1796,9 @@ var CreateEvalScheduleInputSchema = BaseCreateScheduleSchema.superRefine((data,
|
|
|
1495
1796
|
var UpdateEvalScheduleInputSchema = BaseCreateScheduleSchema.partial().superRefine((data, ctx) => {
|
|
1496
1797
|
validateScheduleFields(data, ctx, { partial: true });
|
|
1497
1798
|
});
|
|
1498
|
-
|
|
1499
|
-
// src/assertion/system-assertions.ts
|
|
1500
|
-
var SYSTEM_ASSERTION_IDS = {
|
|
1501
|
-
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
1502
|
-
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
1503
|
-
BUILD_PASSED: "system:build_passed",
|
|
1504
|
-
TIME_LIMIT: "system:time_limit",
|
|
1505
|
-
COST: "system:cost",
|
|
1506
|
-
LLM_JUDGE: "system:llm_judge",
|
|
1507
|
-
API_CALL: "system:api_call"
|
|
1508
|
-
};
|
|
1509
|
-
function isSystemAssertionId(id) {
|
|
1510
|
-
return id.startsWith("system:");
|
|
1511
|
-
}
|
|
1512
|
-
var SYSTEM_ASSERTIONS = {
|
|
1513
|
-
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
1514
|
-
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
1515
|
-
name: "Skill Was Called",
|
|
1516
|
-
description: "Check that one or more skills were invoked during the agent run",
|
|
1517
|
-
type: "skill_was_called",
|
|
1518
|
-
parameters: [
|
|
1519
|
-
{
|
|
1520
|
-
name: "skillNames",
|
|
1521
|
-
label: "Skills",
|
|
1522
|
-
type: "string",
|
|
1523
|
-
required: true
|
|
1524
|
-
},
|
|
1525
|
-
{
|
|
1526
|
-
name: "negate",
|
|
1527
|
-
label: "Negate (NOT operator)",
|
|
1528
|
-
type: "boolean",
|
|
1529
|
-
required: false,
|
|
1530
|
-
defaultValue: false
|
|
1531
|
-
}
|
|
1532
|
-
]
|
|
1533
|
-
},
|
|
1534
|
-
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
1535
|
-
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
1536
|
-
name: "Tool Called With Param",
|
|
1537
|
-
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
1538
|
-
type: "tool_called_with_param",
|
|
1539
|
-
parameters: [
|
|
1540
|
-
{
|
|
1541
|
-
name: "toolName",
|
|
1542
|
-
label: "Tool Name",
|
|
1543
|
-
type: "string",
|
|
1544
|
-
required: true
|
|
1545
|
-
},
|
|
1546
|
-
{
|
|
1547
|
-
name: "expectedParams",
|
|
1548
|
-
label: "Expected Parameters (JSON, substring match)",
|
|
1549
|
-
type: "string",
|
|
1550
|
-
required: false
|
|
1551
|
-
},
|
|
1552
|
-
{
|
|
1553
|
-
name: "requireSuccess",
|
|
1554
|
-
label: "Require Successful Call",
|
|
1555
|
-
type: "boolean",
|
|
1556
|
-
required: false,
|
|
1557
|
-
defaultValue: false,
|
|
1558
|
-
advanced: true
|
|
1559
|
-
},
|
|
1560
|
-
{
|
|
1561
|
-
name: "negate",
|
|
1562
|
-
label: "Negate (NOT operator)",
|
|
1563
|
-
type: "boolean",
|
|
1564
|
-
required: false,
|
|
1565
|
-
defaultValue: false
|
|
1566
|
-
}
|
|
1567
|
-
]
|
|
1568
|
-
},
|
|
1569
|
-
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
1570
|
-
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
1571
|
-
name: "Build Passed",
|
|
1572
|
-
description: "Run a build command and verify it exits with expected code",
|
|
1573
|
-
type: "build_passed",
|
|
1574
|
-
parameters: [
|
|
1575
|
-
{
|
|
1576
|
-
name: "command",
|
|
1577
|
-
label: "Build Command",
|
|
1578
|
-
type: "string",
|
|
1579
|
-
required: false,
|
|
1580
|
-
defaultValue: "yarn build"
|
|
1581
|
-
},
|
|
1582
|
-
{
|
|
1583
|
-
name: "expectedExitCode",
|
|
1584
|
-
label: "Expected Exit Code",
|
|
1585
|
-
type: "number",
|
|
1586
|
-
required: false,
|
|
1587
|
-
defaultValue: 0
|
|
1588
|
-
},
|
|
1589
|
-
{
|
|
1590
|
-
name: "maxBuildTime",
|
|
1591
|
-
label: "Max Build Time (ms)",
|
|
1592
|
-
type: "number",
|
|
1593
|
-
required: false,
|
|
1594
|
-
advanced: true
|
|
1595
|
-
},
|
|
1596
|
-
{
|
|
1597
|
-
name: "maxMemory",
|
|
1598
|
-
label: "Max Memory (MB)",
|
|
1599
|
-
type: "number",
|
|
1600
|
-
required: false,
|
|
1601
|
-
advanced: true
|
|
1602
|
-
}
|
|
1603
|
-
]
|
|
1604
|
-
},
|
|
1605
|
-
[SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
|
|
1606
|
-
id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
|
|
1607
|
-
name: "Time Limit",
|
|
1608
|
-
description: "Check that the scenario completed within a maximum duration",
|
|
1609
|
-
type: "time_limit",
|
|
1610
|
-
parameters: [
|
|
1611
|
-
{
|
|
1612
|
-
name: "maxDurationMs",
|
|
1613
|
-
label: "Max Duration (ms)",
|
|
1614
|
-
type: "number",
|
|
1615
|
-
required: true,
|
|
1616
|
-
defaultValue: 3e5
|
|
1617
|
-
}
|
|
1618
|
-
]
|
|
1619
|
-
},
|
|
1620
|
-
[SYSTEM_ASSERTION_IDS.COST]: {
|
|
1621
|
-
id: SYSTEM_ASSERTION_IDS.COST,
|
|
1622
|
-
name: "Cost",
|
|
1623
|
-
description: "Check that the scenario LLM execution cost stays within a USD threshold",
|
|
1624
|
-
type: "cost",
|
|
1625
|
-
parameters: [
|
|
1626
|
-
{
|
|
1627
|
-
name: "maxCostUsd",
|
|
1628
|
-
label: "Max Cost (USD)",
|
|
1629
|
-
type: "number",
|
|
1630
|
-
required: true,
|
|
1631
|
-
defaultValue: 1
|
|
1632
|
-
}
|
|
1633
|
-
]
|
|
1634
|
-
},
|
|
1635
|
-
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
1636
|
-
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
1637
|
-
name: "LLM Judge",
|
|
1638
|
-
description: "LLM evaluates the output and assigns a score (0-10)",
|
|
1639
|
-
type: "llm_judge",
|
|
1640
|
-
parameters: [
|
|
1641
|
-
{
|
|
1642
|
-
name: "prompt",
|
|
1643
|
-
label: "Judge Prompt",
|
|
1644
|
-
type: "string",
|
|
1645
|
-
required: true,
|
|
1646
|
-
defaultValue: "Verify the output meets the acceptance criteria."
|
|
1647
|
-
},
|
|
1648
|
-
{
|
|
1649
|
-
name: "minScore",
|
|
1650
|
-
label: "Minimum Score (0-10)",
|
|
1651
|
-
type: "number",
|
|
1652
|
-
required: false,
|
|
1653
|
-
defaultValue: 7
|
|
1654
|
-
},
|
|
1655
|
-
{
|
|
1656
|
-
name: "model",
|
|
1657
|
-
label: "Model",
|
|
1658
|
-
type: "string",
|
|
1659
|
-
required: false
|
|
1660
|
-
}
|
|
1661
|
-
]
|
|
1662
|
-
},
|
|
1663
|
-
[SYSTEM_ASSERTION_IDS.API_CALL]: {
|
|
1664
|
-
id: SYSTEM_ASSERTION_IDS.API_CALL,
|
|
1665
|
-
name: "API Call",
|
|
1666
|
-
description: "Call an API endpoint and verify the response contains expected data",
|
|
1667
|
-
type: "api_call",
|
|
1668
|
-
parameters: [
|
|
1669
|
-
{
|
|
1670
|
-
name: "url",
|
|
1671
|
-
label: "URL",
|
|
1672
|
-
type: "string",
|
|
1673
|
-
required: true
|
|
1674
|
-
},
|
|
1675
|
-
{
|
|
1676
|
-
name: "method",
|
|
1677
|
-
label: "HTTP Method",
|
|
1678
|
-
type: "string",
|
|
1679
|
-
required: false,
|
|
1680
|
-
defaultValue: "GET"
|
|
1681
|
-
},
|
|
1682
|
-
{
|
|
1683
|
-
name: "requestBody",
|
|
1684
|
-
label: "Request Body (JSON)",
|
|
1685
|
-
type: "string",
|
|
1686
|
-
required: false
|
|
1687
|
-
},
|
|
1688
|
-
{
|
|
1689
|
-
name: "expectedResponse",
|
|
1690
|
-
label: "Expected Response (JSON)",
|
|
1691
|
-
type: "string",
|
|
1692
|
-
required: true
|
|
1693
|
-
},
|
|
1694
|
-
{
|
|
1695
|
-
name: "requestHeaders",
|
|
1696
|
-
label: "Headers (JSON)",
|
|
1697
|
-
type: "string",
|
|
1698
|
-
required: false,
|
|
1699
|
-
advanced: true
|
|
1700
|
-
},
|
|
1701
|
-
{
|
|
1702
|
-
name: "timeoutMs",
|
|
1703
|
-
label: "Timeout (ms)",
|
|
1704
|
-
type: "number",
|
|
1705
|
-
required: false,
|
|
1706
|
-
defaultValue: 3e4,
|
|
1707
|
-
advanced: true
|
|
1708
|
-
}
|
|
1709
|
-
]
|
|
1710
|
-
}
|
|
1711
|
-
};
|
|
1712
|
-
function getSystemAssertions() {
|
|
1713
|
-
return Object.values(SYSTEM_ASSERTIONS);
|
|
1714
|
-
}
|
|
1715
|
-
function getSystemAssertion(id) {
|
|
1716
|
-
return SYSTEM_ASSERTIONS[id];
|
|
1717
|
-
}
|
|
1718
1799
|
export {
|
|
1719
1800
|
AGENT_TYPE_LABELS,
|
|
1801
|
+
ALLOWED_BUILD_COMMANDS,
|
|
1720
1802
|
ALL_AVAILABLE_MODEL_IDS,
|
|
1721
1803
|
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
1722
1804
|
AVAILABLE_OPENAI_MODEL_IDS,
|
|
@@ -1750,6 +1832,7 @@ export {
|
|
|
1750
1832
|
BatchSummarySchema,
|
|
1751
1833
|
BuildCheckTestSchema,
|
|
1752
1834
|
BuildPassedAssertionSchema,
|
|
1835
|
+
BuildPassedCommandStringSchema,
|
|
1753
1836
|
BuildPassedConfigSchema,
|
|
1754
1837
|
BulkImportResultItemSchema,
|
|
1755
1838
|
BulkImportResultSchema,
|
|
@@ -1777,6 +1860,7 @@ export {
|
|
|
1777
1860
|
CreateTemplateInputSchema,
|
|
1778
1861
|
CreateTestScenarioInputSchema,
|
|
1779
1862
|
CreateTestSuiteInputSchema,
|
|
1863
|
+
DEFAULT_BUILD_PASSED_COMMAND,
|
|
1780
1864
|
DEFAULT_EVALUATOR_SYSTEM_PROMPT,
|
|
1781
1865
|
DEFAULT_JUDGE_MODEL,
|
|
1782
1866
|
DiffContentSchema,
|
|
@@ -1874,6 +1958,7 @@ export {
|
|
|
1874
1958
|
ToolTestSchema,
|
|
1875
1959
|
ToolUseBlockSchema,
|
|
1876
1960
|
TriggerMetadataSchema,
|
|
1961
|
+
TriggerPromptImageSchema,
|
|
1877
1962
|
TriggerSchema,
|
|
1878
1963
|
TriggerType,
|
|
1879
1964
|
UpdateAgentInputSchema,
|
|
@@ -1893,11 +1978,14 @@ export {
|
|
|
1893
1978
|
formatTraceEventLine,
|
|
1894
1979
|
getSystemAssertion,
|
|
1895
1980
|
getSystemAssertions,
|
|
1981
|
+
isAllowedBuildCommandString,
|
|
1896
1982
|
isSystemAssertionId,
|
|
1897
1983
|
isValidSkillFolderName,
|
|
1898
1984
|
normalizeBatchAssertionLink,
|
|
1899
1985
|
normalizeModelId,
|
|
1986
|
+
parseBuildCommandToArgv,
|
|
1900
1987
|
parseTraceEventLine,
|
|
1901
|
-
validateAssertionConfig
|
|
1988
|
+
validateAssertionConfig,
|
|
1989
|
+
validateBuildPassedParamsInAssertionLinks
|
|
1902
1990
|
};
|
|
1903
1991
|
//# sourceMappingURL=index.mjs.map
|