@wix/evalforge-types 0.72.0 → 0.74.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/index.js +711 -625
- package/build/index.js.map +4 -4
- package/build/index.mjs +704 -624
- package/build/index.mjs.map +4 -4
- package/build/types/assertion/assertion.d.ts +26 -6
- package/build/types/assertion/build-passed-command.d.ts +25 -0
- package/build/types/assertion/index.d.ts +1 -0
- package/build/types/project/project.d.ts +5 -2
- package/build/types/scenario/test-scenario.d.ts +22 -3
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -584,11 +584,42 @@ var EnvironmentSchema = z21.object({
|
|
|
584
584
|
});
|
|
585
585
|
|
|
586
586
|
// src/scenario/test-scenario.ts
|
|
587
|
-
import { z as
|
|
587
|
+
import { z as z24 } from "zod";
|
|
588
588
|
|
|
589
589
|
// src/assertion/assertion.ts
|
|
590
|
+
import { z as z23 } from "zod";
|
|
591
|
+
|
|
592
|
+
// src/assertion/build-passed-command.ts
|
|
590
593
|
import { z as z22 } from "zod";
|
|
591
|
-
var
|
|
594
|
+
var ALLOWED_BUILD_COMMANDS = [
|
|
595
|
+
"yarn build",
|
|
596
|
+
"npm run build",
|
|
597
|
+
"pnpm run build",
|
|
598
|
+
"pnpm build"
|
|
599
|
+
];
|
|
600
|
+
var DEFAULT_BUILD_PASSED_COMMAND = "yarn build";
|
|
601
|
+
var BUILD_COMMAND_ARGV = {
|
|
602
|
+
"yarn build": ["yarn", "build"],
|
|
603
|
+
"npm run build": ["npm", "run", "build"],
|
|
604
|
+
"pnpm run build": ["pnpm", "run", "build"],
|
|
605
|
+
"pnpm build": ["pnpm", "build"]
|
|
606
|
+
};
|
|
607
|
+
function isAllowedBuildCommandString(command) {
|
|
608
|
+
const trimmed = command.trim();
|
|
609
|
+
return ALLOWED_BUILD_COMMANDS.includes(trimmed);
|
|
610
|
+
}
|
|
611
|
+
function parseBuildCommandToArgv(command) {
|
|
612
|
+
const trimmed = command.trim();
|
|
613
|
+
if (!(trimmed in BUILD_COMMAND_ARGV)) {
|
|
614
|
+
return null;
|
|
615
|
+
}
|
|
616
|
+
return BUILD_COMMAND_ARGV[trimmed];
|
|
617
|
+
}
|
|
618
|
+
var enumTuple = ALLOWED_BUILD_COMMANDS;
|
|
619
|
+
var BuildPassedCommandStringSchema = z22.enum(enumTuple);
|
|
620
|
+
|
|
621
|
+
// src/assertion/assertion.ts
|
|
622
|
+
var AssertionTypeSchema = z23.enum([
|
|
592
623
|
"skill_was_called",
|
|
593
624
|
"tool_called_with_param",
|
|
594
625
|
"build_passed",
|
|
@@ -597,61 +628,61 @@ var AssertionTypeSchema = z22.enum([
|
|
|
597
628
|
"llm_judge",
|
|
598
629
|
"api_call"
|
|
599
630
|
]);
|
|
600
|
-
var AssertionParameterTypeSchema =
|
|
631
|
+
var AssertionParameterTypeSchema = z23.enum([
|
|
601
632
|
"string",
|
|
602
633
|
"number",
|
|
603
634
|
"boolean"
|
|
604
635
|
]);
|
|
605
|
-
var AssertionParameterSchema =
|
|
636
|
+
var AssertionParameterSchema = z23.object({
|
|
606
637
|
/** Parameter name (used as key in params object) */
|
|
607
|
-
name:
|
|
638
|
+
name: z23.string().min(1),
|
|
608
639
|
/** Display label for the parameter */
|
|
609
|
-
label:
|
|
640
|
+
label: z23.string().min(1),
|
|
610
641
|
/** Parameter type */
|
|
611
642
|
type: AssertionParameterTypeSchema,
|
|
612
643
|
/** Whether this parameter is required */
|
|
613
|
-
required:
|
|
644
|
+
required: z23.boolean(),
|
|
614
645
|
/** Default value (optional, used when not provided) */
|
|
615
|
-
defaultValue:
|
|
646
|
+
defaultValue: z23.union([z23.string(), z23.number(), z23.boolean()]).optional(),
|
|
616
647
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
617
|
-
advanced:
|
|
648
|
+
advanced: z23.boolean().optional()
|
|
618
649
|
});
|
|
619
|
-
var ScenarioAssertionLinkSchema =
|
|
650
|
+
var ScenarioAssertionLinkSchema = z23.object({
|
|
620
651
|
/** ID of the system assertion (e.g., 'system:skill_was_called') */
|
|
621
|
-
assertionId:
|
|
652
|
+
assertionId: z23.string(),
|
|
622
653
|
/** Parameter values for this assertion in this scenario */
|
|
623
|
-
params:
|
|
624
|
-
|
|
625
|
-
|
|
654
|
+
params: z23.record(
|
|
655
|
+
z23.string(),
|
|
656
|
+
z23.union([z23.string(), z23.number(), z23.boolean(), z23.null()])
|
|
626
657
|
).optional()
|
|
627
658
|
});
|
|
628
|
-
var SkillWasCalledConfigSchema =
|
|
659
|
+
var SkillWasCalledConfigSchema = z23.object({
|
|
629
660
|
/** Names of the skills that must have been called */
|
|
630
|
-
skillNames:
|
|
661
|
+
skillNames: z23.array(z23.string().min(1)).min(1)
|
|
631
662
|
});
|
|
632
|
-
var CostConfigSchema =
|
|
663
|
+
var CostConfigSchema = z23.strictObject({
|
|
633
664
|
/** Maximum allowed cost in USD */
|
|
634
|
-
maxCostUsd:
|
|
665
|
+
maxCostUsd: z23.number().positive()
|
|
635
666
|
});
|
|
636
|
-
var ToolCalledWithParamConfigSchema =
|
|
667
|
+
var ToolCalledWithParamConfigSchema = z23.strictObject({
|
|
637
668
|
/** Name of the tool that must have been called */
|
|
638
|
-
toolName:
|
|
669
|
+
toolName: z23.string().min(1),
|
|
639
670
|
/** JSON string of key-value pairs for expected parameters (substring match). Optional — when omitted, only checks tool presence. */
|
|
640
|
-
expectedParams:
|
|
671
|
+
expectedParams: z23.string().min(1).optional(),
|
|
641
672
|
/** If true, the matching tool call must also have succeeded (step.success === true) */
|
|
642
|
-
requireSuccess:
|
|
673
|
+
requireSuccess: z23.boolean().optional()
|
|
643
674
|
});
|
|
644
|
-
var BuildPassedConfigSchema =
|
|
645
|
-
/**
|
|
646
|
-
command:
|
|
675
|
+
var BuildPassedConfigSchema = z23.strictObject({
|
|
676
|
+
/** Allowlisted command only (default at runtime: "yarn build") */
|
|
677
|
+
command: BuildPassedCommandStringSchema.optional(),
|
|
647
678
|
/** Expected exit code (default: 0) */
|
|
648
|
-
expectedExitCode:
|
|
679
|
+
expectedExitCode: z23.number().int().optional()
|
|
649
680
|
});
|
|
650
|
-
var TimeConfigSchema =
|
|
681
|
+
var TimeConfigSchema = z23.strictObject({
|
|
651
682
|
/** Maximum allowed duration in milliseconds */
|
|
652
|
-
maxDurationMs:
|
|
683
|
+
maxDurationMs: z23.number().int().positive()
|
|
653
684
|
});
|
|
654
|
-
var LlmJudgeConfigSchema =
|
|
685
|
+
var LlmJudgeConfigSchema = z23.object({
|
|
655
686
|
/**
|
|
656
687
|
* Prompt template with placeholders:
|
|
657
688
|
* - {{output}}: agent's final output
|
|
@@ -662,65 +693,65 @@ var LlmJudgeConfigSchema = z22.object({
|
|
|
662
693
|
* - {{trace}}: step-by-step trace of tool calls
|
|
663
694
|
* - Custom parameters defined in the parameters array
|
|
664
695
|
*/
|
|
665
|
-
prompt:
|
|
696
|
+
prompt: z23.string().min(1),
|
|
666
697
|
/** Minimum score to pass (0-10, default 7) */
|
|
667
|
-
minScore:
|
|
698
|
+
minScore: z23.number().int().min(0).max(10).optional(),
|
|
668
699
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
669
|
-
model:
|
|
700
|
+
model: z23.string().optional(),
|
|
670
701
|
/** Max output tokens */
|
|
671
|
-
maxTokens:
|
|
702
|
+
maxTokens: z23.number().int().optional(),
|
|
672
703
|
/** Temperature (0-1) */
|
|
673
|
-
temperature:
|
|
704
|
+
temperature: z23.number().min(0).max(1).optional(),
|
|
674
705
|
/** User-defined parameters for this assertion */
|
|
675
|
-
parameters:
|
|
706
|
+
parameters: z23.array(AssertionParameterSchema).optional()
|
|
676
707
|
});
|
|
677
|
-
var ApiCallConfigSchema =
|
|
708
|
+
var ApiCallConfigSchema = z23.strictObject({
|
|
678
709
|
/** URL to call */
|
|
679
|
-
url:
|
|
710
|
+
url: z23.string().min(1),
|
|
680
711
|
/** HTTP method (default GET) */
|
|
681
|
-
method:
|
|
712
|
+
method: z23.enum(["GET", "POST"]).optional(),
|
|
682
713
|
/** Request body (JSON string, for POST requests) */
|
|
683
|
-
requestBody:
|
|
714
|
+
requestBody: z23.string().optional(),
|
|
684
715
|
/** Expected JSON response to validate against (subset match — extra fields in actual are OK) */
|
|
685
|
-
expectedResponse:
|
|
716
|
+
expectedResponse: z23.string().min(1),
|
|
686
717
|
/** Request headers as JSON string of key-value pairs */
|
|
687
|
-
requestHeaders:
|
|
718
|
+
requestHeaders: z23.string().optional(),
|
|
688
719
|
/** Request timeout in milliseconds (default 30000) */
|
|
689
|
-
timeoutMs:
|
|
720
|
+
timeoutMs: z23.number().int().positive().optional()
|
|
690
721
|
});
|
|
691
722
|
var AssertionBaseFields = {
|
|
692
723
|
/** When true, the assertion's pass/fail logic is inverted (NOT operator). */
|
|
693
|
-
negate:
|
|
724
|
+
negate: z23.boolean().optional()
|
|
694
725
|
};
|
|
695
726
|
var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
|
|
696
|
-
type:
|
|
727
|
+
type: z23.literal("skill_was_called"),
|
|
697
728
|
...AssertionBaseFields
|
|
698
729
|
});
|
|
699
730
|
var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
|
|
700
|
-
type:
|
|
731
|
+
type: z23.literal("tool_called_with_param"),
|
|
701
732
|
...AssertionBaseFields
|
|
702
733
|
});
|
|
703
734
|
var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
|
|
704
|
-
type:
|
|
735
|
+
type: z23.literal("build_passed"),
|
|
705
736
|
...AssertionBaseFields
|
|
706
737
|
});
|
|
707
738
|
var CostAssertionSchema = CostConfigSchema.extend({
|
|
708
|
-
type:
|
|
739
|
+
type: z23.literal("cost"),
|
|
709
740
|
...AssertionBaseFields
|
|
710
741
|
});
|
|
711
742
|
var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
|
|
712
|
-
type:
|
|
743
|
+
type: z23.literal("llm_judge"),
|
|
713
744
|
...AssertionBaseFields
|
|
714
745
|
});
|
|
715
746
|
var ApiCallAssertionSchema = ApiCallConfigSchema.extend({
|
|
716
|
-
type:
|
|
747
|
+
type: z23.literal("api_call"),
|
|
717
748
|
...AssertionBaseFields
|
|
718
749
|
});
|
|
719
750
|
var TimeAssertionSchema = TimeConfigSchema.extend({
|
|
720
|
-
type:
|
|
751
|
+
type: z23.literal("time_limit"),
|
|
721
752
|
...AssertionBaseFields
|
|
722
753
|
});
|
|
723
|
-
var AssertionSchema =
|
|
754
|
+
var AssertionSchema = z23.union([
|
|
724
755
|
SkillWasCalledAssertionSchema,
|
|
725
756
|
ToolCalledWithParamAssertionSchema,
|
|
726
757
|
BuildPassedAssertionSchema,
|
|
@@ -729,7 +760,7 @@ var AssertionSchema = z22.union([
|
|
|
729
760
|
LlmJudgeAssertionSchema,
|
|
730
761
|
ApiCallAssertionSchema
|
|
731
762
|
]);
|
|
732
|
-
var AssertionConfigSchema =
|
|
763
|
+
var AssertionConfigSchema = z23.union([
|
|
733
764
|
LlmJudgeConfigSchema,
|
|
734
765
|
// requires prompt - check first
|
|
735
766
|
SkillWasCalledConfigSchema,
|
|
@@ -744,7 +775,7 @@ var AssertionConfigSchema = z22.union([
|
|
|
744
775
|
// requires maxCostUsd, uses strictObject
|
|
745
776
|
BuildPassedConfigSchema,
|
|
746
777
|
// all optional, uses strictObject to reject unknown keys
|
|
747
|
-
|
|
778
|
+
z23.object({})
|
|
748
779
|
// fallback empty config
|
|
749
780
|
]);
|
|
750
781
|
function validateAssertionConfig(type, config) {
|
|
@@ -768,63 +799,322 @@ function validateAssertionConfig(type, config) {
|
|
|
768
799
|
}
|
|
769
800
|
}
|
|
770
801
|
|
|
802
|
+
// src/assertion/system-assertions.ts
|
|
803
|
+
var SYSTEM_ASSERTION_IDS = {
|
|
804
|
+
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
805
|
+
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
806
|
+
BUILD_PASSED: "system:build_passed",
|
|
807
|
+
TIME_LIMIT: "system:time_limit",
|
|
808
|
+
COST: "system:cost",
|
|
809
|
+
LLM_JUDGE: "system:llm_judge",
|
|
810
|
+
API_CALL: "system:api_call"
|
|
811
|
+
};
|
|
812
|
+
function isSystemAssertionId(id) {
|
|
813
|
+
return id.startsWith("system:");
|
|
814
|
+
}
|
|
815
|
+
var SYSTEM_ASSERTIONS = {
|
|
816
|
+
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
817
|
+
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
818
|
+
name: "Skill Was Called",
|
|
819
|
+
description: "Check that one or more skills were invoked during the agent run",
|
|
820
|
+
type: "skill_was_called",
|
|
821
|
+
parameters: [
|
|
822
|
+
{
|
|
823
|
+
name: "skillNames",
|
|
824
|
+
label: "Skills",
|
|
825
|
+
type: "string",
|
|
826
|
+
required: true
|
|
827
|
+
},
|
|
828
|
+
{
|
|
829
|
+
name: "negate",
|
|
830
|
+
label: "Negate (NOT operator)",
|
|
831
|
+
type: "boolean",
|
|
832
|
+
required: false,
|
|
833
|
+
defaultValue: false
|
|
834
|
+
}
|
|
835
|
+
]
|
|
836
|
+
},
|
|
837
|
+
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
838
|
+
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
839
|
+
name: "Tool Called With Param",
|
|
840
|
+
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
841
|
+
type: "tool_called_with_param",
|
|
842
|
+
parameters: [
|
|
843
|
+
{
|
|
844
|
+
name: "toolName",
|
|
845
|
+
label: "Tool Name",
|
|
846
|
+
type: "string",
|
|
847
|
+
required: true
|
|
848
|
+
},
|
|
849
|
+
{
|
|
850
|
+
name: "expectedParams",
|
|
851
|
+
label: "Expected Parameters (JSON, substring match)",
|
|
852
|
+
type: "string",
|
|
853
|
+
required: false
|
|
854
|
+
},
|
|
855
|
+
{
|
|
856
|
+
name: "requireSuccess",
|
|
857
|
+
label: "Require Successful Call",
|
|
858
|
+
type: "boolean",
|
|
859
|
+
required: false,
|
|
860
|
+
defaultValue: false,
|
|
861
|
+
advanced: true
|
|
862
|
+
},
|
|
863
|
+
{
|
|
864
|
+
name: "negate",
|
|
865
|
+
label: "Negate (NOT operator)",
|
|
866
|
+
type: "boolean",
|
|
867
|
+
required: false,
|
|
868
|
+
defaultValue: false
|
|
869
|
+
}
|
|
870
|
+
]
|
|
871
|
+
},
|
|
872
|
+
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
873
|
+
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
874
|
+
name: "Build Passed",
|
|
875
|
+
description: "Run a build command and verify it exits with expected code",
|
|
876
|
+
type: "build_passed",
|
|
877
|
+
parameters: [
|
|
878
|
+
{
|
|
879
|
+
name: "command",
|
|
880
|
+
label: "Build Command",
|
|
881
|
+
type: "string",
|
|
882
|
+
required: false,
|
|
883
|
+
defaultValue: "yarn build"
|
|
884
|
+
},
|
|
885
|
+
{
|
|
886
|
+
name: "expectedExitCode",
|
|
887
|
+
label: "Expected Exit Code",
|
|
888
|
+
type: "number",
|
|
889
|
+
required: false,
|
|
890
|
+
defaultValue: 0
|
|
891
|
+
},
|
|
892
|
+
{
|
|
893
|
+
name: "maxBuildTime",
|
|
894
|
+
label: "Max Build Time (ms)",
|
|
895
|
+
type: "number",
|
|
896
|
+
required: false,
|
|
897
|
+
advanced: true
|
|
898
|
+
},
|
|
899
|
+
{
|
|
900
|
+
name: "maxMemory",
|
|
901
|
+
label: "Max Memory (MB)",
|
|
902
|
+
type: "number",
|
|
903
|
+
required: false,
|
|
904
|
+
advanced: true
|
|
905
|
+
}
|
|
906
|
+
]
|
|
907
|
+
},
|
|
908
|
+
[SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
|
|
909
|
+
id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
|
|
910
|
+
name: "Time Limit",
|
|
911
|
+
description: "Check that the scenario completed within a maximum duration",
|
|
912
|
+
type: "time_limit",
|
|
913
|
+
parameters: [
|
|
914
|
+
{
|
|
915
|
+
name: "maxDurationMs",
|
|
916
|
+
label: "Max Duration (ms)",
|
|
917
|
+
type: "number",
|
|
918
|
+
required: true,
|
|
919
|
+
defaultValue: 3e5
|
|
920
|
+
}
|
|
921
|
+
]
|
|
922
|
+
},
|
|
923
|
+
[SYSTEM_ASSERTION_IDS.COST]: {
|
|
924
|
+
id: SYSTEM_ASSERTION_IDS.COST,
|
|
925
|
+
name: "Cost",
|
|
926
|
+
description: "Check that the scenario LLM execution cost stays within a USD threshold",
|
|
927
|
+
type: "cost",
|
|
928
|
+
parameters: [
|
|
929
|
+
{
|
|
930
|
+
name: "maxCostUsd",
|
|
931
|
+
label: "Max Cost (USD)",
|
|
932
|
+
type: "number",
|
|
933
|
+
required: true,
|
|
934
|
+
defaultValue: 1
|
|
935
|
+
}
|
|
936
|
+
]
|
|
937
|
+
},
|
|
938
|
+
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
939
|
+
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
940
|
+
name: "LLM Judge",
|
|
941
|
+
description: "LLM evaluates the output and assigns a score (0-10)",
|
|
942
|
+
type: "llm_judge",
|
|
943
|
+
parameters: [
|
|
944
|
+
{
|
|
945
|
+
name: "prompt",
|
|
946
|
+
label: "Judge Prompt",
|
|
947
|
+
type: "string",
|
|
948
|
+
required: true,
|
|
949
|
+
defaultValue: "Verify the output meets the acceptance criteria."
|
|
950
|
+
},
|
|
951
|
+
{
|
|
952
|
+
name: "minScore",
|
|
953
|
+
label: "Minimum Score (0-10)",
|
|
954
|
+
type: "number",
|
|
955
|
+
required: false,
|
|
956
|
+
defaultValue: 7
|
|
957
|
+
},
|
|
958
|
+
{
|
|
959
|
+
name: "model",
|
|
960
|
+
label: "Model",
|
|
961
|
+
type: "string",
|
|
962
|
+
required: false
|
|
963
|
+
}
|
|
964
|
+
]
|
|
965
|
+
},
|
|
966
|
+
[SYSTEM_ASSERTION_IDS.API_CALL]: {
|
|
967
|
+
id: SYSTEM_ASSERTION_IDS.API_CALL,
|
|
968
|
+
name: "API Call",
|
|
969
|
+
description: "Call an API endpoint and verify the response contains expected data",
|
|
970
|
+
type: "api_call",
|
|
971
|
+
parameters: [
|
|
972
|
+
{
|
|
973
|
+
name: "url",
|
|
974
|
+
label: "URL",
|
|
975
|
+
type: "string",
|
|
976
|
+
required: true
|
|
977
|
+
},
|
|
978
|
+
{
|
|
979
|
+
name: "method",
|
|
980
|
+
label: "HTTP Method",
|
|
981
|
+
type: "string",
|
|
982
|
+
required: false,
|
|
983
|
+
defaultValue: "GET"
|
|
984
|
+
},
|
|
985
|
+
{
|
|
986
|
+
name: "requestBody",
|
|
987
|
+
label: "Request Body (JSON)",
|
|
988
|
+
type: "string",
|
|
989
|
+
required: false
|
|
990
|
+
},
|
|
991
|
+
{
|
|
992
|
+
name: "expectedResponse",
|
|
993
|
+
label: "Expected Response (JSON)",
|
|
994
|
+
type: "string",
|
|
995
|
+
required: true
|
|
996
|
+
},
|
|
997
|
+
{
|
|
998
|
+
name: "requestHeaders",
|
|
999
|
+
label: "Headers (JSON)",
|
|
1000
|
+
type: "string",
|
|
1001
|
+
required: false,
|
|
1002
|
+
advanced: true
|
|
1003
|
+
},
|
|
1004
|
+
{
|
|
1005
|
+
name: "timeoutMs",
|
|
1006
|
+
label: "Timeout (ms)",
|
|
1007
|
+
type: "number",
|
|
1008
|
+
required: false,
|
|
1009
|
+
defaultValue: 3e4,
|
|
1010
|
+
advanced: true
|
|
1011
|
+
}
|
|
1012
|
+
]
|
|
1013
|
+
}
|
|
1014
|
+
};
|
|
1015
|
+
function getSystemAssertions() {
|
|
1016
|
+
return Object.values(SYSTEM_ASSERTIONS);
|
|
1017
|
+
}
|
|
1018
|
+
function getSystemAssertion(id) {
|
|
1019
|
+
return SYSTEM_ASSERTIONS[id];
|
|
1020
|
+
}
|
|
1021
|
+
|
|
771
1022
|
// src/scenario/test-scenario.ts
|
|
772
1023
|
var MAX_IMAGE_BASE64_LENGTH = 4 * Math.ceil(2 * 1024 * 1024 / 3);
|
|
773
|
-
var TriggerPromptImageSchema =
|
|
1024
|
+
var TriggerPromptImageSchema = z24.object({
|
|
774
1025
|
/** Base64-encoded image data (no data URL prefix) */
|
|
775
|
-
base64:
|
|
1026
|
+
base64: z24.string().max(MAX_IMAGE_BASE64_LENGTH, "Image exceeds 2 MB size limit"),
|
|
776
1027
|
/** MIME type of the image */
|
|
777
|
-
mediaType:
|
|
1028
|
+
mediaType: z24.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
|
|
778
1029
|
/** Original filename of the image */
|
|
779
|
-
name:
|
|
1030
|
+
name: z24.string()
|
|
780
1031
|
});
|
|
781
|
-
var ExpectedFileSchema =
|
|
1032
|
+
var ExpectedFileSchema = z24.object({
|
|
782
1033
|
/** Relative path where the file should be created */
|
|
783
|
-
path:
|
|
1034
|
+
path: z24.string(),
|
|
784
1035
|
/** Optional expected content */
|
|
785
|
-
content:
|
|
1036
|
+
content: z24.string().optional()
|
|
786
1037
|
});
|
|
787
1038
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
788
1039
|
/** The prompt sent to the agent to trigger the task */
|
|
789
|
-
triggerPrompt:
|
|
1040
|
+
triggerPrompt: z24.string().min(10),
|
|
790
1041
|
/** ID of the template to use for this scenario (null = no template) */
|
|
791
|
-
templateId:
|
|
1042
|
+
templateId: z24.string().nullish(),
|
|
792
1043
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
793
|
-
assertions:
|
|
1044
|
+
assertions: z24.array(AssertionSchema).optional(),
|
|
794
1045
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
795
|
-
assertionIds:
|
|
1046
|
+
assertionIds: z24.array(z24.string()).optional(),
|
|
796
1047
|
/** Linked assertions with per-scenario parameter values */
|
|
797
|
-
assertionLinks:
|
|
1048
|
+
assertionLinks: z24.array(ScenarioAssertionLinkSchema).optional(),
|
|
798
1049
|
/** Tags for categorisation and filtering */
|
|
799
|
-
tags:
|
|
1050
|
+
tags: z24.array(z24.string()).optional(),
|
|
800
1051
|
/** Base64-encoded images attached to the trigger prompt (max 3) */
|
|
801
|
-
triggerPromptImages:
|
|
802
|
-
});
|
|
803
|
-
|
|
1052
|
+
triggerPromptImages: z24.array(TriggerPromptImageSchema).max(3).optional()
|
|
1053
|
+
});
|
|
1054
|
+
function validateBuildPassedParamsInAssertionLinks(links, ctx) {
|
|
1055
|
+
if (!links) return;
|
|
1056
|
+
for (let i = 0; i < links.length; i++) {
|
|
1057
|
+
const link = links[i];
|
|
1058
|
+
if (link.assertionId !== SYSTEM_ASSERTION_IDS.BUILD_PASSED) continue;
|
|
1059
|
+
const cmd = link.params?.command;
|
|
1060
|
+
if (cmd === void 0 || cmd === null) continue;
|
|
1061
|
+
if (typeof cmd !== "string") {
|
|
1062
|
+
ctx.addIssue({
|
|
1063
|
+
code: z24.ZodIssueCode.custom,
|
|
1064
|
+
message: "build_passed command must be a string",
|
|
1065
|
+
path: ["assertionLinks", i, "params", "command"]
|
|
1066
|
+
});
|
|
1067
|
+
continue;
|
|
1068
|
+
}
|
|
1069
|
+
if (!isAllowedBuildCommandString(cmd)) {
|
|
1070
|
+
ctx.addIssue({
|
|
1071
|
+
code: z24.ZodIssueCode.custom,
|
|
1072
|
+
message: "Invalid build_passed command. Allowed: yarn build, npm run build, pnpm run build, pnpm build",
|
|
1073
|
+
path: ["assertionLinks", i, "params", "command"]
|
|
1074
|
+
});
|
|
1075
|
+
}
|
|
1076
|
+
}
|
|
1077
|
+
}
|
|
1078
|
+
var TestScenarioCreateBaseSchema = TestScenarioSchema.omit({
|
|
804
1079
|
id: true,
|
|
805
1080
|
createdAt: true,
|
|
806
1081
|
updatedAt: true,
|
|
807
1082
|
deleted: true
|
|
808
1083
|
});
|
|
809
|
-
var
|
|
1084
|
+
var CreateTestScenarioInputSchema = TestScenarioCreateBaseSchema.superRefine((data, ctx) => {
|
|
1085
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1086
|
+
});
|
|
1087
|
+
var UpdateTestScenarioInputSchema = TestScenarioCreateBaseSchema.partial().superRefine((data, ctx) => {
|
|
1088
|
+
if (data.assertionLinks !== void 0) {
|
|
1089
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1090
|
+
}
|
|
1091
|
+
});
|
|
810
1092
|
|
|
811
1093
|
// src/scenario/batch-import.ts
|
|
812
|
-
import { z as
|
|
1094
|
+
import { z as z25 } from "zod";
|
|
813
1095
|
var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
|
814
|
-
var BatchAssertionLinkSchema =
|
|
815
|
-
|
|
1096
|
+
var BatchAssertionLinkSchema = z25.union([
|
|
1097
|
+
z25.string().min(1),
|
|
816
1098
|
ScenarioAssertionLinkSchema
|
|
817
1099
|
]);
|
|
818
|
-
var BatchScenarioEntrySchema =
|
|
819
|
-
name:
|
|
820
|
-
description:
|
|
821
|
-
triggerPrompt:
|
|
822
|
-
templateId:
|
|
823
|
-
tags:
|
|
824
|
-
assertionLinks:
|
|
1100
|
+
var BatchScenarioEntrySchema = z25.object({
|
|
1101
|
+
name: z25.string().min(1, "name: Required"),
|
|
1102
|
+
description: z25.string().optional().default(""),
|
|
1103
|
+
triggerPrompt: z25.string().min(10, "triggerPrompt: Must be at least 10 characters"),
|
|
1104
|
+
templateId: z25.string().nullish(),
|
|
1105
|
+
tags: z25.array(z25.string()).optional(),
|
|
1106
|
+
assertionLinks: z25.array(BatchAssertionLinkSchema).optional()
|
|
1107
|
+
}).superRefine((data, ctx) => {
|
|
1108
|
+
if (!data.assertionLinks) return;
|
|
1109
|
+
const objectLinks = data.assertionLinks.filter(
|
|
1110
|
+
(link) => typeof link !== "string"
|
|
1111
|
+
);
|
|
1112
|
+
if (objectLinks.length > 0) {
|
|
1113
|
+
validateBuildPassedParamsInAssertionLinks(objectLinks, ctx);
|
|
1114
|
+
}
|
|
825
1115
|
});
|
|
826
|
-
var BatchImportPayloadSchema =
|
|
827
|
-
scenarios:
|
|
1116
|
+
var BatchImportPayloadSchema = z25.object({
|
|
1117
|
+
scenarios: z25.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
|
|
828
1118
|
});
|
|
829
1119
|
var BATCH_IMPORT_LIMITS = {
|
|
830
1120
|
MAX_SCENARIOS: 100,
|
|
@@ -846,29 +1136,29 @@ function normalizeBatchAssertionLink(link) {
|
|
|
846
1136
|
}
|
|
847
1137
|
return link;
|
|
848
1138
|
}
|
|
849
|
-
var BatchResultItemSchema =
|
|
850
|
-
index:
|
|
851
|
-
name:
|
|
852
|
-
status:
|
|
853
|
-
id:
|
|
854
|
-
errors:
|
|
855
|
-
});
|
|
856
|
-
var BatchSummarySchema =
|
|
857
|
-
total:
|
|
858
|
-
valid:
|
|
859
|
-
invalid:
|
|
860
|
-
created:
|
|
861
|
-
});
|
|
862
|
-
var BatchImportResponseSchema =
|
|
1139
|
+
var BatchResultItemSchema = z25.object({
|
|
1140
|
+
index: z25.number(),
|
|
1141
|
+
name: z25.string(),
|
|
1142
|
+
status: z25.enum(["valid", "invalid"]),
|
|
1143
|
+
id: z25.string().nullable().optional(),
|
|
1144
|
+
errors: z25.array(z25.string()).optional()
|
|
1145
|
+
});
|
|
1146
|
+
var BatchSummarySchema = z25.object({
|
|
1147
|
+
total: z25.number(),
|
|
1148
|
+
valid: z25.number(),
|
|
1149
|
+
invalid: z25.number(),
|
|
1150
|
+
created: z25.number()
|
|
1151
|
+
});
|
|
1152
|
+
var BatchImportResponseSchema = z25.object({
|
|
863
1153
|
summary: BatchSummarySchema,
|
|
864
|
-
results:
|
|
1154
|
+
results: z25.array(BatchResultItemSchema)
|
|
865
1155
|
});
|
|
866
1156
|
|
|
867
1157
|
// src/suite/test-suite.ts
|
|
868
|
-
import { z as
|
|
1158
|
+
import { z as z26 } from "zod";
|
|
869
1159
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
870
1160
|
/** IDs of test scenarios in this suite */
|
|
871
|
-
scenarioIds:
|
|
1161
|
+
scenarioIds: z26.array(z26.string())
|
|
872
1162
|
});
|
|
873
1163
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
874
1164
|
id: true,
|
|
@@ -879,21 +1169,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
879
1169
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
880
1170
|
|
|
881
1171
|
// src/evaluation/metrics.ts
|
|
882
|
-
import { z as
|
|
883
|
-
var TokenUsageSchema =
|
|
884
|
-
prompt:
|
|
885
|
-
completion:
|
|
886
|
-
total:
|
|
887
|
-
});
|
|
888
|
-
var EvalMetricsSchema =
|
|
889
|
-
totalAssertions:
|
|
890
|
-
passed:
|
|
891
|
-
failed:
|
|
892
|
-
skipped:
|
|
893
|
-
errors:
|
|
894
|
-
passRate:
|
|
895
|
-
avgDuration:
|
|
896
|
-
totalDuration:
|
|
1172
|
+
import { z as z27 } from "zod";
|
|
1173
|
+
var TokenUsageSchema = z27.object({
|
|
1174
|
+
prompt: z27.number(),
|
|
1175
|
+
completion: z27.number(),
|
|
1176
|
+
total: z27.number()
|
|
1177
|
+
});
|
|
1178
|
+
var EvalMetricsSchema = z27.object({
|
|
1179
|
+
totalAssertions: z27.number(),
|
|
1180
|
+
passed: z27.number(),
|
|
1181
|
+
failed: z27.number(),
|
|
1182
|
+
skipped: z27.number(),
|
|
1183
|
+
errors: z27.number(),
|
|
1184
|
+
passRate: z27.number(),
|
|
1185
|
+
avgDuration: z27.number(),
|
|
1186
|
+
totalDuration: z27.number()
|
|
897
1187
|
});
|
|
898
1188
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
899
1189
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -903,7 +1193,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
903
1193
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
904
1194
|
return EvalStatus2;
|
|
905
1195
|
})(EvalStatus || {});
|
|
906
|
-
var EvalStatusSchema =
|
|
1196
|
+
var EvalStatusSchema = z27.enum(EvalStatus);
|
|
907
1197
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
908
1198
|
LLMStepType2["COMPLETION"] = "completion";
|
|
909
1199
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -911,54 +1201,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
911
1201
|
LLMStepType2["THINKING"] = "thinking";
|
|
912
1202
|
return LLMStepType2;
|
|
913
1203
|
})(LLMStepType || {});
|
|
914
|
-
var LLMTraceStepSchema =
|
|
915
|
-
id:
|
|
916
|
-
stepNumber:
|
|
917
|
-
type:
|
|
918
|
-
model:
|
|
919
|
-
provider:
|
|
920
|
-
startedAt:
|
|
921
|
-
durationMs:
|
|
1204
|
+
var LLMTraceStepSchema = z27.object({
|
|
1205
|
+
id: z27.string(),
|
|
1206
|
+
stepNumber: z27.number(),
|
|
1207
|
+
type: z27.enum(LLMStepType),
|
|
1208
|
+
model: z27.string(),
|
|
1209
|
+
provider: z27.string(),
|
|
1210
|
+
startedAt: z27.string(),
|
|
1211
|
+
durationMs: z27.number(),
|
|
922
1212
|
tokenUsage: TokenUsageSchema,
|
|
923
|
-
costUsd:
|
|
924
|
-
toolName:
|
|
925
|
-
toolArguments:
|
|
926
|
-
inputPreview:
|
|
927
|
-
outputPreview:
|
|
928
|
-
success:
|
|
929
|
-
error:
|
|
930
|
-
turnIndex:
|
|
931
|
-
});
|
|
932
|
-
var LLMBreakdownStatsSchema =
|
|
933
|
-
count:
|
|
934
|
-
durationMs:
|
|
935
|
-
tokens:
|
|
936
|
-
costUsd:
|
|
937
|
-
});
|
|
938
|
-
var LLMTraceSummarySchema =
|
|
939
|
-
totalSteps:
|
|
940
|
-
totalTurns:
|
|
941
|
-
totalDurationMs:
|
|
1213
|
+
costUsd: z27.number(),
|
|
1214
|
+
toolName: z27.string().optional(),
|
|
1215
|
+
toolArguments: z27.string().optional(),
|
|
1216
|
+
inputPreview: z27.string().optional(),
|
|
1217
|
+
outputPreview: z27.string().optional(),
|
|
1218
|
+
success: z27.boolean(),
|
|
1219
|
+
error: z27.string().optional(),
|
|
1220
|
+
turnIndex: z27.number().optional()
|
|
1221
|
+
});
|
|
1222
|
+
var LLMBreakdownStatsSchema = z27.object({
|
|
1223
|
+
count: z27.number(),
|
|
1224
|
+
durationMs: z27.number(),
|
|
1225
|
+
tokens: z27.number(),
|
|
1226
|
+
costUsd: z27.number()
|
|
1227
|
+
});
|
|
1228
|
+
var LLMTraceSummarySchema = z27.object({
|
|
1229
|
+
totalSteps: z27.number(),
|
|
1230
|
+
totalTurns: z27.number().optional(),
|
|
1231
|
+
totalDurationMs: z27.number(),
|
|
942
1232
|
totalTokens: TokenUsageSchema,
|
|
943
|
-
totalCostUsd:
|
|
944
|
-
stepTypeBreakdown:
|
|
945
|
-
modelBreakdown:
|
|
946
|
-
modelsUsed:
|
|
947
|
-
});
|
|
948
|
-
var LLMTraceSchema =
|
|
949
|
-
id:
|
|
950
|
-
steps:
|
|
1233
|
+
totalCostUsd: z27.number(),
|
|
1234
|
+
stepTypeBreakdown: z27.record(z27.string(), LLMBreakdownStatsSchema).optional(),
|
|
1235
|
+
modelBreakdown: z27.record(z27.string(), LLMBreakdownStatsSchema),
|
|
1236
|
+
modelsUsed: z27.array(z27.string())
|
|
1237
|
+
});
|
|
1238
|
+
var LLMTraceSchema = z27.object({
|
|
1239
|
+
id: z27.string(),
|
|
1240
|
+
steps: z27.array(LLMTraceStepSchema),
|
|
951
1241
|
summary: LLMTraceSummarySchema
|
|
952
1242
|
});
|
|
953
1243
|
|
|
954
1244
|
// src/evaluation/eval-result.ts
|
|
955
|
-
import { z as
|
|
1245
|
+
import { z as z31 } from "zod";
|
|
956
1246
|
|
|
957
1247
|
// src/evaluation/eval-run.ts
|
|
958
|
-
import { z as
|
|
1248
|
+
import { z as z29 } from "zod";
|
|
959
1249
|
|
|
960
1250
|
// src/evaluation/live-trace.ts
|
|
961
|
-
import { z as
|
|
1251
|
+
import { z as z28 } from "zod";
|
|
962
1252
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
963
1253
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
964
1254
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -972,37 +1262,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
972
1262
|
LiveTraceEventType2["USER"] = "user";
|
|
973
1263
|
return LiveTraceEventType2;
|
|
974
1264
|
})(LiveTraceEventType || {});
|
|
975
|
-
var LiveTraceEventSchema =
|
|
1265
|
+
var LiveTraceEventSchema = z28.object({
|
|
976
1266
|
/** The evaluation run ID */
|
|
977
|
-
evalRunId:
|
|
1267
|
+
evalRunId: z28.string(),
|
|
978
1268
|
/** The scenario ID being executed */
|
|
979
|
-
scenarioId:
|
|
1269
|
+
scenarioId: z28.string(),
|
|
980
1270
|
/** The scenario name for display */
|
|
981
|
-
scenarioName:
|
|
1271
|
+
scenarioName: z28.string(),
|
|
982
1272
|
/** The target ID (skill, agent, etc.) */
|
|
983
|
-
targetId:
|
|
1273
|
+
targetId: z28.string(),
|
|
984
1274
|
/** The target name for display */
|
|
985
|
-
targetName:
|
|
1275
|
+
targetName: z28.string(),
|
|
986
1276
|
/** Step number in the current scenario execution */
|
|
987
|
-
stepNumber:
|
|
1277
|
+
stepNumber: z28.number(),
|
|
988
1278
|
/** Type of trace event */
|
|
989
|
-
type:
|
|
1279
|
+
type: z28.enum(LiveTraceEventType),
|
|
990
1280
|
/** Tool name if this is a tool_use event */
|
|
991
|
-
toolName:
|
|
1281
|
+
toolName: z28.string().optional(),
|
|
992
1282
|
/** Tool arguments preview (truncated JSON) */
|
|
993
|
-
toolArgs:
|
|
1283
|
+
toolArgs: z28.string().optional(),
|
|
994
1284
|
/** Output preview (truncated text) */
|
|
995
|
-
outputPreview:
|
|
1285
|
+
outputPreview: z28.string().optional(),
|
|
996
1286
|
/** File path for file operations */
|
|
997
|
-
filePath:
|
|
1287
|
+
filePath: z28.string().optional(),
|
|
998
1288
|
/** Elapsed time in milliseconds for progress events */
|
|
999
|
-
elapsedMs:
|
|
1289
|
+
elapsedMs: z28.number().optional(),
|
|
1000
1290
|
/** Thinking/reasoning text from Claude */
|
|
1001
|
-
thinking:
|
|
1291
|
+
thinking: z28.string().optional(),
|
|
1002
1292
|
/** Timestamp when this event occurred */
|
|
1003
|
-
timestamp:
|
|
1293
|
+
timestamp: z28.string(),
|
|
1004
1294
|
/** Whether this is the final event for this scenario */
|
|
1005
|
-
isComplete:
|
|
1295
|
+
isComplete: z28.boolean()
|
|
1006
1296
|
});
|
|
1007
1297
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
1008
1298
|
function parseTraceEventLine(line) {
|
|
@@ -1031,40 +1321,40 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
1031
1321
|
TriggerType2["SCHEDULED"] = "SCHEDULED";
|
|
1032
1322
|
return TriggerType2;
|
|
1033
1323
|
})(TriggerType || {});
|
|
1034
|
-
var TriggerMetadataSchema =
|
|
1035
|
-
version:
|
|
1036
|
-
resourceUpdated:
|
|
1037
|
-
scheduleId:
|
|
1324
|
+
var TriggerMetadataSchema = z29.object({
|
|
1325
|
+
version: z29.string().optional(),
|
|
1326
|
+
resourceUpdated: z29.array(z29.string()).optional(),
|
|
1327
|
+
scheduleId: z29.string().optional()
|
|
1038
1328
|
});
|
|
1039
|
-
var TriggerSchema =
|
|
1040
|
-
id:
|
|
1329
|
+
var TriggerSchema = z29.object({
|
|
1330
|
+
id: z29.string(),
|
|
1041
1331
|
metadata: TriggerMetadataSchema.optional(),
|
|
1042
|
-
type:
|
|
1332
|
+
type: z29.nativeEnum(TriggerType)
|
|
1043
1333
|
});
|
|
1044
|
-
var DiffLineTypeSchema =
|
|
1045
|
-
var DiffLineSchema =
|
|
1334
|
+
var DiffLineTypeSchema = z29.enum(["added", "removed", "unchanged"]);
|
|
1335
|
+
var DiffLineSchema = z29.object({
|
|
1046
1336
|
type: DiffLineTypeSchema,
|
|
1047
|
-
content:
|
|
1048
|
-
lineNumber:
|
|
1049
|
-
});
|
|
1050
|
-
var DiffContentSchema =
|
|
1051
|
-
path:
|
|
1052
|
-
expected:
|
|
1053
|
-
actual:
|
|
1054
|
-
diffLines:
|
|
1055
|
-
renamedFrom:
|
|
1337
|
+
content: z29.string(),
|
|
1338
|
+
lineNumber: z29.number()
|
|
1339
|
+
});
|
|
1340
|
+
var DiffContentSchema = z29.object({
|
|
1341
|
+
path: z29.string(),
|
|
1342
|
+
expected: z29.string(),
|
|
1343
|
+
actual: z29.string(),
|
|
1344
|
+
diffLines: z29.array(DiffLineSchema),
|
|
1345
|
+
renamedFrom: z29.string().optional(),
|
|
1056
1346
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1057
|
-
isInfrastructure:
|
|
1347
|
+
isInfrastructure: z29.boolean().optional()
|
|
1058
1348
|
});
|
|
1059
|
-
var CommandExecutionSchema =
|
|
1060
|
-
command:
|
|
1061
|
-
exitCode:
|
|
1062
|
-
output:
|
|
1063
|
-
duration:
|
|
1349
|
+
var CommandExecutionSchema = z29.object({
|
|
1350
|
+
command: z29.string(),
|
|
1351
|
+
exitCode: z29.number(),
|
|
1352
|
+
output: z29.string().optional(),
|
|
1353
|
+
duration: z29.number()
|
|
1064
1354
|
});
|
|
1065
|
-
var FileModificationSchema =
|
|
1066
|
-
path:
|
|
1067
|
-
action:
|
|
1355
|
+
var FileModificationSchema = z29.object({
|
|
1356
|
+
path: z29.string(),
|
|
1357
|
+
action: z29.enum(["created", "modified", "deleted"])
|
|
1068
1358
|
});
|
|
1069
1359
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
1070
1360
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -1072,62 +1362,62 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
1072
1362
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
1073
1363
|
return TemplateFileStatus2;
|
|
1074
1364
|
})(TemplateFileStatus || {});
|
|
1075
|
-
var TemplateFileSchema =
|
|
1365
|
+
var TemplateFileSchema = z29.object({
|
|
1076
1366
|
/** Relative path within the template */
|
|
1077
|
-
path:
|
|
1367
|
+
path: z29.string(),
|
|
1078
1368
|
/** Full file content after execution */
|
|
1079
|
-
content:
|
|
1369
|
+
content: z29.string(),
|
|
1080
1370
|
/** File status (new, modified, unchanged) */
|
|
1081
|
-
status:
|
|
1371
|
+
status: z29.enum(["new", "modified", "unchanged"]),
|
|
1082
1372
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1083
|
-
isInfrastructure:
|
|
1373
|
+
isInfrastructure: z29.boolean().optional()
|
|
1084
1374
|
});
|
|
1085
|
-
var ApiCallSchema =
|
|
1086
|
-
endpoint:
|
|
1087
|
-
tokensUsed:
|
|
1088
|
-
duration:
|
|
1375
|
+
var ApiCallSchema = z29.object({
|
|
1376
|
+
endpoint: z29.string(),
|
|
1377
|
+
tokensUsed: z29.number(),
|
|
1378
|
+
duration: z29.number()
|
|
1089
1379
|
});
|
|
1090
|
-
var ExecutionTraceSchema =
|
|
1091
|
-
commands:
|
|
1092
|
-
filesModified:
|
|
1093
|
-
apiCalls:
|
|
1094
|
-
totalDuration:
|
|
1380
|
+
var ExecutionTraceSchema = z29.object({
|
|
1381
|
+
commands: z29.array(CommandExecutionSchema),
|
|
1382
|
+
filesModified: z29.array(FileModificationSchema),
|
|
1383
|
+
apiCalls: z29.array(ApiCallSchema),
|
|
1384
|
+
totalDuration: z29.number()
|
|
1095
1385
|
});
|
|
1096
|
-
var RunAnalysisFindingSchema =
|
|
1097
|
-
category:
|
|
1386
|
+
var RunAnalysisFindingSchema = z29.object({
|
|
1387
|
+
category: z29.enum([
|
|
1098
1388
|
"failure_pattern",
|
|
1099
1389
|
"cost_waste",
|
|
1100
1390
|
"flakiness",
|
|
1101
1391
|
"inefficiency",
|
|
1102
1392
|
"positive"
|
|
1103
1393
|
]),
|
|
1104
|
-
severity:
|
|
1105
|
-
description:
|
|
1106
|
-
affectedScenarios:
|
|
1107
|
-
recommendation:
|
|
1394
|
+
severity: z29.enum(["high", "medium", "low"]),
|
|
1395
|
+
description: z29.string(),
|
|
1396
|
+
affectedScenarios: z29.array(z29.string()),
|
|
1397
|
+
recommendation: z29.string().optional()
|
|
1108
1398
|
});
|
|
1109
|
-
var RunAnalysisSchema =
|
|
1110
|
-
generatedAt:
|
|
1111
|
-
summary:
|
|
1112
|
-
findings:
|
|
1399
|
+
var RunAnalysisSchema = z29.object({
|
|
1400
|
+
generatedAt: z29.string(),
|
|
1401
|
+
summary: z29.string(),
|
|
1402
|
+
findings: z29.array(RunAnalysisFindingSchema)
|
|
1113
1403
|
});
|
|
1114
1404
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1115
1405
|
/** Agent ID for this run */
|
|
1116
|
-
agentId:
|
|
1406
|
+
agentId: z29.string().optional(),
|
|
1117
1407
|
/** Preset ID that originated this run (optional) */
|
|
1118
|
-
presetId:
|
|
1408
|
+
presetId: z29.string().optional(),
|
|
1119
1409
|
/** Skill IDs for this run */
|
|
1120
|
-
skillIds:
|
|
1410
|
+
skillIds: z29.array(z29.string()).optional(),
|
|
1121
1411
|
/** Map of skillId to skillVersionId for this run */
|
|
1122
|
-
skillVersions:
|
|
1412
|
+
skillVersions: z29.record(z29.string(), z29.string()).optional(),
|
|
1123
1413
|
/** Scenario IDs to run (always present — resolved server-side from tags when needed) */
|
|
1124
|
-
scenarioIds:
|
|
1414
|
+
scenarioIds: z29.array(z29.string()),
|
|
1125
1415
|
/** Current status */
|
|
1126
1416
|
status: EvalStatusSchema,
|
|
1127
1417
|
/** Progress percentage (0-100) */
|
|
1128
|
-
progress:
|
|
1418
|
+
progress: z29.number(),
|
|
1129
1419
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1130
|
-
results:
|
|
1420
|
+
results: z29.array(z29.lazy(() => EvalRunResultSchema)),
|
|
1131
1421
|
/** Aggregated metrics across all results */
|
|
1132
1422
|
aggregateMetrics: EvalMetricsSchema,
|
|
1133
1423
|
/** Aggregated LLM trace summary */
|
|
@@ -1135,41 +1425,41 @@ var EvalRunSchema = TenantEntitySchema.extend({
|
|
|
1135
1425
|
/** What triggered this run */
|
|
1136
1426
|
trigger: TriggerSchema.optional(),
|
|
1137
1427
|
/** When the run started (set when evaluation is triggered) */
|
|
1138
|
-
startedAt:
|
|
1428
|
+
startedAt: z29.string().optional(),
|
|
1139
1429
|
/** When the run completed */
|
|
1140
|
-
completedAt:
|
|
1430
|
+
completedAt: z29.string().optional(),
|
|
1141
1431
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1142
|
-
liveTraceEvents:
|
|
1432
|
+
liveTraceEvents: z29.array(LiveTraceEventSchema).optional(),
|
|
1143
1433
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1144
|
-
jobId:
|
|
1434
|
+
jobId: z29.string().optional(),
|
|
1145
1435
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1146
|
-
jobStatus:
|
|
1436
|
+
jobStatus: z29.string().optional(),
|
|
1147
1437
|
/** Remote job error message if the job failed */
|
|
1148
|
-
jobError:
|
|
1438
|
+
jobError: z29.string().optional(),
|
|
1149
1439
|
/** Timestamp of the last job status check */
|
|
1150
|
-
jobStatusCheckedAt:
|
|
1440
|
+
jobStatusCheckedAt: z29.string().optional(),
|
|
1151
1441
|
/** MCP server IDs to enable for this run (optional) */
|
|
1152
|
-
mcpIds:
|
|
1442
|
+
mcpIds: z29.array(z29.string()).optional(),
|
|
1153
1443
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
1154
|
-
subAgentIds:
|
|
1444
|
+
subAgentIds: z29.array(z29.string()).optional(),
|
|
1155
1445
|
/** Rule IDs to enable for this run (optional) */
|
|
1156
|
-
ruleIds:
|
|
1446
|
+
ruleIds: z29.array(z29.string()).optional(),
|
|
1157
1447
|
/** Tags used to select scenarios for this run (for traceability) */
|
|
1158
|
-
tags:
|
|
1448
|
+
tags: z29.array(z29.string()).optional(),
|
|
1159
1449
|
/** How many times each scenario is executed within this eval run. Default: 1. Max: 20. */
|
|
1160
|
-
runsPerScenario:
|
|
1450
|
+
runsPerScenario: z29.number().int().min(1).max(20).optional(),
|
|
1161
1451
|
/** Snapshot of agent configuration captured at run creation time */
|
|
1162
|
-
agentSnapshot:
|
|
1163
|
-
name:
|
|
1452
|
+
agentSnapshot: z29.object({
|
|
1453
|
+
name: z29.string().optional(),
|
|
1164
1454
|
agentType: AgentTypeSchema.optional(),
|
|
1165
1455
|
runCommand: AgentRunCommandSchema.optional(),
|
|
1166
|
-
systemPrompt:
|
|
1456
|
+
systemPrompt: z29.string().nullable().optional(),
|
|
1167
1457
|
modelConfig: ModelConfigSchema.optional()
|
|
1168
1458
|
}).optional(),
|
|
1169
1459
|
/** UUID linking all runs in a comparison group */
|
|
1170
|
-
comparisonGroupId:
|
|
1460
|
+
comparisonGroupId: z29.string().optional(),
|
|
1171
1461
|
/** Human-readable label for this variant (e.g., "MCP: Wix Stores") */
|
|
1172
|
-
comparisonLabel:
|
|
1462
|
+
comparisonLabel: z29.string().optional(),
|
|
1173
1463
|
/** LLM-generated analysis of the completed run */
|
|
1174
1464
|
runAnalysis: RunAnalysisSchema.optional()
|
|
1175
1465
|
});
|
|
@@ -1187,60 +1477,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
1187
1477
|
agentSnapshot: true
|
|
1188
1478
|
}).extend({
|
|
1189
1479
|
/** Optional on input — backend resolves from tags when not provided */
|
|
1190
|
-
scenarioIds:
|
|
1480
|
+
scenarioIds: z29.array(z29.string()).optional()
|
|
1191
1481
|
}).refine(
|
|
1192
1482
|
(data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
|
|
1193
1483
|
{ message: "Either scenarioIds or tags must be provided" }
|
|
1194
1484
|
);
|
|
1195
|
-
var EvaluationProgressSchema =
|
|
1196
|
-
runId:
|
|
1197
|
-
targetId:
|
|
1198
|
-
totalScenarios:
|
|
1199
|
-
completedScenarios:
|
|
1200
|
-
scenarioProgress:
|
|
1201
|
-
|
|
1202
|
-
scenarioId:
|
|
1203
|
-
currentStep:
|
|
1204
|
-
error:
|
|
1485
|
+
var EvaluationProgressSchema = z29.object({
|
|
1486
|
+
runId: z29.string(),
|
|
1487
|
+
targetId: z29.string(),
|
|
1488
|
+
totalScenarios: z29.number(),
|
|
1489
|
+
completedScenarios: z29.number(),
|
|
1490
|
+
scenarioProgress: z29.array(
|
|
1491
|
+
z29.object({
|
|
1492
|
+
scenarioId: z29.string(),
|
|
1493
|
+
currentStep: z29.string(),
|
|
1494
|
+
error: z29.string().optional()
|
|
1205
1495
|
})
|
|
1206
1496
|
),
|
|
1207
|
-
createdAt:
|
|
1208
|
-
});
|
|
1209
|
-
var EvaluationLogSchema =
|
|
1210
|
-
runId:
|
|
1211
|
-
scenarioId:
|
|
1212
|
-
log:
|
|
1213
|
-
level:
|
|
1214
|
-
message:
|
|
1215
|
-
args:
|
|
1216
|
-
error:
|
|
1497
|
+
createdAt: z29.number()
|
|
1498
|
+
});
|
|
1499
|
+
var EvaluationLogSchema = z29.object({
|
|
1500
|
+
runId: z29.string(),
|
|
1501
|
+
scenarioId: z29.string(),
|
|
1502
|
+
log: z29.object({
|
|
1503
|
+
level: z29.enum(["info", "error", "debug"]),
|
|
1504
|
+
message: z29.string().optional(),
|
|
1505
|
+
args: z29.array(z29.any()).optional(),
|
|
1506
|
+
error: z29.string().optional()
|
|
1217
1507
|
})
|
|
1218
1508
|
});
|
|
1219
1509
|
var LLM_TIMEOUT = 12e4;
|
|
1220
1510
|
|
|
1221
1511
|
// src/evaluation/conversation.ts
|
|
1222
|
-
import { z as
|
|
1223
|
-
var TextBlockSchema =
|
|
1224
|
-
type:
|
|
1225
|
-
text:
|
|
1226
|
-
});
|
|
1227
|
-
var ThinkingBlockSchema =
|
|
1228
|
-
type:
|
|
1229
|
-
thinking:
|
|
1230
|
-
});
|
|
1231
|
-
var ToolUseBlockSchema =
|
|
1232
|
-
type:
|
|
1233
|
-
toolName:
|
|
1234
|
-
toolId:
|
|
1235
|
-
input:
|
|
1236
|
-
});
|
|
1237
|
-
var ToolResultBlockSchema =
|
|
1238
|
-
type:
|
|
1239
|
-
toolUseId:
|
|
1240
|
-
content:
|
|
1241
|
-
isError:
|
|
1242
|
-
});
|
|
1243
|
-
var ConversationBlockSchema =
|
|
1512
|
+
import { z as z30 } from "zod";
|
|
1513
|
+
var TextBlockSchema = z30.object({
|
|
1514
|
+
type: z30.literal("text"),
|
|
1515
|
+
text: z30.string()
|
|
1516
|
+
});
|
|
1517
|
+
var ThinkingBlockSchema = z30.object({
|
|
1518
|
+
type: z30.literal("thinking"),
|
|
1519
|
+
thinking: z30.string()
|
|
1520
|
+
});
|
|
1521
|
+
var ToolUseBlockSchema = z30.object({
|
|
1522
|
+
type: z30.literal("tool_use"),
|
|
1523
|
+
toolName: z30.string(),
|
|
1524
|
+
toolId: z30.string(),
|
|
1525
|
+
input: z30.unknown()
|
|
1526
|
+
});
|
|
1527
|
+
var ToolResultBlockSchema = z30.object({
|
|
1528
|
+
type: z30.literal("tool_result"),
|
|
1529
|
+
toolUseId: z30.string(),
|
|
1530
|
+
content: z30.string(),
|
|
1531
|
+
isError: z30.boolean().optional()
|
|
1532
|
+
});
|
|
1533
|
+
var ConversationBlockSchema = z30.discriminatedUnion("type", [
|
|
1244
1534
|
TextBlockSchema,
|
|
1245
1535
|
ThinkingBlockSchema,
|
|
1246
1536
|
ToolUseBlockSchema,
|
|
@@ -1251,18 +1541,18 @@ var ConversationMessageRoles = [
|
|
|
1251
1541
|
"user",
|
|
1252
1542
|
"system"
|
|
1253
1543
|
];
|
|
1254
|
-
var ConversationMessageSchema =
|
|
1255
|
-
role:
|
|
1256
|
-
content:
|
|
1257
|
-
timestamp:
|
|
1544
|
+
var ConversationMessageSchema = z30.object({
|
|
1545
|
+
role: z30.enum(ConversationMessageRoles),
|
|
1546
|
+
content: z30.array(ConversationBlockSchema),
|
|
1547
|
+
timestamp: z30.string()
|
|
1258
1548
|
});
|
|
1259
|
-
var ScenarioConversationSchema =
|
|
1260
|
-
id:
|
|
1261
|
-
projectId:
|
|
1262
|
-
evalRunId:
|
|
1263
|
-
resultId:
|
|
1264
|
-
messages:
|
|
1265
|
-
createdAt:
|
|
1549
|
+
var ScenarioConversationSchema = z30.object({
|
|
1550
|
+
id: z30.string(),
|
|
1551
|
+
projectId: z30.string(),
|
|
1552
|
+
evalRunId: z30.string(),
|
|
1553
|
+
resultId: z30.string(),
|
|
1554
|
+
messages: z30.array(ConversationMessageSchema),
|
|
1555
|
+
createdAt: z30.string()
|
|
1266
1556
|
});
|
|
1267
1557
|
|
|
1268
1558
|
// src/evaluation/eval-result.ts
|
|
@@ -1273,98 +1563,98 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
1273
1563
|
AssertionResultStatus2["ERROR"] = "error";
|
|
1274
1564
|
return AssertionResultStatus2;
|
|
1275
1565
|
})(AssertionResultStatus || {});
|
|
1276
|
-
var AssertionResultSchema =
|
|
1277
|
-
id:
|
|
1278
|
-
assertionId:
|
|
1279
|
-
assertionType:
|
|
1280
|
-
assertionName:
|
|
1281
|
-
status:
|
|
1282
|
-
message:
|
|
1283
|
-
expected:
|
|
1284
|
-
actual:
|
|
1285
|
-
duration:
|
|
1286
|
-
details:
|
|
1287
|
-
llmTraceSteps:
|
|
1288
|
-
});
|
|
1289
|
-
var EvalRunResultSchema =
|
|
1290
|
-
id:
|
|
1291
|
-
targetId:
|
|
1292
|
-
targetName:
|
|
1566
|
+
var AssertionResultSchema = z31.object({
|
|
1567
|
+
id: z31.string(),
|
|
1568
|
+
assertionId: z31.string(),
|
|
1569
|
+
assertionType: z31.string(),
|
|
1570
|
+
assertionName: z31.string(),
|
|
1571
|
+
status: z31.enum(AssertionResultStatus),
|
|
1572
|
+
message: z31.string().optional(),
|
|
1573
|
+
expected: z31.string().optional(),
|
|
1574
|
+
actual: z31.string().optional(),
|
|
1575
|
+
duration: z31.number().optional(),
|
|
1576
|
+
details: z31.record(z31.string(), z31.unknown()).optional(),
|
|
1577
|
+
llmTraceSteps: z31.array(LLMTraceStepSchema).optional()
|
|
1578
|
+
});
|
|
1579
|
+
var EvalRunResultSchema = z31.object({
|
|
1580
|
+
id: z31.string(),
|
|
1581
|
+
targetId: z31.string(),
|
|
1582
|
+
targetName: z31.string().optional(),
|
|
1293
1583
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
1294
|
-
skillVersionId:
|
|
1584
|
+
skillVersionId: z31.string().optional(),
|
|
1295
1585
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
1296
|
-
skillVersion:
|
|
1297
|
-
scenarioId:
|
|
1298
|
-
scenarioName:
|
|
1586
|
+
skillVersion: z31.string().optional(),
|
|
1587
|
+
scenarioId: z31.string(),
|
|
1588
|
+
scenarioName: z31.string(),
|
|
1299
1589
|
/** Snapshot of the trigger prompt used during the run (prevents stale display after edits) */
|
|
1300
|
-
triggerPrompt:
|
|
1590
|
+
triggerPrompt: z31.string().optional(),
|
|
1301
1591
|
modelConfig: ModelConfigSchema.optional(),
|
|
1302
|
-
assertionResults:
|
|
1592
|
+
assertionResults: z31.array(AssertionResultSchema),
|
|
1303
1593
|
metrics: EvalMetricsSchema.optional(),
|
|
1304
|
-
passed:
|
|
1305
|
-
failed:
|
|
1306
|
-
passRate:
|
|
1307
|
-
duration:
|
|
1308
|
-
outputText:
|
|
1309
|
-
files:
|
|
1310
|
-
fileDiffs:
|
|
1594
|
+
passed: z31.number(),
|
|
1595
|
+
failed: z31.number(),
|
|
1596
|
+
passRate: z31.number(),
|
|
1597
|
+
duration: z31.number(),
|
|
1598
|
+
outputText: z31.string().optional(),
|
|
1599
|
+
files: z31.array(ExpectedFileSchema).optional(),
|
|
1600
|
+
fileDiffs: z31.array(DiffContentSchema).optional(),
|
|
1311
1601
|
/** Full template files after execution with status indicators */
|
|
1312
|
-
templateFiles:
|
|
1313
|
-
startedAt:
|
|
1314
|
-
completedAt:
|
|
1602
|
+
templateFiles: z31.array(TemplateFileSchema).optional(),
|
|
1603
|
+
startedAt: z31.string().optional(),
|
|
1604
|
+
completedAt: z31.string().optional(),
|
|
1315
1605
|
llmTrace: LLMTraceSchema.optional(),
|
|
1316
1606
|
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
1317
|
-
conversation:
|
|
1607
|
+
conversation: z31.array(ConversationMessageSchema).optional(),
|
|
1318
1608
|
/** 0-based iteration index when a scenario is run multiple times within a single eval run */
|
|
1319
|
-
iterationIndex:
|
|
1320
|
-
});
|
|
1321
|
-
var PromptResultSchema =
|
|
1322
|
-
text:
|
|
1323
|
-
files:
|
|
1324
|
-
finishReason:
|
|
1325
|
-
reasoning:
|
|
1326
|
-
reasoningDetails:
|
|
1327
|
-
toolCalls:
|
|
1328
|
-
toolResults:
|
|
1329
|
-
warnings:
|
|
1330
|
-
sources:
|
|
1331
|
-
steps:
|
|
1332
|
-
generationTimeMs:
|
|
1333
|
-
prompt:
|
|
1334
|
-
systemPrompt:
|
|
1335
|
-
usage:
|
|
1336
|
-
totalTokens:
|
|
1337
|
-
totalMicrocentsSpent:
|
|
1609
|
+
iterationIndex: z31.number().int().min(0).optional()
|
|
1610
|
+
});
|
|
1611
|
+
var PromptResultSchema = z31.object({
|
|
1612
|
+
text: z31.string(),
|
|
1613
|
+
files: z31.array(z31.unknown()).optional(),
|
|
1614
|
+
finishReason: z31.string().optional(),
|
|
1615
|
+
reasoning: z31.string().optional(),
|
|
1616
|
+
reasoningDetails: z31.unknown().optional(),
|
|
1617
|
+
toolCalls: z31.array(z31.unknown()).optional(),
|
|
1618
|
+
toolResults: z31.array(z31.unknown()).optional(),
|
|
1619
|
+
warnings: z31.array(z31.unknown()).optional(),
|
|
1620
|
+
sources: z31.array(z31.unknown()).optional(),
|
|
1621
|
+
steps: z31.array(z31.unknown()),
|
|
1622
|
+
generationTimeMs: z31.number(),
|
|
1623
|
+
prompt: z31.string(),
|
|
1624
|
+
systemPrompt: z31.string(),
|
|
1625
|
+
usage: z31.object({
|
|
1626
|
+
totalTokens: z31.number().optional(),
|
|
1627
|
+
totalMicrocentsSpent: z31.number().optional()
|
|
1338
1628
|
})
|
|
1339
1629
|
});
|
|
1340
|
-
var EvaluationResultSchema =
|
|
1341
|
-
id:
|
|
1342
|
-
runId:
|
|
1343
|
-
timestamp:
|
|
1630
|
+
var EvaluationResultSchema = z31.object({
|
|
1631
|
+
id: z31.string(),
|
|
1632
|
+
runId: z31.string(),
|
|
1633
|
+
timestamp: z31.number(),
|
|
1344
1634
|
promptResult: PromptResultSchema,
|
|
1345
|
-
testResults:
|
|
1346
|
-
tags:
|
|
1347
|
-
feedback:
|
|
1348
|
-
score:
|
|
1349
|
-
suiteId:
|
|
1350
|
-
});
|
|
1351
|
-
var LeanEvaluationResultSchema =
|
|
1352
|
-
id:
|
|
1353
|
-
runId:
|
|
1354
|
-
timestamp:
|
|
1355
|
-
tags:
|
|
1356
|
-
scenarioId:
|
|
1357
|
-
scenarioVersion:
|
|
1358
|
-
targetId:
|
|
1359
|
-
targetVersion:
|
|
1360
|
-
suiteId:
|
|
1361
|
-
score:
|
|
1362
|
-
time:
|
|
1363
|
-
microcentsSpent:
|
|
1635
|
+
testResults: z31.array(z31.unknown()),
|
|
1636
|
+
tags: z31.array(z31.string()).optional(),
|
|
1637
|
+
feedback: z31.string().optional(),
|
|
1638
|
+
score: z31.number(),
|
|
1639
|
+
suiteId: z31.string().optional()
|
|
1640
|
+
});
|
|
1641
|
+
var LeanEvaluationResultSchema = z31.object({
|
|
1642
|
+
id: z31.string(),
|
|
1643
|
+
runId: z31.string(),
|
|
1644
|
+
timestamp: z31.number(),
|
|
1645
|
+
tags: z31.array(z31.string()).optional(),
|
|
1646
|
+
scenarioId: z31.string(),
|
|
1647
|
+
scenarioVersion: z31.number().optional(),
|
|
1648
|
+
targetId: z31.string(),
|
|
1649
|
+
targetVersion: z31.number().optional(),
|
|
1650
|
+
suiteId: z31.string().optional(),
|
|
1651
|
+
score: z31.number(),
|
|
1652
|
+
time: z31.number().optional(),
|
|
1653
|
+
microcentsSpent: z31.number().optional()
|
|
1364
1654
|
});
|
|
1365
1655
|
|
|
1366
1656
|
// src/evaluation/eval-run-folder.ts
|
|
1367
|
-
import { z as
|
|
1657
|
+
import { z as z32 } from "zod";
|
|
1368
1658
|
var EvalRunFolderSchema = TenantEntitySchema.extend({});
|
|
1369
1659
|
var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
1370
1660
|
id: true,
|
|
@@ -1378,26 +1668,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
|
1378
1668
|
updatedAt: true,
|
|
1379
1669
|
deleted: true
|
|
1380
1670
|
}).partial();
|
|
1381
|
-
var EvalRunFolderMembershipSchema =
|
|
1382
|
-
folderId:
|
|
1383
|
-
evalRunId:
|
|
1384
|
-
projectId:
|
|
1385
|
-
createdAt:
|
|
1671
|
+
var EvalRunFolderMembershipSchema = z32.object({
|
|
1672
|
+
folderId: z32.string(),
|
|
1673
|
+
evalRunId: z32.string(),
|
|
1674
|
+
projectId: z32.string(),
|
|
1675
|
+
createdAt: z32.string()
|
|
1386
1676
|
});
|
|
1387
1677
|
|
|
1388
1678
|
// src/project/project.ts
|
|
1389
|
-
import { z as
|
|
1679
|
+
import { z as z33 } from "zod";
|
|
1390
1680
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1391
|
-
appId:
|
|
1392
|
-
scenarioTags:
|
|
1681
|
+
appId: z33.string().optional().describe("The ID of the app in Dev Center"),
|
|
1682
|
+
scenarioTags: z33.array(z33.string()).optional().describe("Project-level tag vocabulary for scenarios"),
|
|
1393
1683
|
/** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
|
|
1394
|
-
wixAuthToken:
|
|
1684
|
+
wixAuthToken: z33.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
|
|
1395
1685
|
/** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
|
|
1396
|
-
base44AuthFile:
|
|
1686
|
+
base44AuthFile: z33.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
|
|
1397
1687
|
/** Resolved at runtime from the encrypted Wix auth token */
|
|
1398
|
-
wixAuthEmail:
|
|
1688
|
+
wixAuthEmail: z33.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
|
|
1399
1689
|
/** Resolved at runtime from the encrypted Base44 auth file */
|
|
1400
|
-
base44AuthEmail:
|
|
1690
|
+
base44AuthEmail: z33.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
|
|
1401
1691
|
});
|
|
1402
1692
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1403
1693
|
id: true,
|
|
@@ -1406,6 +1696,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
|
|
|
1406
1696
|
deleted: true,
|
|
1407
1697
|
wixAuthEmail: true,
|
|
1408
1698
|
base44AuthEmail: true
|
|
1699
|
+
}).extend({
|
|
1700
|
+
appId: z33.string().describe(
|
|
1701
|
+
"Required: The ID of the app in Dev Center for credential scoping"
|
|
1702
|
+
)
|
|
1409
1703
|
});
|
|
1410
1704
|
var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
|
|
1411
1705
|
|
|
@@ -1423,7 +1717,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
|
1423
1717
|
var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
1424
1718
|
|
|
1425
1719
|
// src/schedule/eval-schedule.ts
|
|
1426
|
-
import { z as
|
|
1720
|
+
import { z as z34 } from "zod";
|
|
1427
1721
|
var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
1428
1722
|
FrequencyType2["DAILY"] = "daily";
|
|
1429
1723
|
FrequencyType2["WEEKDAY"] = "weekday";
|
|
@@ -1433,29 +1727,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
|
1433
1727
|
})(FrequencyType || {});
|
|
1434
1728
|
var EvalScheduleSchema = TenantEntitySchema.extend({
|
|
1435
1729
|
/** Whether the schedule is active */
|
|
1436
|
-
enabled:
|
|
1730
|
+
enabled: z34.boolean(),
|
|
1437
1731
|
/** Test suite to run */
|
|
1438
|
-
suiteId:
|
|
1732
|
+
suiteId: z34.string(),
|
|
1439
1733
|
/** Preset that provides agent + entities for this schedule */
|
|
1440
|
-
presetId:
|
|
1734
|
+
presetId: z34.string(),
|
|
1441
1735
|
/** How often to run */
|
|
1442
|
-
frequencyType:
|
|
1736
|
+
frequencyType: z34.nativeEnum(FrequencyType),
|
|
1443
1737
|
/** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
|
|
1444
|
-
timeOfDay:
|
|
1738
|
+
timeOfDay: z34.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
|
|
1445
1739
|
/** Day of week (0=Sun, 6=Sat) for weekly schedules */
|
|
1446
|
-
dayOfWeek:
|
|
1740
|
+
dayOfWeek: z34.number().min(0).max(6).optional(),
|
|
1447
1741
|
/** Day of month (1-31) for monthly schedules */
|
|
1448
|
-
dayOfMonth:
|
|
1742
|
+
dayOfMonth: z34.number().min(1).max(31).optional(),
|
|
1449
1743
|
/** IANA timezone (e.g., 'America/New_York') */
|
|
1450
|
-
timezone:
|
|
1744
|
+
timezone: z34.string(),
|
|
1451
1745
|
/** ID of the last eval run created by this schedule */
|
|
1452
|
-
lastRunId:
|
|
1746
|
+
lastRunId: z34.string().optional(),
|
|
1453
1747
|
/** Denormalized status of the last run */
|
|
1454
|
-
lastRunStatus:
|
|
1748
|
+
lastRunStatus: z34.string().optional(),
|
|
1455
1749
|
/** ISO timestamp of the last run */
|
|
1456
|
-
lastRunAt:
|
|
1750
|
+
lastRunAt: z34.string().optional(),
|
|
1457
1751
|
/** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
|
|
1458
|
-
nextRunAt:
|
|
1752
|
+
nextRunAt: z34.string().optional()
|
|
1459
1753
|
});
|
|
1460
1754
|
function isValidTimezone(tz) {
|
|
1461
1755
|
try {
|
|
@@ -1468,14 +1762,14 @@ function isValidTimezone(tz) {
|
|
|
1468
1762
|
function validateScheduleFields(data, ctx, options) {
|
|
1469
1763
|
if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
|
|
1470
1764
|
ctx.addIssue({
|
|
1471
|
-
code:
|
|
1765
|
+
code: z34.ZodIssueCode.custom,
|
|
1472
1766
|
message: "dayOfWeek is required for weekly schedules",
|
|
1473
1767
|
path: ["dayOfWeek"]
|
|
1474
1768
|
});
|
|
1475
1769
|
}
|
|
1476
1770
|
if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
|
|
1477
1771
|
ctx.addIssue({
|
|
1478
|
-
code:
|
|
1772
|
+
code: z34.ZodIssueCode.custom,
|
|
1479
1773
|
message: "dayOfMonth is required for monthly schedules",
|
|
1480
1774
|
path: ["dayOfMonth"]
|
|
1481
1775
|
});
|
|
@@ -1483,7 +1777,7 @@ function validateScheduleFields(data, ctx, options) {
|
|
|
1483
1777
|
const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
|
|
1484
1778
|
if (shouldValidateTz && !isValidTimezone(data.timezone)) {
|
|
1485
1779
|
ctx.addIssue({
|
|
1486
|
-
code:
|
|
1780
|
+
code: z34.ZodIssueCode.custom,
|
|
1487
1781
|
message: "Invalid IANA timezone",
|
|
1488
1782
|
path: ["timezone"]
|
|
1489
1783
|
});
|
|
@@ -1506,228 +1800,9 @@ var CreateEvalScheduleInputSchema = BaseCreateScheduleSchema.superRefine((data,
|
|
|
1506
1800
|
var UpdateEvalScheduleInputSchema = BaseCreateScheduleSchema.partial().superRefine((data, ctx) => {
|
|
1507
1801
|
validateScheduleFields(data, ctx, { partial: true });
|
|
1508
1802
|
});
|
|
1509
|
-
|
|
1510
|
-
// src/assertion/system-assertions.ts
|
|
1511
|
-
var SYSTEM_ASSERTION_IDS = {
|
|
1512
|
-
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
1513
|
-
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
1514
|
-
BUILD_PASSED: "system:build_passed",
|
|
1515
|
-
TIME_LIMIT: "system:time_limit",
|
|
1516
|
-
COST: "system:cost",
|
|
1517
|
-
LLM_JUDGE: "system:llm_judge",
|
|
1518
|
-
API_CALL: "system:api_call"
|
|
1519
|
-
};
|
|
1520
|
-
function isSystemAssertionId(id) {
|
|
1521
|
-
return id.startsWith("system:");
|
|
1522
|
-
}
|
|
1523
|
-
var SYSTEM_ASSERTIONS = {
|
|
1524
|
-
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
1525
|
-
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
1526
|
-
name: "Skill Was Called",
|
|
1527
|
-
description: "Check that one or more skills were invoked during the agent run",
|
|
1528
|
-
type: "skill_was_called",
|
|
1529
|
-
parameters: [
|
|
1530
|
-
{
|
|
1531
|
-
name: "skillNames",
|
|
1532
|
-
label: "Skills",
|
|
1533
|
-
type: "string",
|
|
1534
|
-
required: true
|
|
1535
|
-
},
|
|
1536
|
-
{
|
|
1537
|
-
name: "negate",
|
|
1538
|
-
label: "Negate (NOT operator)",
|
|
1539
|
-
type: "boolean",
|
|
1540
|
-
required: false,
|
|
1541
|
-
defaultValue: false
|
|
1542
|
-
}
|
|
1543
|
-
]
|
|
1544
|
-
},
|
|
1545
|
-
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
1546
|
-
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
1547
|
-
name: "Tool Called With Param",
|
|
1548
|
-
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
1549
|
-
type: "tool_called_with_param",
|
|
1550
|
-
parameters: [
|
|
1551
|
-
{
|
|
1552
|
-
name: "toolName",
|
|
1553
|
-
label: "Tool Name",
|
|
1554
|
-
type: "string",
|
|
1555
|
-
required: true
|
|
1556
|
-
},
|
|
1557
|
-
{
|
|
1558
|
-
name: "expectedParams",
|
|
1559
|
-
label: "Expected Parameters (JSON, substring match)",
|
|
1560
|
-
type: "string",
|
|
1561
|
-
required: false
|
|
1562
|
-
},
|
|
1563
|
-
{
|
|
1564
|
-
name: "requireSuccess",
|
|
1565
|
-
label: "Require Successful Call",
|
|
1566
|
-
type: "boolean",
|
|
1567
|
-
required: false,
|
|
1568
|
-
defaultValue: false,
|
|
1569
|
-
advanced: true
|
|
1570
|
-
},
|
|
1571
|
-
{
|
|
1572
|
-
name: "negate",
|
|
1573
|
-
label: "Negate (NOT operator)",
|
|
1574
|
-
type: "boolean",
|
|
1575
|
-
required: false,
|
|
1576
|
-
defaultValue: false
|
|
1577
|
-
}
|
|
1578
|
-
]
|
|
1579
|
-
},
|
|
1580
|
-
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
1581
|
-
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
1582
|
-
name: "Build Passed",
|
|
1583
|
-
description: "Run a build command and verify it exits with expected code",
|
|
1584
|
-
type: "build_passed",
|
|
1585
|
-
parameters: [
|
|
1586
|
-
{
|
|
1587
|
-
name: "command",
|
|
1588
|
-
label: "Build Command",
|
|
1589
|
-
type: "string",
|
|
1590
|
-
required: false,
|
|
1591
|
-
defaultValue: "yarn build"
|
|
1592
|
-
},
|
|
1593
|
-
{
|
|
1594
|
-
name: "expectedExitCode",
|
|
1595
|
-
label: "Expected Exit Code",
|
|
1596
|
-
type: "number",
|
|
1597
|
-
required: false,
|
|
1598
|
-
defaultValue: 0
|
|
1599
|
-
},
|
|
1600
|
-
{
|
|
1601
|
-
name: "maxBuildTime",
|
|
1602
|
-
label: "Max Build Time (ms)",
|
|
1603
|
-
type: "number",
|
|
1604
|
-
required: false,
|
|
1605
|
-
advanced: true
|
|
1606
|
-
},
|
|
1607
|
-
{
|
|
1608
|
-
name: "maxMemory",
|
|
1609
|
-
label: "Max Memory (MB)",
|
|
1610
|
-
type: "number",
|
|
1611
|
-
required: false,
|
|
1612
|
-
advanced: true
|
|
1613
|
-
}
|
|
1614
|
-
]
|
|
1615
|
-
},
|
|
1616
|
-
[SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
|
|
1617
|
-
id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
|
|
1618
|
-
name: "Time Limit",
|
|
1619
|
-
description: "Check that the scenario completed within a maximum duration",
|
|
1620
|
-
type: "time_limit",
|
|
1621
|
-
parameters: [
|
|
1622
|
-
{
|
|
1623
|
-
name: "maxDurationMs",
|
|
1624
|
-
label: "Max Duration (ms)",
|
|
1625
|
-
type: "number",
|
|
1626
|
-
required: true,
|
|
1627
|
-
defaultValue: 3e5
|
|
1628
|
-
}
|
|
1629
|
-
]
|
|
1630
|
-
},
|
|
1631
|
-
[SYSTEM_ASSERTION_IDS.COST]: {
|
|
1632
|
-
id: SYSTEM_ASSERTION_IDS.COST,
|
|
1633
|
-
name: "Cost",
|
|
1634
|
-
description: "Check that the scenario LLM execution cost stays within a USD threshold",
|
|
1635
|
-
type: "cost",
|
|
1636
|
-
parameters: [
|
|
1637
|
-
{
|
|
1638
|
-
name: "maxCostUsd",
|
|
1639
|
-
label: "Max Cost (USD)",
|
|
1640
|
-
type: "number",
|
|
1641
|
-
required: true,
|
|
1642
|
-
defaultValue: 1
|
|
1643
|
-
}
|
|
1644
|
-
]
|
|
1645
|
-
},
|
|
1646
|
-
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
1647
|
-
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
1648
|
-
name: "LLM Judge",
|
|
1649
|
-
description: "LLM evaluates the output and assigns a score (0-10)",
|
|
1650
|
-
type: "llm_judge",
|
|
1651
|
-
parameters: [
|
|
1652
|
-
{
|
|
1653
|
-
name: "prompt",
|
|
1654
|
-
label: "Judge Prompt",
|
|
1655
|
-
type: "string",
|
|
1656
|
-
required: true,
|
|
1657
|
-
defaultValue: "Verify the output meets the acceptance criteria."
|
|
1658
|
-
},
|
|
1659
|
-
{
|
|
1660
|
-
name: "minScore",
|
|
1661
|
-
label: "Minimum Score (0-10)",
|
|
1662
|
-
type: "number",
|
|
1663
|
-
required: false,
|
|
1664
|
-
defaultValue: 7
|
|
1665
|
-
},
|
|
1666
|
-
{
|
|
1667
|
-
name: "model",
|
|
1668
|
-
label: "Model",
|
|
1669
|
-
type: "string",
|
|
1670
|
-
required: false
|
|
1671
|
-
}
|
|
1672
|
-
]
|
|
1673
|
-
},
|
|
1674
|
-
[SYSTEM_ASSERTION_IDS.API_CALL]: {
|
|
1675
|
-
id: SYSTEM_ASSERTION_IDS.API_CALL,
|
|
1676
|
-
name: "API Call",
|
|
1677
|
-
description: "Call an API endpoint and verify the response contains expected data",
|
|
1678
|
-
type: "api_call",
|
|
1679
|
-
parameters: [
|
|
1680
|
-
{
|
|
1681
|
-
name: "url",
|
|
1682
|
-
label: "URL",
|
|
1683
|
-
type: "string",
|
|
1684
|
-
required: true
|
|
1685
|
-
},
|
|
1686
|
-
{
|
|
1687
|
-
name: "method",
|
|
1688
|
-
label: "HTTP Method",
|
|
1689
|
-
type: "string",
|
|
1690
|
-
required: false,
|
|
1691
|
-
defaultValue: "GET"
|
|
1692
|
-
},
|
|
1693
|
-
{
|
|
1694
|
-
name: "requestBody",
|
|
1695
|
-
label: "Request Body (JSON)",
|
|
1696
|
-
type: "string",
|
|
1697
|
-
required: false
|
|
1698
|
-
},
|
|
1699
|
-
{
|
|
1700
|
-
name: "expectedResponse",
|
|
1701
|
-
label: "Expected Response (JSON)",
|
|
1702
|
-
type: "string",
|
|
1703
|
-
required: true
|
|
1704
|
-
},
|
|
1705
|
-
{
|
|
1706
|
-
name: "requestHeaders",
|
|
1707
|
-
label: "Headers (JSON)",
|
|
1708
|
-
type: "string",
|
|
1709
|
-
required: false,
|
|
1710
|
-
advanced: true
|
|
1711
|
-
},
|
|
1712
|
-
{
|
|
1713
|
-
name: "timeoutMs",
|
|
1714
|
-
label: "Timeout (ms)",
|
|
1715
|
-
type: "number",
|
|
1716
|
-
required: false,
|
|
1717
|
-
defaultValue: 3e4,
|
|
1718
|
-
advanced: true
|
|
1719
|
-
}
|
|
1720
|
-
]
|
|
1721
|
-
}
|
|
1722
|
-
};
|
|
1723
|
-
function getSystemAssertions() {
|
|
1724
|
-
return Object.values(SYSTEM_ASSERTIONS);
|
|
1725
|
-
}
|
|
1726
|
-
function getSystemAssertion(id) {
|
|
1727
|
-
return SYSTEM_ASSERTIONS[id];
|
|
1728
|
-
}
|
|
1729
1803
|
export {
|
|
1730
1804
|
AGENT_TYPE_LABELS,
|
|
1805
|
+
ALLOWED_BUILD_COMMANDS,
|
|
1731
1806
|
ALL_AVAILABLE_MODEL_IDS,
|
|
1732
1807
|
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
1733
1808
|
AVAILABLE_OPENAI_MODEL_IDS,
|
|
@@ -1761,6 +1836,7 @@ export {
|
|
|
1761
1836
|
BatchSummarySchema,
|
|
1762
1837
|
BuildCheckTestSchema,
|
|
1763
1838
|
BuildPassedAssertionSchema,
|
|
1839
|
+
BuildPassedCommandStringSchema,
|
|
1764
1840
|
BuildPassedConfigSchema,
|
|
1765
1841
|
BulkImportResultItemSchema,
|
|
1766
1842
|
BulkImportResultSchema,
|
|
@@ -1788,6 +1864,7 @@ export {
|
|
|
1788
1864
|
CreateTemplateInputSchema,
|
|
1789
1865
|
CreateTestScenarioInputSchema,
|
|
1790
1866
|
CreateTestSuiteInputSchema,
|
|
1867
|
+
DEFAULT_BUILD_PASSED_COMMAND,
|
|
1791
1868
|
DEFAULT_EVALUATOR_SYSTEM_PROMPT,
|
|
1792
1869
|
DEFAULT_JUDGE_MODEL,
|
|
1793
1870
|
DiffContentSchema,
|
|
@@ -1905,11 +1982,14 @@ export {
|
|
|
1905
1982
|
formatTraceEventLine,
|
|
1906
1983
|
getSystemAssertion,
|
|
1907
1984
|
getSystemAssertions,
|
|
1985
|
+
isAllowedBuildCommandString,
|
|
1908
1986
|
isSystemAssertionId,
|
|
1909
1987
|
isValidSkillFolderName,
|
|
1910
1988
|
normalizeBatchAssertionLink,
|
|
1911
1989
|
normalizeModelId,
|
|
1990
|
+
parseBuildCommandToArgv,
|
|
1912
1991
|
parseTraceEventLine,
|
|
1913
|
-
validateAssertionConfig
|
|
1992
|
+
validateAssertionConfig,
|
|
1993
|
+
validateBuildPassedParamsInAssertionLinks
|
|
1914
1994
|
};
|
|
1915
1995
|
//# sourceMappingURL=index.mjs.map
|