@wix/evalforge-types 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -454,22 +454,145 @@ var EnvironmentSchema = z19.object({
454
454
  });
455
455
 
456
456
  // src/scenario/test-scenario.ts
457
+ import { z as z21 } from "zod";
458
+
459
+ // src/assertion/assertion.ts
457
460
  import { z as z20 } from "zod";
458
- var ExpectedFileSchema = z20.object({
461
+ var AssertionTypeSchema = z20.enum([
462
+ "skill_was_called",
463
+ "build_passed",
464
+ "llm_judge"
465
+ ]);
466
+ var AssertionParameterTypeSchema = z20.enum([
467
+ "string",
468
+ "number",
469
+ "boolean"
470
+ ]);
471
+ var AssertionParameterSchema = z20.object({
472
+ /** Parameter name (used as key in params object) */
473
+ name: z20.string().min(1),
474
+ /** Display label for the parameter */
475
+ label: z20.string().min(1),
476
+ /** Parameter type */
477
+ type: AssertionParameterTypeSchema,
478
+ /** Whether this parameter is required */
479
+ required: z20.boolean(),
480
+ /** Default value (optional, used when not provided) */
481
+ defaultValue: z20.union([z20.string(), z20.number(), z20.boolean()]).optional()
482
+ });
483
+ var ScenarioAssertionLinkSchema = z20.object({
484
+ /** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
485
+ assertionId: z20.string(),
486
+ /** Parameter values for this assertion in this scenario */
487
+ params: z20.record(
488
+ z20.string(),
489
+ z20.union([z20.string(), z20.number(), z20.boolean(), z20.null()])
490
+ ).optional()
491
+ });
492
+ var SkillWasCalledConfigSchema = z20.object({
493
+ /** Name of the skill that must have been called */
494
+ skillName: z20.string().min(1)
495
+ });
496
+ var BuildPassedConfigSchema = z20.strictObject({
497
+ /** Command to run (default: "yarn build") */
498
+ command: z20.string().optional(),
499
+ /** Expected exit code (default: 0) */
500
+ expectedExitCode: z20.number().int().optional()
501
+ });
502
+ var LlmJudgeConfigSchema = z20.object({
503
+ /**
504
+ * Prompt template with placeholders:
505
+ * - {{output}}: agent's final output
506
+ * - {{cwd}}: working directory
507
+ * - {{changedFiles}}: all files changed (new, modified)
508
+ * - {{modifiedFiles}}: only existing files that were modified
509
+ * - {{newFiles}}: only new files that were created
510
+ * - {{trace}}: step-by-step trace of tool calls
511
+ * - Custom parameters defined in the parameters array
512
+ */
513
+ prompt: z20.string().min(1),
514
+ /** Optional system prompt for the judge */
515
+ systemPrompt: z20.string().optional(),
516
+ /** Minimum score to pass (0-100, default 70) */
517
+ minScore: z20.number().int().min(0).max(100).optional(),
518
+ /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
519
+ model: z20.string().optional(),
520
+ /** Max output tokens */
521
+ maxTokens: z20.number().int().optional(),
522
+ /** Temperature (0-1) */
523
+ temperature: z20.number().min(0).max(1).optional(),
524
+ /** User-defined parameters for this assertion */
525
+ parameters: z20.array(AssertionParameterSchema).optional()
526
+ });
527
+ var AssertionConfigSchema = z20.union([
528
+ LlmJudgeConfigSchema,
529
+ // requires prompt - check first
530
+ SkillWasCalledConfigSchema,
531
+ // requires skillName
532
+ BuildPassedConfigSchema,
533
+ // all optional, uses strictObject to reject unknown keys
534
+ z20.object({})
535
+ // fallback empty config
536
+ ]);
537
+ var CustomAssertionSchema = TenantEntitySchema.extend({
538
+ /** The assertion type */
539
+ type: AssertionTypeSchema,
540
+ /** Type-specific configuration */
541
+ config: AssertionConfigSchema
542
+ });
543
+ var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
544
+ id: true,
545
+ createdAt: true,
546
+ updatedAt: true,
547
+ deleted: true
548
+ });
549
+ var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
550
+ function validateAssertionConfig(type, config) {
551
+ switch (type) {
552
+ case "skill_was_called":
553
+ return SkillWasCalledConfigSchema.safeParse(config).success;
554
+ case "build_passed":
555
+ return BuildPassedConfigSchema.safeParse(config).success;
556
+ case "llm_judge":
557
+ return LlmJudgeConfigSchema.safeParse(config).success;
558
+ default:
559
+ return false;
560
+ }
561
+ }
562
+ function getSkillWasCalledConfig(assertion) {
563
+ if (assertion.type !== "skill_was_called") return null;
564
+ const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
565
+ return result.success ? result.data : null;
566
+ }
567
+ function getBuildPassedConfig(assertion) {
568
+ if (assertion.type !== "build_passed") return null;
569
+ const result = BuildPassedConfigSchema.safeParse(assertion.config);
570
+ return result.success ? result.data : null;
571
+ }
572
+ function getLlmJudgeConfig(assertion) {
573
+ if (assertion.type !== "llm_judge") return null;
574
+ const result = LlmJudgeConfigSchema.safeParse(assertion.config);
575
+ return result.success ? result.data : null;
576
+ }
577
+
578
+ // src/scenario/test-scenario.ts
579
+ var ExpectedFileSchema = z21.object({
459
580
  /** Relative path where the file should be created */
460
- path: z20.string(),
581
+ path: z21.string(),
461
582
  /** Optional expected content */
462
- content: z20.string().optional()
583
+ content: z21.string().optional()
463
584
  });
464
585
  var TestScenarioSchema = TenantEntitySchema.extend({
465
586
  /** The prompt sent to the agent to trigger the task */
466
- triggerPrompt: z20.string().min(10),
587
+ triggerPrompt: z21.string().min(10),
467
588
  /** ID of the template to use for this scenario (null = no template) */
468
- templateId: z20.string().nullish(),
589
+ templateId: z21.string().nullish(),
469
590
  /** Inline assertions to evaluate for this scenario (legacy) */
470
- assertions: z20.array(AssertionSchema).optional(),
471
- /** IDs of saved assertions to evaluate (from assertions table) */
472
- assertionIds: z20.array(z20.string()).optional()
591
+ assertions: z21.array(AssertionSchema).optional(),
592
+ /** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
593
+ assertionIds: z21.array(z21.string()).optional(),
594
+ /** Linked assertions with per-scenario parameter values */
595
+ assertionLinks: z21.array(ScenarioAssertionLinkSchema).optional()
473
596
  });
474
597
  var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
475
598
  id: true,
@@ -480,10 +603,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
480
603
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
481
604
 
482
605
  // src/suite/test-suite.ts
483
- import { z as z21 } from "zod";
606
+ import { z as z22 } from "zod";
484
607
  var TestSuiteSchema = TenantEntitySchema.extend({
485
608
  /** IDs of test scenarios in this suite */
486
- scenarioIds: z21.array(z21.string())
609
+ scenarioIds: z22.array(z22.string())
487
610
  });
488
611
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
489
612
  id: true,
@@ -494,21 +617,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
494
617
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
495
618
 
496
619
  // src/evaluation/metrics.ts
497
- import { z as z22 } from "zod";
498
- var TokenUsageSchema = z22.object({
499
- prompt: z22.number(),
500
- completion: z22.number(),
501
- total: z22.number()
502
- });
503
- var EvalMetricsSchema = z22.object({
504
- totalAssertions: z22.number(),
505
- passed: z22.number(),
506
- failed: z22.number(),
507
- skipped: z22.number(),
508
- errors: z22.number(),
509
- passRate: z22.number(),
510
- avgDuration: z22.number(),
511
- totalDuration: z22.number()
620
+ import { z as z23 } from "zod";
621
+ var TokenUsageSchema = z23.object({
622
+ prompt: z23.number(),
623
+ completion: z23.number(),
624
+ total: z23.number()
625
+ });
626
+ var EvalMetricsSchema = z23.object({
627
+ totalAssertions: z23.number(),
628
+ passed: z23.number(),
629
+ failed: z23.number(),
630
+ skipped: z23.number(),
631
+ errors: z23.number(),
632
+ passRate: z23.number(),
633
+ avgDuration: z23.number(),
634
+ totalDuration: z23.number()
512
635
  });
513
636
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
514
637
  EvalStatus2["PENDING"] = "pending";
@@ -518,7 +641,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
518
641
  EvalStatus2["CANCELLED"] = "cancelled";
519
642
  return EvalStatus2;
520
643
  })(EvalStatus || {});
521
- var EvalStatusSchema = z22.enum(EvalStatus);
644
+ var EvalStatusSchema = z23.enum(EvalStatus);
522
645
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
523
646
  LLMStepType2["COMPLETION"] = "completion";
524
647
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -526,52 +649,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
526
649
  LLMStepType2["THINKING"] = "thinking";
527
650
  return LLMStepType2;
528
651
  })(LLMStepType || {});
529
- var LLMTraceStepSchema = z22.object({
530
- id: z22.string(),
531
- stepNumber: z22.number(),
532
- type: z22.enum(LLMStepType),
533
- model: z22.string(),
534
- provider: z22.string(),
535
- startedAt: z22.string(),
536
- durationMs: z22.number(),
652
+ var LLMTraceStepSchema = z23.object({
653
+ id: z23.string(),
654
+ stepNumber: z23.number(),
655
+ type: z23.enum(LLMStepType),
656
+ model: z23.string(),
657
+ provider: z23.string(),
658
+ startedAt: z23.string(),
659
+ durationMs: z23.number(),
537
660
  tokenUsage: TokenUsageSchema,
538
- costUsd: z22.number(),
539
- toolName: z22.string().optional(),
540
- toolArguments: z22.string().optional(),
541
- inputPreview: z22.string().optional(),
542
- outputPreview: z22.string().optional(),
543
- success: z22.boolean(),
544
- error: z22.string().optional()
545
- });
546
- var LLMBreakdownStatsSchema = z22.object({
547
- count: z22.number(),
548
- durationMs: z22.number(),
549
- tokens: z22.number(),
550
- costUsd: z22.number()
551
- });
552
- var LLMTraceSummarySchema = z22.object({
553
- totalSteps: z22.number(),
554
- totalDurationMs: z22.number(),
661
+ costUsd: z23.number(),
662
+ toolName: z23.string().optional(),
663
+ toolArguments: z23.string().optional(),
664
+ inputPreview: z23.string().optional(),
665
+ outputPreview: z23.string().optional(),
666
+ success: z23.boolean(),
667
+ error: z23.string().optional()
668
+ });
669
+ var LLMBreakdownStatsSchema = z23.object({
670
+ count: z23.number(),
671
+ durationMs: z23.number(),
672
+ tokens: z23.number(),
673
+ costUsd: z23.number()
674
+ });
675
+ var LLMTraceSummarySchema = z23.object({
676
+ totalSteps: z23.number(),
677
+ totalDurationMs: z23.number(),
555
678
  totalTokens: TokenUsageSchema,
556
- totalCostUsd: z22.number(),
557
- stepTypeBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema).optional(),
558
- modelBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema),
559
- modelsUsed: z22.array(z22.string())
560
- });
561
- var LLMTraceSchema = z22.object({
562
- id: z22.string(),
563
- steps: z22.array(LLMTraceStepSchema),
679
+ totalCostUsd: z23.number(),
680
+ stepTypeBreakdown: z23.record(z23.string(), LLMBreakdownStatsSchema).optional(),
681
+ modelBreakdown: z23.record(z23.string(), LLMBreakdownStatsSchema),
682
+ modelsUsed: z23.array(z23.string())
683
+ });
684
+ var LLMTraceSchema = z23.object({
685
+ id: z23.string(),
686
+ steps: z23.array(LLMTraceStepSchema),
564
687
  summary: LLMTraceSummarySchema
565
688
  });
566
689
 
567
690
  // src/evaluation/eval-result.ts
568
- import { z as z25 } from "zod";
691
+ import { z as z26 } from "zod";
569
692
 
570
693
  // src/evaluation/eval-run.ts
571
- import { z as z24 } from "zod";
694
+ import { z as z25 } from "zod";
572
695
 
573
696
  // src/evaluation/live-trace.ts
574
- import { z as z23 } from "zod";
697
+ import { z as z24 } from "zod";
575
698
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
576
699
  LiveTraceEventType2["THINKING"] = "thinking";
577
700
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -585,37 +708,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
585
708
  LiveTraceEventType2["USER"] = "user";
586
709
  return LiveTraceEventType2;
587
710
  })(LiveTraceEventType || {});
588
- var LiveTraceEventSchema = z23.object({
711
+ var LiveTraceEventSchema = z24.object({
589
712
  /** The evaluation run ID */
590
- evalRunId: z23.string(),
713
+ evalRunId: z24.string(),
591
714
  /** The scenario ID being executed */
592
- scenarioId: z23.string(),
715
+ scenarioId: z24.string(),
593
716
  /** The scenario name for display */
594
- scenarioName: z23.string(),
717
+ scenarioName: z24.string(),
595
718
  /** The target ID (skill, agent, etc.) */
596
- targetId: z23.string(),
719
+ targetId: z24.string(),
597
720
  /** The target name for display */
598
- targetName: z23.string(),
721
+ targetName: z24.string(),
599
722
  /** Step number in the current scenario execution */
600
- stepNumber: z23.number(),
723
+ stepNumber: z24.number(),
601
724
  /** Type of trace event */
602
- type: z23.enum(LiveTraceEventType),
725
+ type: z24.enum(LiveTraceEventType),
603
726
  /** Tool name if this is a tool_use event */
604
- toolName: z23.string().optional(),
727
+ toolName: z24.string().optional(),
605
728
  /** Tool arguments preview (truncated JSON) */
606
- toolArgs: z23.string().optional(),
729
+ toolArgs: z24.string().optional(),
607
730
  /** Output preview (truncated text) */
608
- outputPreview: z23.string().optional(),
731
+ outputPreview: z24.string().optional(),
609
732
  /** File path for file operations */
610
- filePath: z23.string().optional(),
733
+ filePath: z24.string().optional(),
611
734
  /** Elapsed time in milliseconds for progress events */
612
- elapsedMs: z23.number().optional(),
735
+ elapsedMs: z24.number().optional(),
613
736
  /** Thinking/reasoning text from Claude */
614
- thinking: z23.string().optional(),
737
+ thinking: z24.string().optional(),
615
738
  /** Timestamp when this event occurred */
616
- timestamp: z23.string(),
739
+ timestamp: z24.string(),
617
740
  /** Whether this is the final event for this scenario */
618
- isComplete: z23.boolean()
741
+ isComplete: z24.boolean()
619
742
  });
620
743
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
621
744
  function parseTraceEventLine(line) {
@@ -643,14 +766,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
643
766
  TriggerType2["MANUAL"] = "MANUAL";
644
767
  return TriggerType2;
645
768
  })(TriggerType || {});
646
- var TriggerMetadataSchema = z24.object({
647
- version: z24.string().optional(),
648
- resourceUpdated: z24.array(z24.string()).optional()
769
+ var TriggerMetadataSchema = z25.object({
770
+ version: z25.string().optional(),
771
+ resourceUpdated: z25.array(z25.string()).optional()
649
772
  });
650
- var TriggerSchema = z24.object({
651
- id: z24.string(),
773
+ var TriggerSchema = z25.object({
774
+ id: z25.string(),
652
775
  metadata: TriggerMetadataSchema.optional(),
653
- type: z24.enum(TriggerType)
776
+ type: z25.enum(TriggerType)
654
777
  });
655
778
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
656
779
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -668,28 +791,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
668
791
  FailureSeverity2["LOW"] = "low";
669
792
  return FailureSeverity2;
670
793
  })(FailureSeverity || {});
671
- var DiffLineTypeSchema = z24.enum(["added", "removed", "unchanged"]);
672
- var DiffLineSchema = z24.object({
794
+ var DiffLineTypeSchema = z25.enum(["added", "removed", "unchanged"]);
795
+ var DiffLineSchema = z25.object({
673
796
  type: DiffLineTypeSchema,
674
- content: z24.string(),
675
- lineNumber: z24.number()
676
- });
677
- var DiffContentSchema = z24.object({
678
- path: z24.string(),
679
- expected: z24.string(),
680
- actual: z24.string(),
681
- diffLines: z24.array(DiffLineSchema),
682
- renamedFrom: z24.string().optional()
683
- });
684
- var CommandExecutionSchema = z24.object({
685
- command: z24.string(),
686
- exitCode: z24.number(),
687
- output: z24.string().optional(),
688
- duration: z24.number()
689
- });
690
- var FileModificationSchema = z24.object({
691
- path: z24.string(),
692
- action: z24.enum(["created", "modified", "deleted"])
797
+ content: z25.string(),
798
+ lineNumber: z25.number()
799
+ });
800
+ var DiffContentSchema = z25.object({
801
+ path: z25.string(),
802
+ expected: z25.string(),
803
+ actual: z25.string(),
804
+ diffLines: z25.array(DiffLineSchema),
805
+ renamedFrom: z25.string().optional()
806
+ });
807
+ var CommandExecutionSchema = z25.object({
808
+ command: z25.string(),
809
+ exitCode: z25.number(),
810
+ output: z25.string().optional(),
811
+ duration: z25.number()
812
+ });
813
+ var FileModificationSchema = z25.object({
814
+ path: z25.string(),
815
+ action: z25.enum(["created", "modified", "deleted"])
693
816
  });
694
817
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
695
818
  TemplateFileStatus2["NEW"] = "new";
@@ -697,75 +820,75 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
697
820
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
698
821
  return TemplateFileStatus2;
699
822
  })(TemplateFileStatus || {});
700
- var TemplateFileSchema = z24.object({
823
+ var TemplateFileSchema = z25.object({
701
824
  /** Relative path within the template */
702
- path: z24.string(),
825
+ path: z25.string(),
703
826
  /** Full file content after execution */
704
- content: z24.string(),
827
+ content: z25.string(),
705
828
  /** File status (new, modified, unchanged) */
706
- status: z24.enum(["new", "modified", "unchanged"])
707
- });
708
- var ApiCallSchema = z24.object({
709
- endpoint: z24.string(),
710
- tokensUsed: z24.number(),
711
- duration: z24.number()
712
- });
713
- var ExecutionTraceSchema = z24.object({
714
- commands: z24.array(CommandExecutionSchema),
715
- filesModified: z24.array(FileModificationSchema),
716
- apiCalls: z24.array(ApiCallSchema),
717
- totalDuration: z24.number()
718
- });
719
- var FailureAnalysisSchema = z24.object({
720
- category: z24.enum(FailureCategory),
721
- severity: z24.enum(FailureSeverity),
722
- summary: z24.string(),
723
- details: z24.string(),
724
- rootCause: z24.string(),
725
- suggestedFix: z24.string(),
726
- relatedAssertions: z24.array(z24.string()),
727
- codeSnippet: z24.string().optional(),
728
- similarIssues: z24.array(z24.string()).optional(),
729
- patternId: z24.string().optional(),
829
+ status: z25.enum(["new", "modified", "unchanged"])
830
+ });
831
+ var ApiCallSchema = z25.object({
832
+ endpoint: z25.string(),
833
+ tokensUsed: z25.number(),
834
+ duration: z25.number()
835
+ });
836
+ var ExecutionTraceSchema = z25.object({
837
+ commands: z25.array(CommandExecutionSchema),
838
+ filesModified: z25.array(FileModificationSchema),
839
+ apiCalls: z25.array(ApiCallSchema),
840
+ totalDuration: z25.number()
841
+ });
842
+ var FailureAnalysisSchema = z25.object({
843
+ category: z25.enum(FailureCategory),
844
+ severity: z25.enum(FailureSeverity),
845
+ summary: z25.string(),
846
+ details: z25.string(),
847
+ rootCause: z25.string(),
848
+ suggestedFix: z25.string(),
849
+ relatedAssertions: z25.array(z25.string()),
850
+ codeSnippet: z25.string().optional(),
851
+ similarIssues: z25.array(z25.string()).optional(),
852
+ patternId: z25.string().optional(),
730
853
  // Extended fields for detailed debugging
731
854
  diff: DiffContentSchema.optional(),
732
855
  executionTrace: ExecutionTraceSchema.optional()
733
856
  });
734
857
  var EvalRunSchema = TenantEntitySchema.extend({
735
858
  /** Agent ID for this run */
736
- agentId: z24.string().optional(),
859
+ agentId: z25.string().optional(),
737
860
  /** Skills group ID for this run */
738
- skillsGroupId: z24.string().optional(),
861
+ skillsGroupId: z25.string().optional(),
739
862
  /** Scenario IDs to run */
740
- scenarioIds: z24.array(z24.string()),
863
+ scenarioIds: z25.array(z25.string()),
741
864
  /** Current status */
742
865
  status: EvalStatusSchema,
743
866
  /** Progress percentage (0-100) */
744
- progress: z24.number(),
867
+ progress: z25.number(),
745
868
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
746
- results: z24.array(z24.lazy(() => EvalRunResultSchema)),
869
+ results: z25.array(z25.lazy(() => EvalRunResultSchema)),
747
870
  /** Aggregated metrics across all results */
748
871
  aggregateMetrics: EvalMetricsSchema,
749
872
  /** Failure analyses */
750
- failureAnalyses: z24.array(FailureAnalysisSchema).optional(),
873
+ failureAnalyses: z25.array(FailureAnalysisSchema).optional(),
751
874
  /** Aggregated LLM trace summary */
752
875
  llmTraceSummary: LLMTraceSummarySchema.optional(),
753
876
  /** What triggered this run */
754
877
  trigger: TriggerSchema.optional(),
755
878
  /** When the run started (set when evaluation is triggered) */
756
- startedAt: z24.string().optional(),
879
+ startedAt: z25.string().optional(),
757
880
  /** When the run completed */
758
- completedAt: z24.string().optional(),
881
+ completedAt: z25.string().optional(),
759
882
  /** Live trace events captured during execution (for playback on results page) */
760
- liveTraceEvents: z24.array(LiveTraceEventSchema).optional(),
883
+ liveTraceEvents: z25.array(LiveTraceEventSchema).optional(),
761
884
  /** Remote job ID for tracking execution in Dev Machines */
762
- jobId: z24.string().optional(),
885
+ jobId: z25.string().optional(),
763
886
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
764
- jobStatus: z24.string().optional(),
887
+ jobStatus: z25.string().optional(),
765
888
  /** Remote job error message if the job failed */
766
- jobError: z24.string().optional(),
889
+ jobError: z25.string().optional(),
767
890
  /** Timestamp of the last job status check */
768
- jobStatusCheckedAt: z24.string().optional()
891
+ jobStatusCheckedAt: z25.string().optional()
769
892
  });
770
893
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
771
894
  id: true,
@@ -778,28 +901,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
778
901
  startedAt: true,
779
902
  completedAt: true
780
903
  });
781
- var EvaluationProgressSchema = z24.object({
782
- runId: z24.string(),
783
- targetId: z24.string(),
784
- totalScenarios: z24.number(),
785
- completedScenarios: z24.number(),
786
- scenarioProgress: z24.array(
787
- z24.object({
788
- scenarioId: z24.string(),
789
- currentStep: z24.string(),
790
- error: z24.string().optional()
904
+ var EvaluationProgressSchema = z25.object({
905
+ runId: z25.string(),
906
+ targetId: z25.string(),
907
+ totalScenarios: z25.number(),
908
+ completedScenarios: z25.number(),
909
+ scenarioProgress: z25.array(
910
+ z25.object({
911
+ scenarioId: z25.string(),
912
+ currentStep: z25.string(),
913
+ error: z25.string().optional()
791
914
  })
792
915
  ),
793
- createdAt: z24.number()
916
+ createdAt: z25.number()
794
917
  });
795
- var EvaluationLogSchema = z24.object({
796
- runId: z24.string(),
797
- scenarioId: z24.string(),
798
- log: z24.object({
799
- level: z24.enum(["info", "error", "debug"]),
800
- message: z24.string().optional(),
801
- args: z24.array(z24.any()).optional(),
802
- error: z24.string().optional()
918
+ var EvaluationLogSchema = z25.object({
919
+ runId: z25.string(),
920
+ scenarioId: z25.string(),
921
+ log: z25.object({
922
+ level: z25.enum(["info", "error", "debug"]),
923
+ message: z25.string().optional(),
924
+ args: z25.array(z25.any()).optional(),
925
+ error: z25.string().optional()
803
926
  })
804
927
  });
805
928
  var LLM_TIMEOUT = 12e4;
@@ -812,91 +935,91 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
812
935
  AssertionResultStatus2["ERROR"] = "error";
813
936
  return AssertionResultStatus2;
814
937
  })(AssertionResultStatus || {});
815
- var AssertionResultSchema = z25.object({
816
- id: z25.string(),
817
- assertionId: z25.string(),
818
- assertionType: z25.string(),
819
- assertionName: z25.string(),
820
- status: z25.enum(AssertionResultStatus),
821
- message: z25.string().optional(),
822
- expected: z25.string().optional(),
823
- actual: z25.string().optional(),
824
- duration: z25.number().optional(),
825
- details: z25.record(z25.string(), z25.unknown()).optional(),
826
- llmTraceSteps: z25.array(LLMTraceStepSchema).optional()
827
- });
828
- var EvalRunResultSchema = z25.object({
829
- id: z25.string(),
830
- targetId: z25.string(),
831
- targetName: z25.string().optional(),
832
- scenarioId: z25.string(),
833
- scenarioName: z25.string(),
938
+ var AssertionResultSchema = z26.object({
939
+ id: z26.string(),
940
+ assertionId: z26.string(),
941
+ assertionType: z26.string(),
942
+ assertionName: z26.string(),
943
+ status: z26.enum(AssertionResultStatus),
944
+ message: z26.string().optional(),
945
+ expected: z26.string().optional(),
946
+ actual: z26.string().optional(),
947
+ duration: z26.number().optional(),
948
+ details: z26.record(z26.string(), z26.unknown()).optional(),
949
+ llmTraceSteps: z26.array(LLMTraceStepSchema).optional()
950
+ });
951
+ var EvalRunResultSchema = z26.object({
952
+ id: z26.string(),
953
+ targetId: z26.string(),
954
+ targetName: z26.string().optional(),
955
+ scenarioId: z26.string(),
956
+ scenarioName: z26.string(),
834
957
  modelConfig: ModelConfigSchema.optional(),
835
- assertionResults: z25.array(AssertionResultSchema),
958
+ assertionResults: z26.array(AssertionResultSchema),
836
959
  metrics: EvalMetricsSchema.optional(),
837
- passed: z25.number(),
838
- failed: z25.number(),
839
- passRate: z25.number(),
840
- duration: z25.number(),
841
- outputText: z25.string().optional(),
842
- files: z25.array(ExpectedFileSchema).optional(),
843
- fileDiffs: z25.array(DiffContentSchema).optional(),
960
+ passed: z26.number(),
961
+ failed: z26.number(),
962
+ passRate: z26.number(),
963
+ duration: z26.number(),
964
+ outputText: z26.string().optional(),
965
+ files: z26.array(ExpectedFileSchema).optional(),
966
+ fileDiffs: z26.array(DiffContentSchema).optional(),
844
967
  /** Full template files after execution with status indicators */
845
- templateFiles: z25.array(TemplateFileSchema).optional(),
846
- startedAt: z25.string().optional(),
847
- completedAt: z25.string().optional(),
968
+ templateFiles: z26.array(TemplateFileSchema).optional(),
969
+ startedAt: z26.string().optional(),
970
+ completedAt: z26.string().optional(),
848
971
  llmTrace: LLMTraceSchema.optional()
849
972
  });
850
- var PromptResultSchema = z25.object({
851
- text: z25.string(),
852
- files: z25.array(z25.unknown()).optional(),
853
- finishReason: z25.string().optional(),
854
- reasoning: z25.string().optional(),
855
- reasoningDetails: z25.unknown().optional(),
856
- toolCalls: z25.array(z25.unknown()).optional(),
857
- toolResults: z25.array(z25.unknown()).optional(),
858
- warnings: z25.array(z25.unknown()).optional(),
859
- sources: z25.array(z25.unknown()).optional(),
860
- steps: z25.array(z25.unknown()),
861
- generationTimeMs: z25.number(),
862
- prompt: z25.string(),
863
- systemPrompt: z25.string(),
864
- usage: z25.object({
865
- totalTokens: z25.number().optional(),
866
- totalMicrocentsSpent: z25.number().optional()
973
+ var PromptResultSchema = z26.object({
974
+ text: z26.string(),
975
+ files: z26.array(z26.unknown()).optional(),
976
+ finishReason: z26.string().optional(),
977
+ reasoning: z26.string().optional(),
978
+ reasoningDetails: z26.unknown().optional(),
979
+ toolCalls: z26.array(z26.unknown()).optional(),
980
+ toolResults: z26.array(z26.unknown()).optional(),
981
+ warnings: z26.array(z26.unknown()).optional(),
982
+ sources: z26.array(z26.unknown()).optional(),
983
+ steps: z26.array(z26.unknown()),
984
+ generationTimeMs: z26.number(),
985
+ prompt: z26.string(),
986
+ systemPrompt: z26.string(),
987
+ usage: z26.object({
988
+ totalTokens: z26.number().optional(),
989
+ totalMicrocentsSpent: z26.number().optional()
867
990
  })
868
991
  });
869
- var EvaluationResultSchema = z25.object({
870
- id: z25.string(),
871
- runId: z25.string(),
872
- timestamp: z25.number(),
992
+ var EvaluationResultSchema = z26.object({
993
+ id: z26.string(),
994
+ runId: z26.string(),
995
+ timestamp: z26.number(),
873
996
  promptResult: PromptResultSchema,
874
- testResults: z25.array(z25.unknown()),
875
- tags: z25.array(z25.string()).optional(),
876
- feedback: z25.string().optional(),
877
- score: z25.number(),
878
- suiteId: z25.string().optional()
879
- });
880
- var LeanEvaluationResultSchema = z25.object({
881
- id: z25.string(),
882
- runId: z25.string(),
883
- timestamp: z25.number(),
884
- tags: z25.array(z25.string()).optional(),
885
- scenarioId: z25.string(),
886
- scenarioVersion: z25.number().optional(),
887
- targetId: z25.string(),
888
- targetVersion: z25.number().optional(),
889
- suiteId: z25.string().optional(),
890
- score: z25.number(),
891
- time: z25.number().optional(),
892
- microcentsSpent: z25.number().optional()
997
+ testResults: z26.array(z26.unknown()),
998
+ tags: z26.array(z26.string()).optional(),
999
+ feedback: z26.string().optional(),
1000
+ score: z26.number(),
1001
+ suiteId: z26.string().optional()
1002
+ });
1003
+ var LeanEvaluationResultSchema = z26.object({
1004
+ id: z26.string(),
1005
+ runId: z26.string(),
1006
+ timestamp: z26.number(),
1007
+ tags: z26.array(z26.string()).optional(),
1008
+ scenarioId: z26.string(),
1009
+ scenarioVersion: z26.number().optional(),
1010
+ targetId: z26.string(),
1011
+ targetVersion: z26.number().optional(),
1012
+ suiteId: z26.string().optional(),
1013
+ score: z26.number(),
1014
+ time: z26.number().optional(),
1015
+ microcentsSpent: z26.number().optional()
893
1016
  });
894
1017
 
895
1018
  // src/project/project.ts
896
- import { z as z26 } from "zod";
1019
+ import { z as z27 } from "zod";
897
1020
  var ProjectSchema = BaseEntitySchema.extend({
898
- appId: z26.string().optional().describe("The ID of the app in Dev Center"),
899
- appSecret: z26.string().optional().describe("The secret of the app in Dev Center")
1021
+ appId: z27.string().optional().describe("The ID of the app in Dev Center"),
1022
+ appSecret: z27.string().optional().describe("The secret of the app in Dev Center")
900
1023
  });
901
1024
  var CreateProjectInputSchema = ProjectSchema.omit({
902
1025
  id: true,
@@ -907,10 +1030,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
907
1030
  var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
908
1031
 
909
1032
  // src/template/template.ts
910
- import { z as z27 } from "zod";
1033
+ import { z as z28 } from "zod";
911
1034
  var TemplateSchema = TenantEntitySchema.extend({
912
1035
  /** URL to download the template from */
913
- downloadUrl: z27.url()
1036
+ downloadUrl: z28.url()
914
1037
  });
915
1038
  var CreateTemplateInputSchema = TemplateSchema.omit({
916
1039
  id: true,
@@ -920,86 +1043,69 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
920
1043
  });
921
1044
  var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
922
1045
 
923
- // src/assertion/assertion.ts
924
- import { z as z28 } from "zod";
925
- var AssertionTypeSchema = z28.enum([
926
- "skill_was_called",
927
- "build_passed",
928
- "llm_judge",
929
- "custom"
930
- ]);
931
- var SkillWasCalledConfigSchema = z28.object({
932
- /** Name of the skill that must have been called */
933
- skillName: z28.string().min(1)
934
- });
935
- var BuildPassedConfigSchema = z28.object({
936
- /** Command to run (default: "yarn build") */
937
- command: z28.string().optional(),
938
- /** Expected exit code (default: 0) */
939
- expectedExitCode: z28.number().int().optional()
940
- });
941
- var LlmJudgeConfigSchema = z28.object({
942
- /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
943
- prompt: z28.string().min(1),
944
- /** Optional system prompt for the judge */
945
- systemPrompt: z28.string().optional(),
946
- /** Minimum score to pass (0-100, default 70) */
947
- minScore: z28.number().int().min(0).max(100).optional(),
948
- /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
949
- model: z28.string().optional(),
950
- /** Max output tokens */
951
- maxTokens: z28.number().int().optional(),
952
- /** Temperature (0-1) */
953
- temperature: z28.number().min(0).max(1).optional()
954
- });
955
- var AssertionConfigSchema = z28.union([
956
- SkillWasCalledConfigSchema,
957
- BuildPassedConfigSchema,
958
- LlmJudgeConfigSchema,
959
- z28.object({})
960
- // Empty config for cases where defaults are used
961
- ]);
962
- var CustomAssertionSchema = TenantEntitySchema.extend({
963
- /** The assertion type */
964
- type: AssertionTypeSchema,
965
- /** Type-specific configuration */
966
- config: AssertionConfigSchema
967
- });
968
- var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
969
- id: true,
970
- createdAt: true,
971
- updatedAt: true,
972
- deleted: true
973
- });
974
- var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
975
- function validateAssertionConfig(type, config) {
976
- switch (type) {
977
- case "skill_was_called":
978
- return SkillWasCalledConfigSchema.safeParse(config).success;
979
- case "build_passed":
980
- return BuildPassedConfigSchema.safeParse(config).success;
981
- case "llm_judge":
982
- case "custom":
983
- return LlmJudgeConfigSchema.safeParse(config).success;
984
- default:
985
- return false;
986
- }
987
- }
988
- function getSkillWasCalledConfig(assertion) {
989
- if (assertion.type !== "skill_was_called") return null;
990
- const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
991
- return result.success ? result.data : null;
1046
+ // src/assertion/system-assertions.ts
1047
+ var SYSTEM_ASSERTION_IDS = {
1048
+ SKILL_WAS_CALLED: "system:skill_was_called",
1049
+ BUILD_PASSED: "system:build_passed"
1050
+ };
1051
+ function isSystemAssertionId(id) {
1052
+ return id.startsWith("system:");
992
1053
  }
993
- function getBuildPassedConfig(assertion) {
994
- if (assertion.type !== "build_passed") return null;
995
- const result = BuildPassedConfigSchema.safeParse(assertion.config);
996
- return result.success ? result.data : null;
1054
+ var SYSTEM_ASSERTIONS = {
1055
+ [SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
1056
+ id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
1057
+ name: "Skill Was Called",
1058
+ description: "Check if a specific skill was invoked during the agent run",
1059
+ type: "skill_was_called",
1060
+ parameters: [
1061
+ {
1062
+ name: "skillName",
1063
+ label: "Skill Name",
1064
+ type: "string",
1065
+ required: true
1066
+ }
1067
+ ]
1068
+ },
1069
+ [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
1070
+ id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
1071
+ name: "Build Passed",
1072
+ description: "Run a build command and verify it exits with expected code",
1073
+ type: "build_passed",
1074
+ parameters: [
1075
+ {
1076
+ name: "command",
1077
+ label: "Build Command",
1078
+ type: "string",
1079
+ required: false,
1080
+ defaultValue: "yarn build"
1081
+ },
1082
+ {
1083
+ name: "expectedExitCode",
1084
+ label: "Expected Exit Code",
1085
+ type: "number",
1086
+ required: false,
1087
+ defaultValue: 0
1088
+ },
1089
+ {
1090
+ name: "maxBuildTime",
1091
+ label: "Max Build Time (ms)",
1092
+ type: "number",
1093
+ required: false
1094
+ },
1095
+ {
1096
+ name: "maxMemory",
1097
+ label: "Max Memory (MB)",
1098
+ type: "number",
1099
+ required: false
1100
+ }
1101
+ ]
1102
+ }
1103
+ };
1104
+ function getSystemAssertions() {
1105
+ return Object.values(SYSTEM_ASSERTIONS);
997
1106
  }
998
- function getLlmJudgeConfig(assertion) {
999
- if (assertion.type !== "llm_judge" && assertion.type !== "custom")
1000
- return null;
1001
- const result = LlmJudgeConfigSchema.safeParse(assertion.config);
1002
- return result.success ? result.data : null;
1107
+ function getSystemAssertion(id) {
1108
+ return SYSTEM_ASSERTIONS[id];
1003
1109
  }
1004
1110
  export {
1005
1111
  AVAILABLE_MODELS,
@@ -1008,6 +1114,8 @@ export {
1008
1114
  AllowedCommands,
1009
1115
  ApiCallSchema,
1010
1116
  AssertionConfigSchema,
1117
+ AssertionParameterSchema,
1118
+ AssertionParameterTypeSchema,
1011
1119
  AssertionResultSchema,
1012
1120
  AssertionResultStatus,
1013
1121
  AssertionSchema,
@@ -1074,6 +1182,9 @@ export {
1074
1182
  ProjectSchema,
1075
1183
  PromptResultSchema,
1076
1184
  SKILL_FOLDER_NAME_REGEX,
1185
+ SYSTEM_ASSERTIONS,
1186
+ SYSTEM_ASSERTION_IDS,
1187
+ ScenarioAssertionLinkSchema,
1077
1188
  SiteConfigTestSchema,
1078
1189
  SkillMetadataSchema,
1079
1190
  SkillSchema,
@@ -1112,6 +1223,9 @@ export {
1112
1223
  getBuildPassedConfig,
1113
1224
  getLlmJudgeConfig,
1114
1225
  getSkillWasCalledConfig,
1226
+ getSystemAssertion,
1227
+ getSystemAssertions,
1228
+ isSystemAssertionId,
1115
1229
  isValidSkillFolderName,
1116
1230
  parseTraceEventLine,
1117
1231
  validateAssertionConfig