@wix/evalforge-types 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -634,96 +634,13 @@ var LLMTraceSchema = import_zod21.z.object({
634
634
  });
635
635
 
636
636
  // src/evaluation/eval-result.ts
637
- var import_zod22 = require("zod");
638
- var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
639
- AssertionResultStatus2["PASSED"] = "passed";
640
- AssertionResultStatus2["FAILED"] = "failed";
641
- AssertionResultStatus2["SKIPPED"] = "skipped";
642
- AssertionResultStatus2["ERROR"] = "error";
643
- return AssertionResultStatus2;
644
- })(AssertionResultStatus || {});
645
- var AssertionResultSchema = import_zod22.z.object({
646
- id: import_zod22.z.string(),
647
- assertionId: import_zod22.z.string(),
648
- assertionType: import_zod22.z.string(),
649
- assertionName: import_zod22.z.string(),
650
- status: import_zod22.z.enum(AssertionResultStatus),
651
- message: import_zod22.z.string().optional(),
652
- expected: import_zod22.z.string().optional(),
653
- actual: import_zod22.z.string().optional(),
654
- duration: import_zod22.z.number().optional(),
655
- details: import_zod22.z.record(import_zod22.z.string(), import_zod22.z.unknown()).optional(),
656
- llmTraceSteps: import_zod22.z.array(LLMTraceStepSchema).optional()
657
- });
658
- var EvalRunResultSchema = import_zod22.z.object({
659
- id: import_zod22.z.string(),
660
- targetId: import_zod22.z.string(),
661
- targetName: import_zod22.z.string().optional(),
662
- scenarioId: import_zod22.z.string(),
663
- scenarioName: import_zod22.z.string(),
664
- modelConfig: ModelConfigSchema.optional(),
665
- assertionResults: import_zod22.z.array(AssertionResultSchema),
666
- metrics: EvalMetricsSchema.optional(),
667
- passed: import_zod22.z.number(),
668
- failed: import_zod22.z.number(),
669
- passRate: import_zod22.z.number(),
670
- duration: import_zod22.z.number(),
671
- outputText: import_zod22.z.string().optional(),
672
- files: import_zod22.z.array(ExpectedFileSchema).optional(),
673
- startedAt: import_zod22.z.string().optional(),
674
- completedAt: import_zod22.z.string().optional(),
675
- llmTrace: LLMTraceSchema.optional()
676
- });
677
- var PromptResultSchema = import_zod22.z.object({
678
- text: import_zod22.z.string(),
679
- files: import_zod22.z.array(import_zod22.z.unknown()).optional(),
680
- finishReason: import_zod22.z.string().optional(),
681
- reasoning: import_zod22.z.string().optional(),
682
- reasoningDetails: import_zod22.z.unknown().optional(),
683
- toolCalls: import_zod22.z.array(import_zod22.z.unknown()).optional(),
684
- toolResults: import_zod22.z.array(import_zod22.z.unknown()).optional(),
685
- warnings: import_zod22.z.array(import_zod22.z.unknown()).optional(),
686
- sources: import_zod22.z.array(import_zod22.z.unknown()).optional(),
687
- steps: import_zod22.z.array(import_zod22.z.unknown()),
688
- generationTimeMs: import_zod22.z.number(),
689
- prompt: import_zod22.z.string(),
690
- systemPrompt: import_zod22.z.string(),
691
- usage: import_zod22.z.object({
692
- totalTokens: import_zod22.z.number().optional(),
693
- totalMicrocentsSpent: import_zod22.z.number().optional()
694
- })
695
- });
696
- var EvaluationResultSchema = import_zod22.z.object({
697
- id: import_zod22.z.string(),
698
- runId: import_zod22.z.string(),
699
- timestamp: import_zod22.z.number(),
700
- promptResult: PromptResultSchema,
701
- testResults: import_zod22.z.array(import_zod22.z.unknown()),
702
- tags: import_zod22.z.array(import_zod22.z.string()).optional(),
703
- feedback: import_zod22.z.string().optional(),
704
- score: import_zod22.z.number(),
705
- suiteId: import_zod22.z.string().optional()
706
- });
707
- var LeanEvaluationResultSchema = import_zod22.z.object({
708
- id: import_zod22.z.string(),
709
- runId: import_zod22.z.string(),
710
- timestamp: import_zod22.z.number(),
711
- tags: import_zod22.z.array(import_zod22.z.string()).optional(),
712
- scenarioId: import_zod22.z.string(),
713
- scenarioVersion: import_zod22.z.number().optional(),
714
- targetId: import_zod22.z.string(),
715
- targetVersion: import_zod22.z.number().optional(),
716
- suiteId: import_zod22.z.string().optional(),
717
- score: import_zod22.z.number(),
718
- time: import_zod22.z.number().optional(),
719
- microcentsSpent: import_zod22.z.number().optional()
720
- });
637
+ var import_zod24 = require("zod");
721
638
 
722
639
  // src/evaluation/eval-run.ts
723
- var import_zod24 = require("zod");
640
+ var import_zod23 = require("zod");
724
641
 
725
642
  // src/evaluation/live-trace.ts
726
- var import_zod23 = require("zod");
643
+ var import_zod22 = require("zod");
727
644
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
728
645
  LiveTraceEventType2["THINKING"] = "thinking";
729
646
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -732,31 +649,31 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
732
649
  LiveTraceEventType2["DIAGNOSTIC"] = "diagnostic";
733
650
  return LiveTraceEventType2;
734
651
  })(LiveTraceEventType || {});
735
- var LiveTraceEventSchema = import_zod23.z.object({
652
+ var LiveTraceEventSchema = import_zod22.z.object({
736
653
  /** The evaluation run ID */
737
- evalRunId: import_zod23.z.string(),
654
+ evalRunId: import_zod22.z.string(),
738
655
  /** The scenario ID being executed */
739
- scenarioId: import_zod23.z.string(),
656
+ scenarioId: import_zod22.z.string(),
740
657
  /** The scenario name for display */
741
- scenarioName: import_zod23.z.string(),
658
+ scenarioName: import_zod22.z.string(),
742
659
  /** The target ID (skill, agent, etc.) */
743
- targetId: import_zod23.z.string(),
660
+ targetId: import_zod22.z.string(),
744
661
  /** The target name for display */
745
- targetName: import_zod23.z.string(),
662
+ targetName: import_zod22.z.string(),
746
663
  /** Step number in the current scenario execution */
747
- stepNumber: import_zod23.z.number(),
664
+ stepNumber: import_zod22.z.number(),
748
665
  /** Type of trace event */
749
- type: import_zod23.z.enum(LiveTraceEventType),
666
+ type: import_zod22.z.enum(LiveTraceEventType),
750
667
  /** Tool name if this is a tool_use event */
751
- toolName: import_zod23.z.string().optional(),
668
+ toolName: import_zod22.z.string().optional(),
752
669
  /** Tool arguments preview (truncated JSON) */
753
- toolArgs: import_zod23.z.string().optional(),
670
+ toolArgs: import_zod22.z.string().optional(),
754
671
  /** Output preview (truncated text) */
755
- outputPreview: import_zod23.z.string().optional(),
672
+ outputPreview: import_zod22.z.string().optional(),
756
673
  /** Timestamp when this event occurred */
757
- timestamp: import_zod23.z.string(),
674
+ timestamp: import_zod22.z.string(),
758
675
  /** Whether this is the final event for this scenario */
759
- isComplete: import_zod23.z.boolean()
676
+ isComplete: import_zod22.z.boolean()
760
677
  });
761
678
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
762
679
  function parseTraceEventLine(line) {
@@ -784,14 +701,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
784
701
  TriggerType2["MANUAL"] = "MANUAL";
785
702
  return TriggerType2;
786
703
  })(TriggerType || {});
787
- var TriggerMetadataSchema = import_zod24.z.object({
788
- version: import_zod24.z.string().optional(),
789
- resourceUpdated: import_zod24.z.array(import_zod24.z.string()).optional()
704
+ var TriggerMetadataSchema = import_zod23.z.object({
705
+ version: import_zod23.z.string().optional(),
706
+ resourceUpdated: import_zod23.z.array(import_zod23.z.string()).optional()
790
707
  });
791
- var TriggerSchema = import_zod24.z.object({
792
- id: import_zod24.z.string(),
708
+ var TriggerSchema = import_zod23.z.object({
709
+ id: import_zod23.z.string(),
793
710
  metadata: TriggerMetadataSchema.optional(),
794
- type: import_zod24.z.enum(TriggerType)
711
+ type: import_zod23.z.enum(TriggerType)
795
712
  });
796
713
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
797
714
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -809,89 +726,89 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
809
726
  FailureSeverity2["LOW"] = "low";
810
727
  return FailureSeverity2;
811
728
  })(FailureSeverity || {});
812
- var DiffLineTypeSchema = import_zod24.z.enum(["added", "removed", "unchanged"]);
813
- var DiffLineSchema = import_zod24.z.object({
729
+ var DiffLineTypeSchema = import_zod23.z.enum(["added", "removed", "unchanged"]);
730
+ var DiffLineSchema = import_zod23.z.object({
814
731
  type: DiffLineTypeSchema,
815
- content: import_zod24.z.string(),
816
- lineNumber: import_zod24.z.number()
817
- });
818
- var DiffContentSchema = import_zod24.z.object({
819
- path: import_zod24.z.string(),
820
- expected: import_zod24.z.string(),
821
- actual: import_zod24.z.string(),
822
- diffLines: import_zod24.z.array(DiffLineSchema)
823
- });
824
- var CommandExecutionSchema = import_zod24.z.object({
825
- command: import_zod24.z.string(),
826
- exitCode: import_zod24.z.number(),
827
- output: import_zod24.z.string().optional(),
828
- duration: import_zod24.z.number()
829
- });
830
- var FileModificationSchema = import_zod24.z.object({
831
- path: import_zod24.z.string(),
832
- action: import_zod24.z.enum(["created", "modified", "deleted"])
833
- });
834
- var ApiCallSchema = import_zod24.z.object({
835
- endpoint: import_zod24.z.string(),
836
- tokensUsed: import_zod24.z.number(),
837
- duration: import_zod24.z.number()
838
- });
839
- var ExecutionTraceSchema = import_zod24.z.object({
840
- commands: import_zod24.z.array(CommandExecutionSchema),
841
- filesModified: import_zod24.z.array(FileModificationSchema),
842
- apiCalls: import_zod24.z.array(ApiCallSchema),
843
- totalDuration: import_zod24.z.number()
844
- });
845
- var FailureAnalysisSchema = import_zod24.z.object({
846
- category: import_zod24.z.enum(FailureCategory),
847
- severity: import_zod24.z.enum(FailureSeverity),
848
- summary: import_zod24.z.string(),
849
- details: import_zod24.z.string(),
850
- rootCause: import_zod24.z.string(),
851
- suggestedFix: import_zod24.z.string(),
852
- relatedAssertions: import_zod24.z.array(import_zod24.z.string()),
853
- codeSnippet: import_zod24.z.string().optional(),
854
- similarIssues: import_zod24.z.array(import_zod24.z.string()).optional(),
855
- patternId: import_zod24.z.string().optional(),
732
+ content: import_zod23.z.string(),
733
+ lineNumber: import_zod23.z.number()
734
+ });
735
+ var DiffContentSchema = import_zod23.z.object({
736
+ path: import_zod23.z.string(),
737
+ expected: import_zod23.z.string(),
738
+ actual: import_zod23.z.string(),
739
+ diffLines: import_zod23.z.array(DiffLineSchema)
740
+ });
741
+ var CommandExecutionSchema = import_zod23.z.object({
742
+ command: import_zod23.z.string(),
743
+ exitCode: import_zod23.z.number(),
744
+ output: import_zod23.z.string().optional(),
745
+ duration: import_zod23.z.number()
746
+ });
747
+ var FileModificationSchema = import_zod23.z.object({
748
+ path: import_zod23.z.string(),
749
+ action: import_zod23.z.enum(["created", "modified", "deleted"])
750
+ });
751
+ var ApiCallSchema = import_zod23.z.object({
752
+ endpoint: import_zod23.z.string(),
753
+ tokensUsed: import_zod23.z.number(),
754
+ duration: import_zod23.z.number()
755
+ });
756
+ var ExecutionTraceSchema = import_zod23.z.object({
757
+ commands: import_zod23.z.array(CommandExecutionSchema),
758
+ filesModified: import_zod23.z.array(FileModificationSchema),
759
+ apiCalls: import_zod23.z.array(ApiCallSchema),
760
+ totalDuration: import_zod23.z.number()
761
+ });
762
+ var FailureAnalysisSchema = import_zod23.z.object({
763
+ category: import_zod23.z.enum(FailureCategory),
764
+ severity: import_zod23.z.enum(FailureSeverity),
765
+ summary: import_zod23.z.string(),
766
+ details: import_zod23.z.string(),
767
+ rootCause: import_zod23.z.string(),
768
+ suggestedFix: import_zod23.z.string(),
769
+ relatedAssertions: import_zod23.z.array(import_zod23.z.string()),
770
+ codeSnippet: import_zod23.z.string().optional(),
771
+ similarIssues: import_zod23.z.array(import_zod23.z.string()).optional(),
772
+ patternId: import_zod23.z.string().optional(),
856
773
  // Extended fields for detailed debugging
857
774
  diff: DiffContentSchema.optional(),
858
775
  executionTrace: ExecutionTraceSchema.optional()
859
776
  });
860
777
  var EvalRunSchema = TenantEntitySchema.extend({
861
778
  /** Agent ID for this run */
862
- agentId: import_zod24.z.string().optional(),
779
+ agentId: import_zod23.z.string().optional(),
863
780
  /** Skills group ID for this run */
864
- skillsGroupId: import_zod24.z.string().optional(),
781
+ skillsGroupId: import_zod23.z.string().optional(),
865
782
  /** Scenario IDs to run */
866
- scenarioIds: import_zod24.z.array(import_zod24.z.string()),
783
+ scenarioIds: import_zod23.z.array(import_zod23.z.string()),
867
784
  /** Current status */
868
785
  status: EvalStatusSchema,
869
786
  /** Progress percentage (0-100) */
870
- progress: import_zod24.z.number(),
787
+ progress: import_zod23.z.number(),
871
788
  /** Results for each scenario/target combination */
872
- results: import_zod24.z.array(EvalRunResultSchema),
789
+ results: import_zod23.z.array(EvalRunResultSchema),
873
790
  /** Aggregated metrics across all results */
874
791
  aggregateMetrics: EvalMetricsSchema,
875
792
  /** Failure analyses */
876
- failureAnalyses: import_zod24.z.array(FailureAnalysisSchema).optional(),
793
+ failureAnalyses: import_zod23.z.array(FailureAnalysisSchema).optional(),
877
794
  /** Aggregated LLM trace summary */
878
795
  llmTraceSummary: LLMTraceSummarySchema.optional(),
879
796
  /** What triggered this run */
880
797
  trigger: TriggerSchema.optional(),
881
798
  /** When the run started (set when evaluation is triggered) */
882
- startedAt: import_zod24.z.string().optional(),
799
+ startedAt: import_zod23.z.string().optional(),
883
800
  /** When the run completed */
884
- completedAt: import_zod24.z.string().optional(),
801
+ completedAt: import_zod23.z.string().optional(),
885
802
  /** Live trace events captured during execution (for playback on results page) */
886
- liveTraceEvents: import_zod24.z.array(LiveTraceEventSchema).optional(),
803
+ liveTraceEvents: import_zod23.z.array(LiveTraceEventSchema).optional(),
887
804
  /** Remote job ID for tracking execution in Dev Machines */
888
- jobId: import_zod24.z.string().optional(),
805
+ jobId: import_zod23.z.string().optional(),
889
806
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
890
- jobStatus: import_zod24.z.string().optional(),
807
+ jobStatus: import_zod23.z.string().optional(),
891
808
  /** Remote job error message if the job failed */
892
- jobError: import_zod24.z.string().optional(),
809
+ jobError: import_zod23.z.string().optional(),
893
810
  /** Timestamp of the last job status check */
894
- jobStatusCheckedAt: import_zod24.z.string().optional()
811
+ jobStatusCheckedAt: import_zod23.z.string().optional()
895
812
  });
896
813
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
897
814
  id: true,
@@ -904,32 +821,119 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
904
821
  startedAt: true,
905
822
  completedAt: true
906
823
  });
907
- var EvaluationProgressSchema = import_zod24.z.object({
908
- runId: import_zod24.z.string(),
909
- targetId: import_zod24.z.string(),
910
- totalScenarios: import_zod24.z.number(),
911
- completedScenarios: import_zod24.z.number(),
912
- scenarioProgress: import_zod24.z.array(
913
- import_zod24.z.object({
914
- scenarioId: import_zod24.z.string(),
915
- currentStep: import_zod24.z.string(),
916
- error: import_zod24.z.string().optional()
824
+ var EvaluationProgressSchema = import_zod23.z.object({
825
+ runId: import_zod23.z.string(),
826
+ targetId: import_zod23.z.string(),
827
+ totalScenarios: import_zod23.z.number(),
828
+ completedScenarios: import_zod23.z.number(),
829
+ scenarioProgress: import_zod23.z.array(
830
+ import_zod23.z.object({
831
+ scenarioId: import_zod23.z.string(),
832
+ currentStep: import_zod23.z.string(),
833
+ error: import_zod23.z.string().optional()
917
834
  })
918
835
  ),
919
- createdAt: import_zod24.z.number()
836
+ createdAt: import_zod23.z.number()
920
837
  });
921
- var EvaluationLogSchema = import_zod24.z.object({
922
- runId: import_zod24.z.string(),
923
- scenarioId: import_zod24.z.string(),
924
- log: import_zod24.z.object({
925
- level: import_zod24.z.enum(["info", "error", "debug"]),
926
- message: import_zod24.z.string().optional(),
927
- args: import_zod24.z.array(import_zod24.z.any()).optional(),
928
- error: import_zod24.z.string().optional()
838
+ var EvaluationLogSchema = import_zod23.z.object({
839
+ runId: import_zod23.z.string(),
840
+ scenarioId: import_zod23.z.string(),
841
+ log: import_zod23.z.object({
842
+ level: import_zod23.z.enum(["info", "error", "debug"]),
843
+ message: import_zod23.z.string().optional(),
844
+ args: import_zod23.z.array(import_zod23.z.any()).optional(),
845
+ error: import_zod23.z.string().optional()
929
846
  })
930
847
  });
931
848
  var LLM_TIMEOUT = 12e4;
932
849
 
850
+ // src/evaluation/eval-result.ts
851
+ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
852
+ AssertionResultStatus2["PASSED"] = "passed";
853
+ AssertionResultStatus2["FAILED"] = "failed";
854
+ AssertionResultStatus2["SKIPPED"] = "skipped";
855
+ AssertionResultStatus2["ERROR"] = "error";
856
+ return AssertionResultStatus2;
857
+ })(AssertionResultStatus || {});
858
+ var AssertionResultSchema = import_zod24.z.object({
859
+ id: import_zod24.z.string(),
860
+ assertionId: import_zod24.z.string(),
861
+ assertionType: import_zod24.z.string(),
862
+ assertionName: import_zod24.z.string(),
863
+ status: import_zod24.z.enum(AssertionResultStatus),
864
+ message: import_zod24.z.string().optional(),
865
+ expected: import_zod24.z.string().optional(),
866
+ actual: import_zod24.z.string().optional(),
867
+ duration: import_zod24.z.number().optional(),
868
+ details: import_zod24.z.record(import_zod24.z.string(), import_zod24.z.unknown()).optional(),
869
+ llmTraceSteps: import_zod24.z.array(LLMTraceStepSchema).optional()
870
+ });
871
+ var EvalRunResultSchema = import_zod24.z.object({
872
+ id: import_zod24.z.string(),
873
+ targetId: import_zod24.z.string(),
874
+ targetName: import_zod24.z.string().optional(),
875
+ scenarioId: import_zod24.z.string(),
876
+ scenarioName: import_zod24.z.string(),
877
+ modelConfig: ModelConfigSchema.optional(),
878
+ assertionResults: import_zod24.z.array(AssertionResultSchema),
879
+ metrics: EvalMetricsSchema.optional(),
880
+ passed: import_zod24.z.number(),
881
+ failed: import_zod24.z.number(),
882
+ passRate: import_zod24.z.number(),
883
+ duration: import_zod24.z.number(),
884
+ outputText: import_zod24.z.string().optional(),
885
+ files: import_zod24.z.array(ExpectedFileSchema).optional(),
886
+ /** File diffs showing changes made by the agent during execution */
887
+ fileDiffs: import_zod24.z.array(DiffContentSchema).optional(),
888
+ startedAt: import_zod24.z.string().optional(),
889
+ completedAt: import_zod24.z.string().optional(),
890
+ llmTrace: LLMTraceSchema.optional()
891
+ });
892
+ var PromptResultSchema = import_zod24.z.object({
893
+ text: import_zod24.z.string(),
894
+ files: import_zod24.z.array(import_zod24.z.unknown()).optional(),
895
+ finishReason: import_zod24.z.string().optional(),
896
+ reasoning: import_zod24.z.string().optional(),
897
+ reasoningDetails: import_zod24.z.unknown().optional(),
898
+ toolCalls: import_zod24.z.array(import_zod24.z.unknown()).optional(),
899
+ toolResults: import_zod24.z.array(import_zod24.z.unknown()).optional(),
900
+ warnings: import_zod24.z.array(import_zod24.z.unknown()).optional(),
901
+ sources: import_zod24.z.array(import_zod24.z.unknown()).optional(),
902
+ steps: import_zod24.z.array(import_zod24.z.unknown()),
903
+ generationTimeMs: import_zod24.z.number(),
904
+ prompt: import_zod24.z.string(),
905
+ systemPrompt: import_zod24.z.string(),
906
+ usage: import_zod24.z.object({
907
+ totalTokens: import_zod24.z.number().optional(),
908
+ totalMicrocentsSpent: import_zod24.z.number().optional()
909
+ })
910
+ });
911
+ var EvaluationResultSchema = import_zod24.z.object({
912
+ id: import_zod24.z.string(),
913
+ runId: import_zod24.z.string(),
914
+ timestamp: import_zod24.z.number(),
915
+ promptResult: PromptResultSchema,
916
+ testResults: import_zod24.z.array(import_zod24.z.unknown()),
917
+ tags: import_zod24.z.array(import_zod24.z.string()).optional(),
918
+ feedback: import_zod24.z.string().optional(),
919
+ score: import_zod24.z.number(),
920
+ suiteId: import_zod24.z.string().optional()
921
+ });
922
+ var LeanEvaluationResultSchema = import_zod24.z.object({
923
+ id: import_zod24.z.string(),
924
+ runId: import_zod24.z.string(),
925
+ timestamp: import_zod24.z.number(),
926
+ tags: import_zod24.z.array(import_zod24.z.string()).optional(),
927
+ scenarioId: import_zod24.z.string(),
928
+ scenarioVersion: import_zod24.z.number().optional(),
929
+ targetId: import_zod24.z.string(),
930
+ targetVersion: import_zod24.z.number().optional(),
931
+ suiteId: import_zod24.z.string().optional(),
932
+ score: import_zod24.z.number(),
933
+ time: import_zod24.z.number().optional(),
934
+ microcentsSpent: import_zod24.z.number().optional()
935
+ });
936
+
933
937
  // src/project/project.ts
934
938
  var import_zod25 = require("zod");
935
939
  var ProjectSchema = BaseEntitySchema.extend({