@wix/evalforge-types 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -634,128 +634,46 @@ var LLMTraceSchema = import_zod21.z.object({
634
634
  });
635
635
 
636
636
  // src/evaluation/eval-result.ts
637
- var import_zod22 = require("zod");
638
- var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
639
- AssertionResultStatus2["PASSED"] = "passed";
640
- AssertionResultStatus2["FAILED"] = "failed";
641
- AssertionResultStatus2["SKIPPED"] = "skipped";
642
- AssertionResultStatus2["ERROR"] = "error";
643
- return AssertionResultStatus2;
644
- })(AssertionResultStatus || {});
645
- var AssertionResultSchema = import_zod22.z.object({
646
- id: import_zod22.z.string(),
647
- assertionId: import_zod22.z.string(),
648
- assertionType: import_zod22.z.string(),
649
- assertionName: import_zod22.z.string(),
650
- status: import_zod22.z.enum(AssertionResultStatus),
651
- message: import_zod22.z.string().optional(),
652
- expected: import_zod22.z.string().optional(),
653
- actual: import_zod22.z.string().optional(),
654
- duration: import_zod22.z.number().optional(),
655
- details: import_zod22.z.record(import_zod22.z.string(), import_zod22.z.unknown()).optional(),
656
- llmTraceSteps: import_zod22.z.array(LLMTraceStepSchema).optional()
657
- });
658
- var EvalRunResultSchema = import_zod22.z.object({
659
- id: import_zod22.z.string(),
660
- targetId: import_zod22.z.string(),
661
- targetName: import_zod22.z.string().optional(),
662
- scenarioId: import_zod22.z.string(),
663
- scenarioName: import_zod22.z.string(),
664
- modelConfig: ModelConfigSchema.optional(),
665
- assertionResults: import_zod22.z.array(AssertionResultSchema),
666
- metrics: EvalMetricsSchema.optional(),
667
- passed: import_zod22.z.number(),
668
- failed: import_zod22.z.number(),
669
- passRate: import_zod22.z.number(),
670
- duration: import_zod22.z.number(),
671
- outputText: import_zod22.z.string().optional(),
672
- files: import_zod22.z.array(ExpectedFileSchema).optional(),
673
- startedAt: import_zod22.z.string().optional(),
674
- completedAt: import_zod22.z.string().optional(),
675
- llmTrace: LLMTraceSchema.optional()
676
- });
677
- var PromptResultSchema = import_zod22.z.object({
678
- text: import_zod22.z.string(),
679
- files: import_zod22.z.array(import_zod22.z.unknown()).optional(),
680
- finishReason: import_zod22.z.string().optional(),
681
- reasoning: import_zod22.z.string().optional(),
682
- reasoningDetails: import_zod22.z.unknown().optional(),
683
- toolCalls: import_zod22.z.array(import_zod22.z.unknown()).optional(),
684
- toolResults: import_zod22.z.array(import_zod22.z.unknown()).optional(),
685
- warnings: import_zod22.z.array(import_zod22.z.unknown()).optional(),
686
- sources: import_zod22.z.array(import_zod22.z.unknown()).optional(),
687
- steps: import_zod22.z.array(import_zod22.z.unknown()),
688
- generationTimeMs: import_zod22.z.number(),
689
- prompt: import_zod22.z.string(),
690
- systemPrompt: import_zod22.z.string(),
691
- usage: import_zod22.z.object({
692
- totalTokens: import_zod22.z.number().optional(),
693
- totalMicrocentsSpent: import_zod22.z.number().optional()
694
- })
695
- });
696
- var EvaluationResultSchema = import_zod22.z.object({
697
- id: import_zod22.z.string(),
698
- runId: import_zod22.z.string(),
699
- timestamp: import_zod22.z.number(),
700
- promptResult: PromptResultSchema,
701
- testResults: import_zod22.z.array(import_zod22.z.unknown()),
702
- tags: import_zod22.z.array(import_zod22.z.string()).optional(),
703
- feedback: import_zod22.z.string().optional(),
704
- score: import_zod22.z.number(),
705
- suiteId: import_zod22.z.string().optional()
706
- });
707
- var LeanEvaluationResultSchema = import_zod22.z.object({
708
- id: import_zod22.z.string(),
709
- runId: import_zod22.z.string(),
710
- timestamp: import_zod22.z.number(),
711
- tags: import_zod22.z.array(import_zod22.z.string()).optional(),
712
- scenarioId: import_zod22.z.string(),
713
- scenarioVersion: import_zod22.z.number().optional(),
714
- targetId: import_zod22.z.string(),
715
- targetVersion: import_zod22.z.number().optional(),
716
- suiteId: import_zod22.z.string().optional(),
717
- score: import_zod22.z.number(),
718
- time: import_zod22.z.number().optional(),
719
- microcentsSpent: import_zod22.z.number().optional()
720
- });
637
+ var import_zod24 = require("zod");
721
638
 
722
639
  // src/evaluation/eval-run.ts
723
- var import_zod24 = require("zod");
640
+ var import_zod23 = require("zod");
724
641
 
725
642
  // src/evaluation/live-trace.ts
726
- var import_zod23 = require("zod");
643
+ var import_zod22 = require("zod");
727
644
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
728
645
  LiveTraceEventType2["THINKING"] = "thinking";
729
646
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
730
647
  LiveTraceEventType2["COMPLETION"] = "completion";
731
648
  LiveTraceEventType2["TOOL_RESULT"] = "tool_result";
649
+ LiveTraceEventType2["DIAGNOSTIC"] = "diagnostic";
732
650
  return LiveTraceEventType2;
733
651
  })(LiveTraceEventType || {});
734
- var LiveTraceEventSchema = import_zod23.z.object({
652
+ var LiveTraceEventSchema = import_zod22.z.object({
735
653
  /** The evaluation run ID */
736
- evalRunId: import_zod23.z.string(),
654
+ evalRunId: import_zod22.z.string(),
737
655
  /** The scenario ID being executed */
738
- scenarioId: import_zod23.z.string(),
656
+ scenarioId: import_zod22.z.string(),
739
657
  /** The scenario name for display */
740
- scenarioName: import_zod23.z.string(),
658
+ scenarioName: import_zod22.z.string(),
741
659
  /** The target ID (skill, agent, etc.) */
742
- targetId: import_zod23.z.string(),
660
+ targetId: import_zod22.z.string(),
743
661
  /** The target name for display */
744
- targetName: import_zod23.z.string(),
662
+ targetName: import_zod22.z.string(),
745
663
  /** Step number in the current scenario execution */
746
- stepNumber: import_zod23.z.number(),
664
+ stepNumber: import_zod22.z.number(),
747
665
  /** Type of trace event */
748
- type: import_zod23.z.enum(LiveTraceEventType),
666
+ type: import_zod22.z.enum(LiveTraceEventType),
749
667
  /** Tool name if this is a tool_use event */
750
- toolName: import_zod23.z.string().optional(),
668
+ toolName: import_zod22.z.string().optional(),
751
669
  /** Tool arguments preview (truncated JSON) */
752
- toolArgs: import_zod23.z.string().optional(),
670
+ toolArgs: import_zod22.z.string().optional(),
753
671
  /** Output preview (truncated text) */
754
- outputPreview: import_zod23.z.string().optional(),
672
+ outputPreview: import_zod22.z.string().optional(),
755
673
  /** Timestamp when this event occurred */
756
- timestamp: import_zod23.z.string(),
674
+ timestamp: import_zod22.z.string(),
757
675
  /** Whether this is the final event for this scenario */
758
- isComplete: import_zod23.z.boolean()
676
+ isComplete: import_zod22.z.boolean()
759
677
  });
760
678
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
761
679
  function parseTraceEventLine(line) {
@@ -783,14 +701,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
783
701
  TriggerType2["MANUAL"] = "MANUAL";
784
702
  return TriggerType2;
785
703
  })(TriggerType || {});
786
- var TriggerMetadataSchema = import_zod24.z.object({
787
- version: import_zod24.z.string().optional(),
788
- resourceUpdated: import_zod24.z.array(import_zod24.z.string()).optional()
704
+ var TriggerMetadataSchema = import_zod23.z.object({
705
+ version: import_zod23.z.string().optional(),
706
+ resourceUpdated: import_zod23.z.array(import_zod23.z.string()).optional()
789
707
  });
790
- var TriggerSchema = import_zod24.z.object({
791
- id: import_zod24.z.string(),
708
+ var TriggerSchema = import_zod23.z.object({
709
+ id: import_zod23.z.string(),
792
710
  metadata: TriggerMetadataSchema.optional(),
793
- type: import_zod24.z.enum(TriggerType)
711
+ type: import_zod23.z.enum(TriggerType)
794
712
  });
795
713
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
796
714
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -808,89 +726,89 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
808
726
  FailureSeverity2["LOW"] = "low";
809
727
  return FailureSeverity2;
810
728
  })(FailureSeverity || {});
811
- var DiffLineTypeSchema = import_zod24.z.enum(["added", "removed", "unchanged"]);
812
- var DiffLineSchema = import_zod24.z.object({
729
+ var DiffLineTypeSchema = import_zod23.z.enum(["added", "removed", "unchanged"]);
730
+ var DiffLineSchema = import_zod23.z.object({
813
731
  type: DiffLineTypeSchema,
814
- content: import_zod24.z.string(),
815
- lineNumber: import_zod24.z.number()
816
- });
817
- var DiffContentSchema = import_zod24.z.object({
818
- path: import_zod24.z.string(),
819
- expected: import_zod24.z.string(),
820
- actual: import_zod24.z.string(),
821
- diffLines: import_zod24.z.array(DiffLineSchema)
822
- });
823
- var CommandExecutionSchema = import_zod24.z.object({
824
- command: import_zod24.z.string(),
825
- exitCode: import_zod24.z.number(),
826
- output: import_zod24.z.string().optional(),
827
- duration: import_zod24.z.number()
828
- });
829
- var FileModificationSchema = import_zod24.z.object({
830
- path: import_zod24.z.string(),
831
- action: import_zod24.z.enum(["created", "modified", "deleted"])
832
- });
833
- var ApiCallSchema = import_zod24.z.object({
834
- endpoint: import_zod24.z.string(),
835
- tokensUsed: import_zod24.z.number(),
836
- duration: import_zod24.z.number()
837
- });
838
- var ExecutionTraceSchema = import_zod24.z.object({
839
- commands: import_zod24.z.array(CommandExecutionSchema),
840
- filesModified: import_zod24.z.array(FileModificationSchema),
841
- apiCalls: import_zod24.z.array(ApiCallSchema),
842
- totalDuration: import_zod24.z.number()
843
- });
844
- var FailureAnalysisSchema = import_zod24.z.object({
845
- category: import_zod24.z.enum(FailureCategory),
846
- severity: import_zod24.z.enum(FailureSeverity),
847
- summary: import_zod24.z.string(),
848
- details: import_zod24.z.string(),
849
- rootCause: import_zod24.z.string(),
850
- suggestedFix: import_zod24.z.string(),
851
- relatedAssertions: import_zod24.z.array(import_zod24.z.string()),
852
- codeSnippet: import_zod24.z.string().optional(),
853
- similarIssues: import_zod24.z.array(import_zod24.z.string()).optional(),
854
- patternId: import_zod24.z.string().optional(),
732
+ content: import_zod23.z.string(),
733
+ lineNumber: import_zod23.z.number()
734
+ });
735
+ var DiffContentSchema = import_zod23.z.object({
736
+ path: import_zod23.z.string(),
737
+ expected: import_zod23.z.string(),
738
+ actual: import_zod23.z.string(),
739
+ diffLines: import_zod23.z.array(DiffLineSchema)
740
+ });
741
+ var CommandExecutionSchema = import_zod23.z.object({
742
+ command: import_zod23.z.string(),
743
+ exitCode: import_zod23.z.number(),
744
+ output: import_zod23.z.string().optional(),
745
+ duration: import_zod23.z.number()
746
+ });
747
+ var FileModificationSchema = import_zod23.z.object({
748
+ path: import_zod23.z.string(),
749
+ action: import_zod23.z.enum(["created", "modified", "deleted"])
750
+ });
751
+ var ApiCallSchema = import_zod23.z.object({
752
+ endpoint: import_zod23.z.string(),
753
+ tokensUsed: import_zod23.z.number(),
754
+ duration: import_zod23.z.number()
755
+ });
756
+ var ExecutionTraceSchema = import_zod23.z.object({
757
+ commands: import_zod23.z.array(CommandExecutionSchema),
758
+ filesModified: import_zod23.z.array(FileModificationSchema),
759
+ apiCalls: import_zod23.z.array(ApiCallSchema),
760
+ totalDuration: import_zod23.z.number()
761
+ });
762
+ var FailureAnalysisSchema = import_zod23.z.object({
763
+ category: import_zod23.z.enum(FailureCategory),
764
+ severity: import_zod23.z.enum(FailureSeverity),
765
+ summary: import_zod23.z.string(),
766
+ details: import_zod23.z.string(),
767
+ rootCause: import_zod23.z.string(),
768
+ suggestedFix: import_zod23.z.string(),
769
+ relatedAssertions: import_zod23.z.array(import_zod23.z.string()),
770
+ codeSnippet: import_zod23.z.string().optional(),
771
+ similarIssues: import_zod23.z.array(import_zod23.z.string()).optional(),
772
+ patternId: import_zod23.z.string().optional(),
855
773
  // Extended fields for detailed debugging
856
774
  diff: DiffContentSchema.optional(),
857
775
  executionTrace: ExecutionTraceSchema.optional()
858
776
  });
859
777
  var EvalRunSchema = TenantEntitySchema.extend({
860
778
  /** Agent ID for this run */
861
- agentId: import_zod24.z.string().optional(),
779
+ agentId: import_zod23.z.string().optional(),
862
780
  /** Skills group ID for this run */
863
- skillsGroupId: import_zod24.z.string().optional(),
781
+ skillsGroupId: import_zod23.z.string().optional(),
864
782
  /** Scenario IDs to run */
865
- scenarioIds: import_zod24.z.array(import_zod24.z.string()),
783
+ scenarioIds: import_zod23.z.array(import_zod23.z.string()),
866
784
  /** Current status */
867
785
  status: EvalStatusSchema,
868
786
  /** Progress percentage (0-100) */
869
- progress: import_zod24.z.number(),
787
+ progress: import_zod23.z.number(),
870
788
  /** Results for each scenario/target combination */
871
- results: import_zod24.z.array(EvalRunResultSchema),
789
+ results: import_zod23.z.array(EvalRunResultSchema),
872
790
  /** Aggregated metrics across all results */
873
791
  aggregateMetrics: EvalMetricsSchema,
874
792
  /** Failure analyses */
875
- failureAnalyses: import_zod24.z.array(FailureAnalysisSchema).optional(),
793
+ failureAnalyses: import_zod23.z.array(FailureAnalysisSchema).optional(),
876
794
  /** Aggregated LLM trace summary */
877
795
  llmTraceSummary: LLMTraceSummarySchema.optional(),
878
796
  /** What triggered this run */
879
797
  trigger: TriggerSchema.optional(),
880
798
  /** When the run started (set when evaluation is triggered) */
881
- startedAt: import_zod24.z.string().optional(),
799
+ startedAt: import_zod23.z.string().optional(),
882
800
  /** When the run completed */
883
- completedAt: import_zod24.z.string().optional(),
801
+ completedAt: import_zod23.z.string().optional(),
884
802
  /** Live trace events captured during execution (for playback on results page) */
885
- liveTraceEvents: import_zod24.z.array(LiveTraceEventSchema).optional(),
803
+ liveTraceEvents: import_zod23.z.array(LiveTraceEventSchema).optional(),
886
804
  /** Remote job ID for tracking execution in Dev Machines */
887
- jobId: import_zod24.z.string().optional(),
805
+ jobId: import_zod23.z.string().optional(),
888
806
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
889
- jobStatus: import_zod24.z.string().optional(),
807
+ jobStatus: import_zod23.z.string().optional(),
890
808
  /** Remote job error message if the job failed */
891
- jobError: import_zod24.z.string().optional(),
809
+ jobError: import_zod23.z.string().optional(),
892
810
  /** Timestamp of the last job status check */
893
- jobStatusCheckedAt: import_zod24.z.string().optional()
811
+ jobStatusCheckedAt: import_zod23.z.string().optional()
894
812
  });
895
813
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
896
814
  id: true,
@@ -903,32 +821,119 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
903
821
  startedAt: true,
904
822
  completedAt: true
905
823
  });
906
- var EvaluationProgressSchema = import_zod24.z.object({
907
- runId: import_zod24.z.string(),
908
- targetId: import_zod24.z.string(),
909
- totalScenarios: import_zod24.z.number(),
910
- completedScenarios: import_zod24.z.number(),
911
- scenarioProgress: import_zod24.z.array(
912
- import_zod24.z.object({
913
- scenarioId: import_zod24.z.string(),
914
- currentStep: import_zod24.z.string(),
915
- error: import_zod24.z.string().optional()
824
+ var EvaluationProgressSchema = import_zod23.z.object({
825
+ runId: import_zod23.z.string(),
826
+ targetId: import_zod23.z.string(),
827
+ totalScenarios: import_zod23.z.number(),
828
+ completedScenarios: import_zod23.z.number(),
829
+ scenarioProgress: import_zod23.z.array(
830
+ import_zod23.z.object({
831
+ scenarioId: import_zod23.z.string(),
832
+ currentStep: import_zod23.z.string(),
833
+ error: import_zod23.z.string().optional()
916
834
  })
917
835
  ),
918
- createdAt: import_zod24.z.number()
836
+ createdAt: import_zod23.z.number()
919
837
  });
920
- var EvaluationLogSchema = import_zod24.z.object({
921
- runId: import_zod24.z.string(),
922
- scenarioId: import_zod24.z.string(),
923
- log: import_zod24.z.object({
924
- level: import_zod24.z.enum(["info", "error", "debug"]),
925
- message: import_zod24.z.string().optional(),
926
- args: import_zod24.z.array(import_zod24.z.any()).optional(),
927
- error: import_zod24.z.string().optional()
838
+ var EvaluationLogSchema = import_zod23.z.object({
839
+ runId: import_zod23.z.string(),
840
+ scenarioId: import_zod23.z.string(),
841
+ log: import_zod23.z.object({
842
+ level: import_zod23.z.enum(["info", "error", "debug"]),
843
+ message: import_zod23.z.string().optional(),
844
+ args: import_zod23.z.array(import_zod23.z.any()).optional(),
845
+ error: import_zod23.z.string().optional()
928
846
  })
929
847
  });
930
848
  var LLM_TIMEOUT = 12e4;
931
849
 
850
+ // src/evaluation/eval-result.ts
851
+ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
852
+ AssertionResultStatus2["PASSED"] = "passed";
853
+ AssertionResultStatus2["FAILED"] = "failed";
854
+ AssertionResultStatus2["SKIPPED"] = "skipped";
855
+ AssertionResultStatus2["ERROR"] = "error";
856
+ return AssertionResultStatus2;
857
+ })(AssertionResultStatus || {});
858
+ var AssertionResultSchema = import_zod24.z.object({
859
+ id: import_zod24.z.string(),
860
+ assertionId: import_zod24.z.string(),
861
+ assertionType: import_zod24.z.string(),
862
+ assertionName: import_zod24.z.string(),
863
+ status: import_zod24.z.enum(AssertionResultStatus),
864
+ message: import_zod24.z.string().optional(),
865
+ expected: import_zod24.z.string().optional(),
866
+ actual: import_zod24.z.string().optional(),
867
+ duration: import_zod24.z.number().optional(),
868
+ details: import_zod24.z.record(import_zod24.z.string(), import_zod24.z.unknown()).optional(),
869
+ llmTraceSteps: import_zod24.z.array(LLMTraceStepSchema).optional()
870
+ });
871
+ var EvalRunResultSchema = import_zod24.z.object({
872
+ id: import_zod24.z.string(),
873
+ targetId: import_zod24.z.string(),
874
+ targetName: import_zod24.z.string().optional(),
875
+ scenarioId: import_zod24.z.string(),
876
+ scenarioName: import_zod24.z.string(),
877
+ modelConfig: ModelConfigSchema.optional(),
878
+ assertionResults: import_zod24.z.array(AssertionResultSchema),
879
+ metrics: EvalMetricsSchema.optional(),
880
+ passed: import_zod24.z.number(),
881
+ failed: import_zod24.z.number(),
882
+ passRate: import_zod24.z.number(),
883
+ duration: import_zod24.z.number(),
884
+ outputText: import_zod24.z.string().optional(),
885
+ files: import_zod24.z.array(ExpectedFileSchema).optional(),
886
+ /** File diffs showing changes made by the agent during execution */
887
+ fileDiffs: import_zod24.z.array(DiffContentSchema).optional(),
888
+ startedAt: import_zod24.z.string().optional(),
889
+ completedAt: import_zod24.z.string().optional(),
890
+ llmTrace: LLMTraceSchema.optional()
891
+ });
892
+ var PromptResultSchema = import_zod24.z.object({
893
+ text: import_zod24.z.string(),
894
+ files: import_zod24.z.array(import_zod24.z.unknown()).optional(),
895
+ finishReason: import_zod24.z.string().optional(),
896
+ reasoning: import_zod24.z.string().optional(),
897
+ reasoningDetails: import_zod24.z.unknown().optional(),
898
+ toolCalls: import_zod24.z.array(import_zod24.z.unknown()).optional(),
899
+ toolResults: import_zod24.z.array(import_zod24.z.unknown()).optional(),
900
+ warnings: import_zod24.z.array(import_zod24.z.unknown()).optional(),
901
+ sources: import_zod24.z.array(import_zod24.z.unknown()).optional(),
902
+ steps: import_zod24.z.array(import_zod24.z.unknown()),
903
+ generationTimeMs: import_zod24.z.number(),
904
+ prompt: import_zod24.z.string(),
905
+ systemPrompt: import_zod24.z.string(),
906
+ usage: import_zod24.z.object({
907
+ totalTokens: import_zod24.z.number().optional(),
908
+ totalMicrocentsSpent: import_zod24.z.number().optional()
909
+ })
910
+ });
911
+ var EvaluationResultSchema = import_zod24.z.object({
912
+ id: import_zod24.z.string(),
913
+ runId: import_zod24.z.string(),
914
+ timestamp: import_zod24.z.number(),
915
+ promptResult: PromptResultSchema,
916
+ testResults: import_zod24.z.array(import_zod24.z.unknown()),
917
+ tags: import_zod24.z.array(import_zod24.z.string()).optional(),
918
+ feedback: import_zod24.z.string().optional(),
919
+ score: import_zod24.z.number(),
920
+ suiteId: import_zod24.z.string().optional()
921
+ });
922
+ var LeanEvaluationResultSchema = import_zod24.z.object({
923
+ id: import_zod24.z.string(),
924
+ runId: import_zod24.z.string(),
925
+ timestamp: import_zod24.z.number(),
926
+ tags: import_zod24.z.array(import_zod24.z.string()).optional(),
927
+ scenarioId: import_zod24.z.string(),
928
+ scenarioVersion: import_zod24.z.number().optional(),
929
+ targetId: import_zod24.z.string(),
930
+ targetVersion: import_zod24.z.number().optional(),
931
+ suiteId: import_zod24.z.string().optional(),
932
+ score: import_zod24.z.number(),
933
+ time: import_zod24.z.number().optional(),
934
+ microcentsSpent: import_zod24.z.number().optional()
935
+ });
936
+
932
937
  // src/project/project.ts
933
938
  var import_zod25 = require("zod");
934
939
  var ProjectSchema = BaseEntitySchema.extend({