@wix/evalforge-types 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -374,34 +374,67 @@ var TestSchema = z17.discriminatedUnion("type", [
374
374
  PlaywrightNLTestSchema
375
375
  ]);
376
376
 
377
- // src/scenario/environment.ts
377
+ // src/scenario/assertions.ts
378
378
  import { z as z18 } from "zod";
379
- var LocalProjectConfigSchema = z18.object({
379
+ var SkillWasCalledAssertionSchema = z18.object({
380
+ type: z18.literal("skill_was_called"),
381
+ /** Name of the skill that must have been called (matched against trace Skill tool args) */
382
+ skillName: z18.string()
383
+ });
384
+ var BuildPassedAssertionSchema = z18.object({
385
+ type: z18.literal("build_passed"),
386
+ /** Command to run (default: "yarn build") */
387
+ command: z18.string().optional(),
388
+ /** Expected exit code (default: 0) */
389
+ expectedExitCode: z18.number().int().optional()
390
+ });
391
+ var LlmJudgeAssertionSchema = z18.object({
392
+ type: z18.literal("llm_judge"),
393
+ /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
394
+ prompt: z18.string(),
395
+ /** Optional system prompt for the judge (default asks for JSON with score) */
396
+ systemPrompt: z18.string().optional(),
397
+ /** Minimum score to pass (0–100, default 70) */
398
+ minScore: z18.number().int().min(0).max(100).optional(),
399
+ /** Model for the judge (e.g. claude-3-5-haiku) */
400
+ model: z18.string().optional(),
401
+ maxTokens: z18.number().int().optional(),
402
+ temperature: z18.number().min(0).max(1).optional()
403
+ });
404
+ var AssertionSchema = z18.discriminatedUnion("type", [
405
+ SkillWasCalledAssertionSchema,
406
+ BuildPassedAssertionSchema,
407
+ LlmJudgeAssertionSchema
408
+ ]);
409
+
410
+ // src/scenario/environment.ts
411
+ import { z as z19 } from "zod";
412
+ var LocalProjectConfigSchema = z19.object({
380
413
  /** Template ID to use for the local project */
381
- templateId: z18.string().optional(),
414
+ templateId: z19.string().optional(),
382
415
  /** Files to create in the project */
383
- files: z18.array(
384
- z18.object({
385
- path: z18.string().min(1),
386
- content: z18.string().min(1)
416
+ files: z19.array(
417
+ z19.object({
418
+ path: z19.string().min(1),
419
+ content: z19.string().min(1)
387
420
  })
388
421
  ).optional()
389
422
  });
390
- var MetaSiteConfigSchema = z18.object({
391
- configurations: z18.array(
392
- z18.object({
393
- name: z18.string().min(1),
394
- apiCalls: z18.array(
395
- z18.object({
396
- url: z18.string().url(),
397
- method: z18.enum(["POST", "PUT"]),
398
- body: z18.string()
423
+ var MetaSiteConfigSchema = z19.object({
424
+ configurations: z19.array(
425
+ z19.object({
426
+ name: z19.string().min(1),
427
+ apiCalls: z19.array(
428
+ z19.object({
429
+ url: z19.string().url(),
430
+ method: z19.enum(["POST", "PUT"]),
431
+ body: z19.string()
399
432
  })
400
433
  )
401
434
  })
402
435
  ).optional()
403
436
  });
404
- var EnvironmentSchema = z18.object({
437
+ var EnvironmentSchema = z19.object({
405
438
  /** Local project configuration */
406
439
  localProject: LocalProjectConfigSchema.optional(),
407
440
  /** Meta site configuration */
@@ -409,18 +442,20 @@ var EnvironmentSchema = z18.object({
409
442
  });
410
443
 
411
444
  // src/scenario/test-scenario.ts
412
- import { z as z19 } from "zod";
413
- var ExpectedFileSchema = z19.object({
445
+ import { z as z20 } from "zod";
446
+ var ExpectedFileSchema = z20.object({
414
447
  /** Relative path where the file should be created */
415
- path: z19.string(),
448
+ path: z20.string(),
416
449
  /** Optional expected content */
417
- content: z19.string().optional()
450
+ content: z20.string().optional()
418
451
  });
419
452
  var TestScenarioSchema = TenantEntitySchema.extend({
420
453
  /** The prompt sent to the agent to trigger the task */
421
- triggerPrompt: z19.string().min(10),
454
+ triggerPrompt: z20.string().min(10),
422
455
  /** ID of the template to use for this scenario */
423
- templateId: z19.string().optional()
456
+ templateId: z20.string().optional(),
457
+ /** Assertions to evaluate for this scenario */
458
+ assertions: z20.array(AssertionSchema).optional()
424
459
  });
425
460
  var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
426
461
  id: true,
@@ -431,10 +466,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
431
466
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
432
467
 
433
468
  // src/suite/test-suite.ts
434
- import { z as z20 } from "zod";
469
+ import { z as z21 } from "zod";
435
470
  var TestSuiteSchema = TenantEntitySchema.extend({
436
471
  /** IDs of test scenarios in this suite */
437
- scenarioIds: z20.array(z20.string())
472
+ scenarioIds: z21.array(z21.string())
438
473
  });
439
474
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
440
475
  id: true,
@@ -445,21 +480,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
445
480
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
446
481
 
447
482
  // src/evaluation/metrics.ts
448
- import { z as z21 } from "zod";
449
- var TokenUsageSchema = z21.object({
450
- prompt: z21.number(),
451
- completion: z21.number(),
452
- total: z21.number()
453
- });
454
- var EvalMetricsSchema = z21.object({
455
- totalAssertions: z21.number(),
456
- passed: z21.number(),
457
- failed: z21.number(),
458
- skipped: z21.number(),
459
- errors: z21.number(),
460
- passRate: z21.number(),
461
- avgDuration: z21.number(),
462
- totalDuration: z21.number()
483
+ import { z as z22 } from "zod";
484
+ var TokenUsageSchema = z22.object({
485
+ prompt: z22.number(),
486
+ completion: z22.number(),
487
+ total: z22.number()
488
+ });
489
+ var EvalMetricsSchema = z22.object({
490
+ totalAssertions: z22.number(),
491
+ passed: z22.number(),
492
+ failed: z22.number(),
493
+ skipped: z22.number(),
494
+ errors: z22.number(),
495
+ passRate: z22.number(),
496
+ avgDuration: z22.number(),
497
+ totalDuration: z22.number()
463
498
  });
464
499
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
465
500
  EvalStatus2["PENDING"] = "pending";
@@ -469,7 +504,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
469
504
  EvalStatus2["CANCELLED"] = "cancelled";
470
505
  return EvalStatus2;
471
506
  })(EvalStatus || {});
472
- var EvalStatusSchema = z21.enum(EvalStatus);
507
+ var EvalStatusSchema = z22.enum(EvalStatus);
473
508
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
474
509
  LLMStepType2["COMPLETION"] = "completion";
475
510
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -477,52 +512,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
477
512
  LLMStepType2["THINKING"] = "thinking";
478
513
  return LLMStepType2;
479
514
  })(LLMStepType || {});
480
- var LLMTraceStepSchema = z21.object({
481
- id: z21.string(),
482
- stepNumber: z21.number(),
483
- type: z21.enum(LLMStepType),
484
- model: z21.string(),
485
- provider: z21.string(),
486
- startedAt: z21.string(),
487
- durationMs: z21.number(),
515
+ var LLMTraceStepSchema = z22.object({
516
+ id: z22.string(),
517
+ stepNumber: z22.number(),
518
+ type: z22.enum(LLMStepType),
519
+ model: z22.string(),
520
+ provider: z22.string(),
521
+ startedAt: z22.string(),
522
+ durationMs: z22.number(),
488
523
  tokenUsage: TokenUsageSchema,
489
- costUsd: z21.number(),
490
- toolName: z21.string().optional(),
491
- toolArguments: z21.string().optional(),
492
- inputPreview: z21.string().optional(),
493
- outputPreview: z21.string().optional(),
494
- success: z21.boolean(),
495
- error: z21.string().optional()
496
- });
497
- var LLMBreakdownStatsSchema = z21.object({
498
- count: z21.number(),
499
- durationMs: z21.number(),
500
- tokens: z21.number(),
501
- costUsd: z21.number()
502
- });
503
- var LLMTraceSummarySchema = z21.object({
504
- totalSteps: z21.number(),
505
- totalDurationMs: z21.number(),
524
+ costUsd: z22.number(),
525
+ toolName: z22.string().optional(),
526
+ toolArguments: z22.string().optional(),
527
+ inputPreview: z22.string().optional(),
528
+ outputPreview: z22.string().optional(),
529
+ success: z22.boolean(),
530
+ error: z22.string().optional()
531
+ });
532
+ var LLMBreakdownStatsSchema = z22.object({
533
+ count: z22.number(),
534
+ durationMs: z22.number(),
535
+ tokens: z22.number(),
536
+ costUsd: z22.number()
537
+ });
538
+ var LLMTraceSummarySchema = z22.object({
539
+ totalSteps: z22.number(),
540
+ totalDurationMs: z22.number(),
506
541
  totalTokens: TokenUsageSchema,
507
- totalCostUsd: z21.number(),
508
- stepTypeBreakdown: z21.record(z21.string(), LLMBreakdownStatsSchema).optional(),
509
- modelBreakdown: z21.record(z21.string(), LLMBreakdownStatsSchema),
510
- modelsUsed: z21.array(z21.string())
511
- });
512
- var LLMTraceSchema = z21.object({
513
- id: z21.string(),
514
- steps: z21.array(LLMTraceStepSchema),
542
+ totalCostUsd: z22.number(),
543
+ stepTypeBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema).optional(),
544
+ modelBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema),
545
+ modelsUsed: z22.array(z22.string())
546
+ });
547
+ var LLMTraceSchema = z22.object({
548
+ id: z22.string(),
549
+ steps: z22.array(LLMTraceStepSchema),
515
550
  summary: LLMTraceSummarySchema
516
551
  });
517
552
 
518
553
  // src/evaluation/eval-result.ts
519
- import { z as z24 } from "zod";
554
+ import { z as z25 } from "zod";
520
555
 
521
556
  // src/evaluation/eval-run.ts
522
- import { z as z23 } from "zod";
557
+ import { z as z24 } from "zod";
523
558
 
524
559
  // src/evaluation/live-trace.ts
525
- import { z as z22 } from "zod";
560
+ import { z as z23 } from "zod";
526
561
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
527
562
  LiveTraceEventType2["THINKING"] = "thinking";
528
563
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -531,31 +566,31 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
531
566
  LiveTraceEventType2["DIAGNOSTIC"] = "diagnostic";
532
567
  return LiveTraceEventType2;
533
568
  })(LiveTraceEventType || {});
534
- var LiveTraceEventSchema = z22.object({
569
+ var LiveTraceEventSchema = z23.object({
535
570
  /** The evaluation run ID */
536
- evalRunId: z22.string(),
571
+ evalRunId: z23.string(),
537
572
  /** The scenario ID being executed */
538
- scenarioId: z22.string(),
573
+ scenarioId: z23.string(),
539
574
  /** The scenario name for display */
540
- scenarioName: z22.string(),
575
+ scenarioName: z23.string(),
541
576
  /** The target ID (skill, agent, etc.) */
542
- targetId: z22.string(),
577
+ targetId: z23.string(),
543
578
  /** The target name for display */
544
- targetName: z22.string(),
579
+ targetName: z23.string(),
545
580
  /** Step number in the current scenario execution */
546
- stepNumber: z22.number(),
581
+ stepNumber: z23.number(),
547
582
  /** Type of trace event */
548
- type: z22.enum(LiveTraceEventType),
583
+ type: z23.enum(LiveTraceEventType),
549
584
  /** Tool name if this is a tool_use event */
550
- toolName: z22.string().optional(),
585
+ toolName: z23.string().optional(),
551
586
  /** Tool arguments preview (truncated JSON) */
552
- toolArgs: z22.string().optional(),
587
+ toolArgs: z23.string().optional(),
553
588
  /** Output preview (truncated text) */
554
- outputPreview: z22.string().optional(),
589
+ outputPreview: z23.string().optional(),
555
590
  /** Timestamp when this event occurred */
556
- timestamp: z22.string(),
591
+ timestamp: z23.string(),
557
592
  /** Whether this is the final event for this scenario */
558
- isComplete: z22.boolean()
593
+ isComplete: z23.boolean()
559
594
  });
560
595
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
561
596
  function parseTraceEventLine(line) {
@@ -583,14 +618,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
583
618
  TriggerType2["MANUAL"] = "MANUAL";
584
619
  return TriggerType2;
585
620
  })(TriggerType || {});
586
- var TriggerMetadataSchema = z23.object({
587
- version: z23.string().optional(),
588
- resourceUpdated: z23.array(z23.string()).optional()
621
+ var TriggerMetadataSchema = z24.object({
622
+ version: z24.string().optional(),
623
+ resourceUpdated: z24.array(z24.string()).optional()
589
624
  });
590
- var TriggerSchema = z23.object({
591
- id: z23.string(),
625
+ var TriggerSchema = z24.object({
626
+ id: z24.string(),
592
627
  metadata: TriggerMetadataSchema.optional(),
593
- type: z23.enum(TriggerType)
628
+ type: z24.enum(TriggerType)
594
629
  });
595
630
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
596
631
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -608,89 +643,89 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
608
643
  FailureSeverity2["LOW"] = "low";
609
644
  return FailureSeverity2;
610
645
  })(FailureSeverity || {});
611
- var DiffLineTypeSchema = z23.enum(["added", "removed", "unchanged"]);
612
- var DiffLineSchema = z23.object({
646
+ var DiffLineTypeSchema = z24.enum(["added", "removed", "unchanged"]);
647
+ var DiffLineSchema = z24.object({
613
648
  type: DiffLineTypeSchema,
614
- content: z23.string(),
615
- lineNumber: z23.number()
616
- });
617
- var DiffContentSchema = z23.object({
618
- path: z23.string(),
619
- expected: z23.string(),
620
- actual: z23.string(),
621
- diffLines: z23.array(DiffLineSchema)
622
- });
623
- var CommandExecutionSchema = z23.object({
624
- command: z23.string(),
625
- exitCode: z23.number(),
626
- output: z23.string().optional(),
627
- duration: z23.number()
628
- });
629
- var FileModificationSchema = z23.object({
630
- path: z23.string(),
631
- action: z23.enum(["created", "modified", "deleted"])
632
- });
633
- var ApiCallSchema = z23.object({
634
- endpoint: z23.string(),
635
- tokensUsed: z23.number(),
636
- duration: z23.number()
637
- });
638
- var ExecutionTraceSchema = z23.object({
639
- commands: z23.array(CommandExecutionSchema),
640
- filesModified: z23.array(FileModificationSchema),
641
- apiCalls: z23.array(ApiCallSchema),
642
- totalDuration: z23.number()
643
- });
644
- var FailureAnalysisSchema = z23.object({
645
- category: z23.enum(FailureCategory),
646
- severity: z23.enum(FailureSeverity),
647
- summary: z23.string(),
648
- details: z23.string(),
649
- rootCause: z23.string(),
650
- suggestedFix: z23.string(),
651
- relatedAssertions: z23.array(z23.string()),
652
- codeSnippet: z23.string().optional(),
653
- similarIssues: z23.array(z23.string()).optional(),
654
- patternId: z23.string().optional(),
649
+ content: z24.string(),
650
+ lineNumber: z24.number()
651
+ });
652
+ var DiffContentSchema = z24.object({
653
+ path: z24.string(),
654
+ expected: z24.string(),
655
+ actual: z24.string(),
656
+ diffLines: z24.array(DiffLineSchema)
657
+ });
658
+ var CommandExecutionSchema = z24.object({
659
+ command: z24.string(),
660
+ exitCode: z24.number(),
661
+ output: z24.string().optional(),
662
+ duration: z24.number()
663
+ });
664
+ var FileModificationSchema = z24.object({
665
+ path: z24.string(),
666
+ action: z24.enum(["created", "modified", "deleted"])
667
+ });
668
+ var ApiCallSchema = z24.object({
669
+ endpoint: z24.string(),
670
+ tokensUsed: z24.number(),
671
+ duration: z24.number()
672
+ });
673
+ var ExecutionTraceSchema = z24.object({
674
+ commands: z24.array(CommandExecutionSchema),
675
+ filesModified: z24.array(FileModificationSchema),
676
+ apiCalls: z24.array(ApiCallSchema),
677
+ totalDuration: z24.number()
678
+ });
679
+ var FailureAnalysisSchema = z24.object({
680
+ category: z24.enum(FailureCategory),
681
+ severity: z24.enum(FailureSeverity),
682
+ summary: z24.string(),
683
+ details: z24.string(),
684
+ rootCause: z24.string(),
685
+ suggestedFix: z24.string(),
686
+ relatedAssertions: z24.array(z24.string()),
687
+ codeSnippet: z24.string().optional(),
688
+ similarIssues: z24.array(z24.string()).optional(),
689
+ patternId: z24.string().optional(),
655
690
  // Extended fields for detailed debugging
656
691
  diff: DiffContentSchema.optional(),
657
692
  executionTrace: ExecutionTraceSchema.optional()
658
693
  });
659
694
  var EvalRunSchema = TenantEntitySchema.extend({
660
695
  /** Agent ID for this run */
661
- agentId: z23.string().optional(),
696
+ agentId: z24.string().optional(),
662
697
  /** Skills group ID for this run */
663
- skillsGroupId: z23.string().optional(),
698
+ skillsGroupId: z24.string().optional(),
664
699
  /** Scenario IDs to run */
665
- scenarioIds: z23.array(z23.string()),
700
+ scenarioIds: z24.array(z24.string()),
666
701
  /** Current status */
667
702
  status: EvalStatusSchema,
668
703
  /** Progress percentage (0-100) */
669
- progress: z23.number(),
704
+ progress: z24.number(),
670
705
  /** Results for each scenario/target combination */
671
- results: z23.array(EvalRunResultSchema),
706
+ results: z24.array(EvalRunResultSchema),
672
707
  /** Aggregated metrics across all results */
673
708
  aggregateMetrics: EvalMetricsSchema,
674
709
  /** Failure analyses */
675
- failureAnalyses: z23.array(FailureAnalysisSchema).optional(),
710
+ failureAnalyses: z24.array(FailureAnalysisSchema).optional(),
676
711
  /** Aggregated LLM trace summary */
677
712
  llmTraceSummary: LLMTraceSummarySchema.optional(),
678
713
  /** What triggered this run */
679
714
  trigger: TriggerSchema.optional(),
680
715
  /** When the run started (set when evaluation is triggered) */
681
- startedAt: z23.string().optional(),
716
+ startedAt: z24.string().optional(),
682
717
  /** When the run completed */
683
- completedAt: z23.string().optional(),
718
+ completedAt: z24.string().optional(),
684
719
  /** Live trace events captured during execution (for playback on results page) */
685
- liveTraceEvents: z23.array(LiveTraceEventSchema).optional(),
720
+ liveTraceEvents: z24.array(LiveTraceEventSchema).optional(),
686
721
  /** Remote job ID for tracking execution in Dev Machines */
687
- jobId: z23.string().optional(),
722
+ jobId: z24.string().optional(),
688
723
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
689
- jobStatus: z23.string().optional(),
724
+ jobStatus: z24.string().optional(),
690
725
  /** Remote job error message if the job failed */
691
- jobError: z23.string().optional(),
726
+ jobError: z24.string().optional(),
692
727
  /** Timestamp of the last job status check */
693
- jobStatusCheckedAt: z23.string().optional()
728
+ jobStatusCheckedAt: z24.string().optional()
694
729
  });
695
730
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
696
731
  id: true,
@@ -703,28 +738,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
703
738
  startedAt: true,
704
739
  completedAt: true
705
740
  });
706
- var EvaluationProgressSchema = z23.object({
707
- runId: z23.string(),
708
- targetId: z23.string(),
709
- totalScenarios: z23.number(),
710
- completedScenarios: z23.number(),
711
- scenarioProgress: z23.array(
712
- z23.object({
713
- scenarioId: z23.string(),
714
- currentStep: z23.string(),
715
- error: z23.string().optional()
741
+ var EvaluationProgressSchema = z24.object({
742
+ runId: z24.string(),
743
+ targetId: z24.string(),
744
+ totalScenarios: z24.number(),
745
+ completedScenarios: z24.number(),
746
+ scenarioProgress: z24.array(
747
+ z24.object({
748
+ scenarioId: z24.string(),
749
+ currentStep: z24.string(),
750
+ error: z24.string().optional()
716
751
  })
717
752
  ),
718
- createdAt: z23.number()
753
+ createdAt: z24.number()
719
754
  });
720
- var EvaluationLogSchema = z23.object({
721
- runId: z23.string(),
722
- scenarioId: z23.string(),
723
- log: z23.object({
724
- level: z23.enum(["info", "error", "debug"]),
725
- message: z23.string().optional(),
726
- args: z23.array(z23.any()).optional(),
727
- error: z23.string().optional()
755
+ var EvaluationLogSchema = z24.object({
756
+ runId: z24.string(),
757
+ scenarioId: z24.string(),
758
+ log: z24.object({
759
+ level: z24.enum(["info", "error", "debug"]),
760
+ message: z24.string().optional(),
761
+ args: z24.array(z24.any()).optional(),
762
+ error: z24.string().optional()
728
763
  })
729
764
  });
730
765
  var LLM_TIMEOUT = 12e4;
@@ -737,90 +772,89 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
737
772
  AssertionResultStatus2["ERROR"] = "error";
738
773
  return AssertionResultStatus2;
739
774
  })(AssertionResultStatus || {});
740
- var AssertionResultSchema = z24.object({
741
- id: z24.string(),
742
- assertionId: z24.string(),
743
- assertionType: z24.string(),
744
- assertionName: z24.string(),
745
- status: z24.enum(AssertionResultStatus),
746
- message: z24.string().optional(),
747
- expected: z24.string().optional(),
748
- actual: z24.string().optional(),
749
- duration: z24.number().optional(),
750
- details: z24.record(z24.string(), z24.unknown()).optional(),
751
- llmTraceSteps: z24.array(LLMTraceStepSchema).optional()
752
- });
753
- var EvalRunResultSchema = z24.object({
754
- id: z24.string(),
755
- targetId: z24.string(),
756
- targetName: z24.string().optional(),
757
- scenarioId: z24.string(),
758
- scenarioName: z24.string(),
775
+ var AssertionResultSchema = z25.object({
776
+ id: z25.string(),
777
+ assertionId: z25.string(),
778
+ assertionType: z25.string(),
779
+ assertionName: z25.string(),
780
+ status: z25.enum(AssertionResultStatus),
781
+ message: z25.string().optional(),
782
+ expected: z25.string().optional(),
783
+ actual: z25.string().optional(),
784
+ duration: z25.number().optional(),
785
+ details: z25.record(z25.string(), z25.unknown()).optional(),
786
+ llmTraceSteps: z25.array(LLMTraceStepSchema).optional()
787
+ });
788
+ var EvalRunResultSchema = z25.object({
789
+ id: z25.string(),
790
+ targetId: z25.string(),
791
+ targetName: z25.string().optional(),
792
+ scenarioId: z25.string(),
793
+ scenarioName: z25.string(),
759
794
  modelConfig: ModelConfigSchema.optional(),
760
- assertionResults: z24.array(AssertionResultSchema),
795
+ assertionResults: z25.array(AssertionResultSchema),
761
796
  metrics: EvalMetricsSchema.optional(),
762
- passed: z24.number(),
763
- failed: z24.number(),
764
- passRate: z24.number(),
765
- duration: z24.number(),
766
- outputText: z24.string().optional(),
767
- files: z24.array(ExpectedFileSchema).optional(),
768
- /** File diffs showing changes made by the agent during execution */
769
- fileDiffs: z24.array(DiffContentSchema).optional(),
770
- startedAt: z24.string().optional(),
771
- completedAt: z24.string().optional(),
797
+ passed: z25.number(),
798
+ failed: z25.number(),
799
+ passRate: z25.number(),
800
+ duration: z25.number(),
801
+ outputText: z25.string().optional(),
802
+ files: z25.array(ExpectedFileSchema).optional(),
803
+ fileDiffs: z25.array(DiffContentSchema).optional(),
804
+ startedAt: z25.string().optional(),
805
+ completedAt: z25.string().optional(),
772
806
  llmTrace: LLMTraceSchema.optional()
773
807
  });
774
- var PromptResultSchema = z24.object({
775
- text: z24.string(),
776
- files: z24.array(z24.unknown()).optional(),
777
- finishReason: z24.string().optional(),
778
- reasoning: z24.string().optional(),
779
- reasoningDetails: z24.unknown().optional(),
780
- toolCalls: z24.array(z24.unknown()).optional(),
781
- toolResults: z24.array(z24.unknown()).optional(),
782
- warnings: z24.array(z24.unknown()).optional(),
783
- sources: z24.array(z24.unknown()).optional(),
784
- steps: z24.array(z24.unknown()),
785
- generationTimeMs: z24.number(),
786
- prompt: z24.string(),
787
- systemPrompt: z24.string(),
788
- usage: z24.object({
789
- totalTokens: z24.number().optional(),
790
- totalMicrocentsSpent: z24.number().optional()
808
+ var PromptResultSchema = z25.object({
809
+ text: z25.string(),
810
+ files: z25.array(z25.unknown()).optional(),
811
+ finishReason: z25.string().optional(),
812
+ reasoning: z25.string().optional(),
813
+ reasoningDetails: z25.unknown().optional(),
814
+ toolCalls: z25.array(z25.unknown()).optional(),
815
+ toolResults: z25.array(z25.unknown()).optional(),
816
+ warnings: z25.array(z25.unknown()).optional(),
817
+ sources: z25.array(z25.unknown()).optional(),
818
+ steps: z25.array(z25.unknown()),
819
+ generationTimeMs: z25.number(),
820
+ prompt: z25.string(),
821
+ systemPrompt: z25.string(),
822
+ usage: z25.object({
823
+ totalTokens: z25.number().optional(),
824
+ totalMicrocentsSpent: z25.number().optional()
791
825
  })
792
826
  });
793
- var EvaluationResultSchema = z24.object({
794
- id: z24.string(),
795
- runId: z24.string(),
796
- timestamp: z24.number(),
827
+ var EvaluationResultSchema = z25.object({
828
+ id: z25.string(),
829
+ runId: z25.string(),
830
+ timestamp: z25.number(),
797
831
  promptResult: PromptResultSchema,
798
- testResults: z24.array(z24.unknown()),
799
- tags: z24.array(z24.string()).optional(),
800
- feedback: z24.string().optional(),
801
- score: z24.number(),
802
- suiteId: z24.string().optional()
803
- });
804
- var LeanEvaluationResultSchema = z24.object({
805
- id: z24.string(),
806
- runId: z24.string(),
807
- timestamp: z24.number(),
808
- tags: z24.array(z24.string()).optional(),
809
- scenarioId: z24.string(),
810
- scenarioVersion: z24.number().optional(),
811
- targetId: z24.string(),
812
- targetVersion: z24.number().optional(),
813
- suiteId: z24.string().optional(),
814
- score: z24.number(),
815
- time: z24.number().optional(),
816
- microcentsSpent: z24.number().optional()
832
+ testResults: z25.array(z25.unknown()),
833
+ tags: z25.array(z25.string()).optional(),
834
+ feedback: z25.string().optional(),
835
+ score: z25.number(),
836
+ suiteId: z25.string().optional()
837
+ });
838
+ var LeanEvaluationResultSchema = z25.object({
839
+ id: z25.string(),
840
+ runId: z25.string(),
841
+ timestamp: z25.number(),
842
+ tags: z25.array(z25.string()).optional(),
843
+ scenarioId: z25.string(),
844
+ scenarioVersion: z25.number().optional(),
845
+ targetId: z25.string(),
846
+ targetVersion: z25.number().optional(),
847
+ suiteId: z25.string().optional(),
848
+ score: z25.number(),
849
+ time: z25.number().optional(),
850
+ microcentsSpent: z25.number().optional()
817
851
  });
818
852
 
819
853
  // src/project/project.ts
820
- import { z as z25 } from "zod";
854
+ import { z as z26 } from "zod";
821
855
  var ProjectSchema = BaseEntitySchema.extend({
822
- appId: z25.string().optional().describe("The ID of the app in Dev Center"),
823
- appSecret: z25.string().optional().describe("The secret of the app in Dev Center")
856
+ appId: z26.string().optional().describe("The ID of the app in Dev Center"),
857
+ appSecret: z26.string().optional().describe("The secret of the app in Dev Center")
824
858
  });
825
859
  var CreateProjectInputSchema = ProjectSchema.omit({
826
860
  id: true,
@@ -831,10 +865,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
831
865
  var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
832
866
 
833
867
  // src/template/template.ts
834
- import { z as z26 } from "zod";
868
+ import { z as z27 } from "zod";
835
869
  var TemplateSchema = TenantEntitySchema.extend({
836
870
  /** URL to download the template from */
837
- downloadUrl: z26.url()
871
+ downloadUrl: z27.url()
838
872
  });
839
873
  var CreateTemplateInputSchema = TemplateSchema.omit({
840
874
  id: true,
@@ -851,9 +885,11 @@ export {
851
885
  ApiCallSchema,
852
886
  AssertionResultSchema,
853
887
  AssertionResultStatus,
888
+ AssertionSchema,
854
889
  BaseEntitySchema,
855
890
  BaseTestSchema,
856
891
  BuildCheckTestSchema,
892
+ BuildPassedAssertionSchema,
857
893
  CommandExecutionSchema,
858
894
  CommandExecutionTestSchema,
859
895
  CreateAgentInputSchema,
@@ -895,6 +931,7 @@ export {
895
931
  LeanEvaluationResultSchema,
896
932
  LiveTraceEventSchema,
897
933
  LiveTraceEventType,
934
+ LlmJudgeAssertionSchema,
898
935
  LocalProjectConfigSchema,
899
936
  MCPServerConfigSchema,
900
937
  MetaSiteConfigSchema,
@@ -910,6 +947,7 @@ export {
910
947
  SkillMetadataSchema,
911
948
  SkillSchema,
912
949
  SkillVersionSchema,
950
+ SkillWasCalledAssertionSchema,
913
951
  SkillsGroupSchema,
914
952
  TRACE_EVENT_PREFIX,
915
953
  TargetSchema,