@wix/evalforge-types 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -374,34 +374,67 @@ var TestSchema = z17.discriminatedUnion("type", [
374
374
  PlaywrightNLTestSchema
375
375
  ]);
376
376
 
377
- // src/scenario/environment.ts
377
+ // src/scenario/assertions.ts
378
378
  import { z as z18 } from "zod";
379
- var LocalProjectConfigSchema = z18.object({
379
+ var SkillWasCalledAssertionSchema = z18.object({
380
+ type: z18.literal("skill_was_called"),
381
+ /** Name of the skill that must have been called (matched against trace Skill tool args) */
382
+ skillName: z18.string()
383
+ });
384
+ var BuildPassedAssertionSchema = z18.object({
385
+ type: z18.literal("build_passed"),
386
+ /** Command to run (default: "yarn build") */
387
+ command: z18.string().optional(),
388
+ /** Expected exit code (default: 0) */
389
+ expectedExitCode: z18.number().int().optional()
390
+ });
391
+ var LlmJudgeAssertionSchema = z18.object({
392
+ type: z18.literal("llm_judge"),
393
+ /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
394
+ prompt: z18.string(),
395
+ /** Optional system prompt for the judge (default asks for JSON with score) */
396
+ systemPrompt: z18.string().optional(),
397
+ /** Minimum score to pass (0–100, default 70) */
398
+ minScore: z18.number().int().min(0).max(100).optional(),
399
+ /** Model for the judge (e.g. claude-3-5-haiku) */
400
+ model: z18.string().optional(),
401
+ maxTokens: z18.number().int().optional(),
402
+ temperature: z18.number().min(0).max(1).optional()
403
+ });
404
+ var AssertionSchema = z18.discriminatedUnion("type", [
405
+ SkillWasCalledAssertionSchema,
406
+ BuildPassedAssertionSchema,
407
+ LlmJudgeAssertionSchema
408
+ ]);
409
+
410
+ // src/scenario/environment.ts
411
+ import { z as z19 } from "zod";
412
+ var LocalProjectConfigSchema = z19.object({
380
413
  /** Template ID to use for the local project */
381
- templateId: z18.string().optional(),
414
+ templateId: z19.string().optional(),
382
415
  /** Files to create in the project */
383
- files: z18.array(
384
- z18.object({
385
- path: z18.string().min(1),
386
- content: z18.string().min(1)
416
+ files: z19.array(
417
+ z19.object({
418
+ path: z19.string().min(1),
419
+ content: z19.string().min(1)
387
420
  })
388
421
  ).optional()
389
422
  });
390
- var MetaSiteConfigSchema = z18.object({
391
- configurations: z18.array(
392
- z18.object({
393
- name: z18.string().min(1),
394
- apiCalls: z18.array(
395
- z18.object({
396
- url: z18.string().url(),
397
- method: z18.enum(["POST", "PUT"]),
398
- body: z18.string()
423
+ var MetaSiteConfigSchema = z19.object({
424
+ configurations: z19.array(
425
+ z19.object({
426
+ name: z19.string().min(1),
427
+ apiCalls: z19.array(
428
+ z19.object({
429
+ url: z19.string().url(),
430
+ method: z19.enum(["POST", "PUT"]),
431
+ body: z19.string()
399
432
  })
400
433
  )
401
434
  })
402
435
  ).optional()
403
436
  });
404
- var EnvironmentSchema = z18.object({
437
+ var EnvironmentSchema = z19.object({
405
438
  /** Local project configuration */
406
439
  localProject: LocalProjectConfigSchema.optional(),
407
440
  /** Meta site configuration */
@@ -409,18 +442,20 @@ var EnvironmentSchema = z18.object({
409
442
  });
410
443
 
411
444
  // src/scenario/test-scenario.ts
412
- import { z as z19 } from "zod";
413
- var ExpectedFileSchema = z19.object({
445
+ import { z as z20 } from "zod";
446
+ var ExpectedFileSchema = z20.object({
414
447
  /** Relative path where the file should be created */
415
- path: z19.string(),
448
+ path: z20.string(),
416
449
  /** Optional expected content */
417
- content: z19.string().optional()
450
+ content: z20.string().optional()
418
451
  });
419
452
  var TestScenarioSchema = TenantEntitySchema.extend({
420
453
  /** The prompt sent to the agent to trigger the task */
421
- triggerPrompt: z19.string().min(10),
454
+ triggerPrompt: z20.string().min(10),
422
455
  /** ID of the template to use for this scenario */
423
- templateId: z19.string().optional()
456
+ templateId: z20.string().optional(),
457
+ /** Assertions to evaluate for this scenario */
458
+ assertions: z20.array(AssertionSchema).optional()
424
459
  });
425
460
  var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
426
461
  id: true,
@@ -431,10 +466,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
431
466
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
432
467
 
433
468
  // src/suite/test-suite.ts
434
- import { z as z20 } from "zod";
469
+ import { z as z21 } from "zod";
435
470
  var TestSuiteSchema = TenantEntitySchema.extend({
436
471
  /** IDs of test scenarios in this suite */
437
- scenarioIds: z20.array(z20.string())
472
+ scenarioIds: z21.array(z21.string())
438
473
  });
439
474
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
440
475
  id: true,
@@ -445,21 +480,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
445
480
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
446
481
 
447
482
  // src/evaluation/metrics.ts
448
- import { z as z21 } from "zod";
449
- var TokenUsageSchema = z21.object({
450
- prompt: z21.number(),
451
- completion: z21.number(),
452
- total: z21.number()
453
- });
454
- var EvalMetricsSchema = z21.object({
455
- totalAssertions: z21.number(),
456
- passed: z21.number(),
457
- failed: z21.number(),
458
- skipped: z21.number(),
459
- errors: z21.number(),
460
- passRate: z21.number(),
461
- avgDuration: z21.number(),
462
- totalDuration: z21.number()
483
+ import { z as z22 } from "zod";
484
+ var TokenUsageSchema = z22.object({
485
+ prompt: z22.number(),
486
+ completion: z22.number(),
487
+ total: z22.number()
488
+ });
489
+ var EvalMetricsSchema = z22.object({
490
+ totalAssertions: z22.number(),
491
+ passed: z22.number(),
492
+ failed: z22.number(),
493
+ skipped: z22.number(),
494
+ errors: z22.number(),
495
+ passRate: z22.number(),
496
+ avgDuration: z22.number(),
497
+ totalDuration: z22.number()
463
498
  });
464
499
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
465
500
  EvalStatus2["PENDING"] = "pending";
@@ -469,7 +504,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
469
504
  EvalStatus2["CANCELLED"] = "cancelled";
470
505
  return EvalStatus2;
471
506
  })(EvalStatus || {});
472
- var EvalStatusSchema = z21.enum(EvalStatus);
507
+ var EvalStatusSchema = z22.enum(EvalStatus);
473
508
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
474
509
  LLMStepType2["COMPLETION"] = "completion";
475
510
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -477,129 +512,46 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
477
512
  LLMStepType2["THINKING"] = "thinking";
478
513
  return LLMStepType2;
479
514
  })(LLMStepType || {});
480
- var LLMTraceStepSchema = z21.object({
481
- id: z21.string(),
482
- stepNumber: z21.number(),
483
- type: z21.enum(LLMStepType),
484
- model: z21.string(),
485
- provider: z21.string(),
486
- startedAt: z21.string(),
487
- durationMs: z21.number(),
515
+ var LLMTraceStepSchema = z22.object({
516
+ id: z22.string(),
517
+ stepNumber: z22.number(),
518
+ type: z22.enum(LLMStepType),
519
+ model: z22.string(),
520
+ provider: z22.string(),
521
+ startedAt: z22.string(),
522
+ durationMs: z22.number(),
488
523
  tokenUsage: TokenUsageSchema,
489
- costUsd: z21.number(),
490
- toolName: z21.string().optional(),
491
- toolArguments: z21.string().optional(),
492
- inputPreview: z21.string().optional(),
493
- outputPreview: z21.string().optional(),
494
- success: z21.boolean(),
495
- error: z21.string().optional()
496
- });
497
- var LLMBreakdownStatsSchema = z21.object({
498
- count: z21.number(),
499
- durationMs: z21.number(),
500
- tokens: z21.number(),
501
- costUsd: z21.number()
502
- });
503
- var LLMTraceSummarySchema = z21.object({
504
- totalSteps: z21.number(),
505
- totalDurationMs: z21.number(),
524
+ costUsd: z22.number(),
525
+ toolName: z22.string().optional(),
526
+ toolArguments: z22.string().optional(),
527
+ inputPreview: z22.string().optional(),
528
+ outputPreview: z22.string().optional(),
529
+ success: z22.boolean(),
530
+ error: z22.string().optional()
531
+ });
532
+ var LLMBreakdownStatsSchema = z22.object({
533
+ count: z22.number(),
534
+ durationMs: z22.number(),
535
+ tokens: z22.number(),
536
+ costUsd: z22.number()
537
+ });
538
+ var LLMTraceSummarySchema = z22.object({
539
+ totalSteps: z22.number(),
540
+ totalDurationMs: z22.number(),
506
541
  totalTokens: TokenUsageSchema,
507
- totalCostUsd: z21.number(),
508
- stepTypeBreakdown: z21.record(z21.string(), LLMBreakdownStatsSchema).optional(),
509
- modelBreakdown: z21.record(z21.string(), LLMBreakdownStatsSchema),
510
- modelsUsed: z21.array(z21.string())
511
- });
512
- var LLMTraceSchema = z21.object({
513
- id: z21.string(),
514
- steps: z21.array(LLMTraceStepSchema),
542
+ totalCostUsd: z22.number(),
543
+ stepTypeBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema).optional(),
544
+ modelBreakdown: z22.record(z22.string(), LLMBreakdownStatsSchema),
545
+ modelsUsed: z22.array(z22.string())
546
+ });
547
+ var LLMTraceSchema = z22.object({
548
+ id: z22.string(),
549
+ steps: z22.array(LLMTraceStepSchema),
515
550
  summary: LLMTraceSummarySchema
516
551
  });
517
552
 
518
553
  // src/evaluation/eval-result.ts
519
- import { z as z22 } from "zod";
520
- var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
521
- AssertionResultStatus2["PASSED"] = "passed";
522
- AssertionResultStatus2["FAILED"] = "failed";
523
- AssertionResultStatus2["SKIPPED"] = "skipped";
524
- AssertionResultStatus2["ERROR"] = "error";
525
- return AssertionResultStatus2;
526
- })(AssertionResultStatus || {});
527
- var AssertionResultSchema = z22.object({
528
- id: z22.string(),
529
- assertionId: z22.string(),
530
- assertionType: z22.string(),
531
- assertionName: z22.string(),
532
- status: z22.enum(AssertionResultStatus),
533
- message: z22.string().optional(),
534
- expected: z22.string().optional(),
535
- actual: z22.string().optional(),
536
- duration: z22.number().optional(),
537
- details: z22.record(z22.string(), z22.unknown()).optional(),
538
- llmTraceSteps: z22.array(LLMTraceStepSchema).optional()
539
- });
540
- var EvalRunResultSchema = z22.object({
541
- id: z22.string(),
542
- targetId: z22.string(),
543
- targetName: z22.string().optional(),
544
- scenarioId: z22.string(),
545
- scenarioName: z22.string(),
546
- modelConfig: ModelConfigSchema.optional(),
547
- assertionResults: z22.array(AssertionResultSchema),
548
- metrics: EvalMetricsSchema.optional(),
549
- passed: z22.number(),
550
- failed: z22.number(),
551
- passRate: z22.number(),
552
- duration: z22.number(),
553
- outputText: z22.string().optional(),
554
- files: z22.array(ExpectedFileSchema).optional(),
555
- startedAt: z22.string().optional(),
556
- completedAt: z22.string().optional(),
557
- llmTrace: LLMTraceSchema.optional()
558
- });
559
- var PromptResultSchema = z22.object({
560
- text: z22.string(),
561
- files: z22.array(z22.unknown()).optional(),
562
- finishReason: z22.string().optional(),
563
- reasoning: z22.string().optional(),
564
- reasoningDetails: z22.unknown().optional(),
565
- toolCalls: z22.array(z22.unknown()).optional(),
566
- toolResults: z22.array(z22.unknown()).optional(),
567
- warnings: z22.array(z22.unknown()).optional(),
568
- sources: z22.array(z22.unknown()).optional(),
569
- steps: z22.array(z22.unknown()),
570
- generationTimeMs: z22.number(),
571
- prompt: z22.string(),
572
- systemPrompt: z22.string(),
573
- usage: z22.object({
574
- totalTokens: z22.number().optional(),
575
- totalMicrocentsSpent: z22.number().optional()
576
- })
577
- });
578
- var EvaluationResultSchema = z22.object({
579
- id: z22.string(),
580
- runId: z22.string(),
581
- timestamp: z22.number(),
582
- promptResult: PromptResultSchema,
583
- testResults: z22.array(z22.unknown()),
584
- tags: z22.array(z22.string()).optional(),
585
- feedback: z22.string().optional(),
586
- score: z22.number(),
587
- suiteId: z22.string().optional()
588
- });
589
- var LeanEvaluationResultSchema = z22.object({
590
- id: z22.string(),
591
- runId: z22.string(),
592
- timestamp: z22.number(),
593
- tags: z22.array(z22.string()).optional(),
594
- scenarioId: z22.string(),
595
- scenarioVersion: z22.number().optional(),
596
- targetId: z22.string(),
597
- targetVersion: z22.number().optional(),
598
- suiteId: z22.string().optional(),
599
- score: z22.number(),
600
- time: z22.number().optional(),
601
- microcentsSpent: z22.number().optional()
602
- });
554
+ import { z as z25 } from "zod";
603
555
 
604
556
  // src/evaluation/eval-run.ts
605
557
  import { z as z24 } from "zod";
@@ -812,11 +764,97 @@ var EvaluationLogSchema = z24.object({
812
764
  });
813
765
  var LLM_TIMEOUT = 12e4;
814
766
 
767
+ // src/evaluation/eval-result.ts
768
+ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
769
+ AssertionResultStatus2["PASSED"] = "passed";
770
+ AssertionResultStatus2["FAILED"] = "failed";
771
+ AssertionResultStatus2["SKIPPED"] = "skipped";
772
+ AssertionResultStatus2["ERROR"] = "error";
773
+ return AssertionResultStatus2;
774
+ })(AssertionResultStatus || {});
775
+ var AssertionResultSchema = z25.object({
776
+ id: z25.string(),
777
+ assertionId: z25.string(),
778
+ assertionType: z25.string(),
779
+ assertionName: z25.string(),
780
+ status: z25.enum(AssertionResultStatus),
781
+ message: z25.string().optional(),
782
+ expected: z25.string().optional(),
783
+ actual: z25.string().optional(),
784
+ duration: z25.number().optional(),
785
+ details: z25.record(z25.string(), z25.unknown()).optional(),
786
+ llmTraceSteps: z25.array(LLMTraceStepSchema).optional()
787
+ });
788
+ var EvalRunResultSchema = z25.object({
789
+ id: z25.string(),
790
+ targetId: z25.string(),
791
+ targetName: z25.string().optional(),
792
+ scenarioId: z25.string(),
793
+ scenarioName: z25.string(),
794
+ modelConfig: ModelConfigSchema.optional(),
795
+ assertionResults: z25.array(AssertionResultSchema),
796
+ metrics: EvalMetricsSchema.optional(),
797
+ passed: z25.number(),
798
+ failed: z25.number(),
799
+ passRate: z25.number(),
800
+ duration: z25.number(),
801
+ outputText: z25.string().optional(),
802
+ files: z25.array(ExpectedFileSchema).optional(),
803
+ fileDiffs: z25.array(DiffContentSchema).optional(),
804
+ startedAt: z25.string().optional(),
805
+ completedAt: z25.string().optional(),
806
+ llmTrace: LLMTraceSchema.optional()
807
+ });
808
+ var PromptResultSchema = z25.object({
809
+ text: z25.string(),
810
+ files: z25.array(z25.unknown()).optional(),
811
+ finishReason: z25.string().optional(),
812
+ reasoning: z25.string().optional(),
813
+ reasoningDetails: z25.unknown().optional(),
814
+ toolCalls: z25.array(z25.unknown()).optional(),
815
+ toolResults: z25.array(z25.unknown()).optional(),
816
+ warnings: z25.array(z25.unknown()).optional(),
817
+ sources: z25.array(z25.unknown()).optional(),
818
+ steps: z25.array(z25.unknown()),
819
+ generationTimeMs: z25.number(),
820
+ prompt: z25.string(),
821
+ systemPrompt: z25.string(),
822
+ usage: z25.object({
823
+ totalTokens: z25.number().optional(),
824
+ totalMicrocentsSpent: z25.number().optional()
825
+ })
826
+ });
827
+ var EvaluationResultSchema = z25.object({
828
+ id: z25.string(),
829
+ runId: z25.string(),
830
+ timestamp: z25.number(),
831
+ promptResult: PromptResultSchema,
832
+ testResults: z25.array(z25.unknown()),
833
+ tags: z25.array(z25.string()).optional(),
834
+ feedback: z25.string().optional(),
835
+ score: z25.number(),
836
+ suiteId: z25.string().optional()
837
+ });
838
+ var LeanEvaluationResultSchema = z25.object({
839
+ id: z25.string(),
840
+ runId: z25.string(),
841
+ timestamp: z25.number(),
842
+ tags: z25.array(z25.string()).optional(),
843
+ scenarioId: z25.string(),
844
+ scenarioVersion: z25.number().optional(),
845
+ targetId: z25.string(),
846
+ targetVersion: z25.number().optional(),
847
+ suiteId: z25.string().optional(),
848
+ score: z25.number(),
849
+ time: z25.number().optional(),
850
+ microcentsSpent: z25.number().optional()
851
+ });
852
+
815
853
  // src/project/project.ts
816
- import { z as z25 } from "zod";
854
+ import { z as z26 } from "zod";
817
855
  var ProjectSchema = BaseEntitySchema.extend({
818
- appId: z25.string().optional().describe("The ID of the app in Dev Center"),
819
- appSecret: z25.string().optional().describe("The secret of the app in Dev Center")
856
+ appId: z26.string().optional().describe("The ID of the app in Dev Center"),
857
+ appSecret: z26.string().optional().describe("The secret of the app in Dev Center")
820
858
  });
821
859
  var CreateProjectInputSchema = ProjectSchema.omit({
822
860
  id: true,
@@ -827,10 +865,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
827
865
  var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
828
866
 
829
867
  // src/template/template.ts
830
- import { z as z26 } from "zod";
868
+ import { z as z27 } from "zod";
831
869
  var TemplateSchema = TenantEntitySchema.extend({
832
870
  /** URL to download the template from */
833
- downloadUrl: z26.url()
871
+ downloadUrl: z27.url()
834
872
  });
835
873
  var CreateTemplateInputSchema = TemplateSchema.omit({
836
874
  id: true,
@@ -847,9 +885,11 @@ export {
847
885
  ApiCallSchema,
848
886
  AssertionResultSchema,
849
887
  AssertionResultStatus,
888
+ AssertionSchema,
850
889
  BaseEntitySchema,
851
890
  BaseTestSchema,
852
891
  BuildCheckTestSchema,
892
+ BuildPassedAssertionSchema,
853
893
  CommandExecutionSchema,
854
894
  CommandExecutionTestSchema,
855
895
  CreateAgentInputSchema,
@@ -891,6 +931,7 @@ export {
891
931
  LeanEvaluationResultSchema,
892
932
  LiveTraceEventSchema,
893
933
  LiveTraceEventType,
934
+ LlmJudgeAssertionSchema,
894
935
  LocalProjectConfigSchema,
895
936
  MCPServerConfigSchema,
896
937
  MetaSiteConfigSchema,
@@ -906,6 +947,7 @@ export {
906
947
  SkillMetadataSchema,
907
948
  SkillSchema,
908
949
  SkillVersionSchema,
950
+ SkillWasCalledAssertionSchema,
909
951
  SkillsGroupSchema,
910
952
  TRACE_EVENT_PREFIX,
911
953
  TargetSchema,