@wix/evalforge-types 0.52.0 → 0.54.0

This diff shows the differences between publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -46,6 +46,9 @@ __export(index_exports, {
46
46
  BuildCheckTestSchema: () => BuildCheckTestSchema,
47
47
  BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
48
48
  BuildPassedConfigSchema: () => BuildPassedConfigSchema,
49
+ BulkImportResultItemSchema: () => BulkImportResultItemSchema,
50
+ BulkImportResultSchema: () => BulkImportResultSchema,
51
+ BulkImportSkillsInputSchema: () => BulkImportSkillsInputSchema,
49
52
  ClaudeModel: () => ClaudeModel2,
50
53
  ClaudeModelSchema: () => ClaudeModelSchema,
51
54
  CommandExecutionSchema: () => CommandExecutionSchema,
@@ -1359,6 +1362,21 @@ var UpdateSkillInputSchema = SkillInputBaseSchema.partial().refine(
1359
1362
  var SkillWithLatestVersionSchema = SkillSchema.extend({
1360
1363
  latestVersion: SkillVersionSchema.optional()
1361
1364
  });
1365
+ var BulkImportSkillsInputSchema = import_zod7.z.object({
1366
+ source: GitHubSourceSchema
1367
+ });
1368
+ var BulkImportResultItemSchema = import_zod7.z.object({
1369
+ name: import_zod7.z.string(),
1370
+ status: import_zod7.z.enum(["created", "skipped", "failed"]),
1371
+ skillId: import_zod7.z.string().optional(),
1372
+ reason: import_zod7.z.string().optional()
1373
+ });
1374
+ var BulkImportResultSchema = import_zod7.z.object({
1375
+ created: import_zod7.z.number(),
1376
+ skipped: import_zod7.z.number(),
1377
+ failed: import_zod7.z.number(),
1378
+ items: import_zod7.z.array(BulkImportResultItemSchema)
1379
+ });
1362
1380
 
1363
1381
  // src/target/sub-agent.ts
1364
1382
  var import_zod8 = require("zod");
@@ -1591,85 +1609,34 @@ var TestSchema = import_zod20.z.discriminatedUnion("type", [
1591
1609
  PlaywrightNLTestSchema
1592
1610
  ]);
1593
1611
 
1594
- // src/scenario/assertions.ts
1595
- var import_zod21 = require("zod");
1596
- var SkillWasCalledAssertionSchema = import_zod21.z.object({
1597
- type: import_zod21.z.literal("skill_was_called"),
1598
- /** Names of the skills that must have been called (matched against trace Skill tool args) */
1599
- skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
1600
- });
1601
- var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
1602
- type: import_zod21.z.literal("tool_called_with_param"),
1603
- /** Name of the tool that must have been called */
1604
- toolName: import_zod21.z.string().min(1),
1605
- /** JSON string of key-value pairs for expected parameters (substring match) */
1606
- expectedParams: import_zod21.z.string().min(1)
1607
- });
1608
- var BuildPassedAssertionSchema = import_zod21.z.object({
1609
- type: import_zod21.z.literal("build_passed"),
1610
- /** Command to run (default: "yarn build") */
1611
- command: import_zod21.z.string().optional(),
1612
- /** Expected exit code (default: 0) */
1613
- expectedExitCode: import_zod21.z.number().int().optional()
1614
- });
1615
- var CostAssertionSchema = import_zod21.z.object({
1616
- type: import_zod21.z.literal("cost"),
1617
- /** Maximum allowed cost in USD */
1618
- maxCostUsd: import_zod21.z.number().positive()
1619
- });
1620
- var LlmJudgeAssertionSchema = import_zod21.z.object({
1621
- type: import_zod21.z.literal("llm_judge"),
1622
- /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
1623
- prompt: import_zod21.z.string(),
1624
- /** Minimum score to pass (0-10, default 7) */
1625
- minScore: import_zod21.z.number().int().min(0).max(10).optional(),
1626
- /** Model for the judge (e.g. claude-3-5-haiku) */
1627
- model: import_zod21.z.string().optional(),
1628
- maxTokens: import_zod21.z.number().int().optional(),
1629
- temperature: import_zod21.z.number().min(0).max(1).optional()
1630
- });
1631
- var TimeAssertionSchema = import_zod21.z.object({
1632
- type: import_zod21.z.literal("time_limit"),
1633
- /** Maximum allowed duration in milliseconds */
1634
- maxDurationMs: import_zod21.z.number().int().positive()
1635
- });
1636
- var AssertionSchema = import_zod21.z.union([
1637
- SkillWasCalledAssertionSchema,
1638
- ToolCalledWithParamAssertionSchema,
1639
- BuildPassedAssertionSchema,
1640
- TimeAssertionSchema,
1641
- CostAssertionSchema,
1642
- LlmJudgeAssertionSchema
1643
- ]);
1644
-
1645
1612
  // src/scenario/environment.ts
1646
- var import_zod22 = require("zod");
1647
- var LocalProjectConfigSchema = import_zod22.z.object({
1613
+ var import_zod21 = require("zod");
1614
+ var LocalProjectConfigSchema = import_zod21.z.object({
1648
1615
  /** Template ID to use for the local project */
1649
- templateId: import_zod22.z.string().optional(),
1616
+ templateId: import_zod21.z.string().optional(),
1650
1617
  /** Files to create in the project */
1651
- files: import_zod22.z.array(
1652
- import_zod22.z.object({
1653
- path: import_zod22.z.string().min(1),
1654
- content: import_zod22.z.string().min(1)
1618
+ files: import_zod21.z.array(
1619
+ import_zod21.z.object({
1620
+ path: import_zod21.z.string().min(1),
1621
+ content: import_zod21.z.string().min(1)
1655
1622
  })
1656
1623
  ).optional()
1657
1624
  });
1658
- var MetaSiteConfigSchema = import_zod22.z.object({
1659
- configurations: import_zod22.z.array(
1660
- import_zod22.z.object({
1661
- name: import_zod22.z.string().min(1),
1662
- apiCalls: import_zod22.z.array(
1663
- import_zod22.z.object({
1664
- url: import_zod22.z.string().url(),
1665
- method: import_zod22.z.enum(["POST", "PUT"]),
1666
- body: import_zod22.z.string()
1625
+ var MetaSiteConfigSchema = import_zod21.z.object({
1626
+ configurations: import_zod21.z.array(
1627
+ import_zod21.z.object({
1628
+ name: import_zod21.z.string().min(1),
1629
+ apiCalls: import_zod21.z.array(
1630
+ import_zod21.z.object({
1631
+ url: import_zod21.z.string().url(),
1632
+ method: import_zod21.z.enum(["POST", "PUT"]),
1633
+ body: import_zod21.z.string()
1667
1634
  })
1668
1635
  )
1669
1636
  })
1670
1637
  ).optional()
1671
1638
  });
1672
- var EnvironmentSchema = import_zod22.z.object({
1639
+ var EnvironmentSchema = import_zod21.z.object({
1673
1640
  /** Local project configuration */
1674
1641
  localProject: LocalProjectConfigSchema.optional(),
1675
1642
  /** Meta site configuration */
@@ -1677,11 +1644,11 @@ var EnvironmentSchema = import_zod22.z.object({
1677
1644
  });
1678
1645
 
1679
1646
  // src/scenario/test-scenario.ts
1680
- var import_zod24 = require("zod");
1647
+ var import_zod23 = require("zod");
1681
1648
 
1682
1649
  // src/assertion/assertion.ts
1683
- var import_zod23 = require("zod");
1684
- var AssertionTypeSchema = import_zod23.z.enum([
1650
+ var import_zod22 = require("zod");
1651
+ var AssertionTypeSchema = import_zod22.z.enum([
1685
1652
  "skill_was_called",
1686
1653
  "tool_called_with_param",
1687
1654
  "build_passed",
@@ -1689,59 +1656,61 @@ var AssertionTypeSchema = import_zod23.z.enum([
1689
1656
  "cost",
1690
1657
  "llm_judge"
1691
1658
  ]);
1692
- var AssertionParameterTypeSchema = import_zod23.z.enum([
1659
+ var AssertionParameterTypeSchema = import_zod22.z.enum([
1693
1660
  "string",
1694
1661
  "number",
1695
1662
  "boolean"
1696
1663
  ]);
1697
- var AssertionParameterSchema = import_zod23.z.object({
1664
+ var AssertionParameterSchema = import_zod22.z.object({
1698
1665
  /** Parameter name (used as key in params object) */
1699
- name: import_zod23.z.string().min(1),
1666
+ name: import_zod22.z.string().min(1),
1700
1667
  /** Display label for the parameter */
1701
- label: import_zod23.z.string().min(1),
1668
+ label: import_zod22.z.string().min(1),
1702
1669
  /** Parameter type */
1703
1670
  type: AssertionParameterTypeSchema,
1704
1671
  /** Whether this parameter is required */
1705
- required: import_zod23.z.boolean(),
1672
+ required: import_zod22.z.boolean(),
1706
1673
  /** Default value (optional, used when not provided) */
1707
- defaultValue: import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean()]).optional(),
1674
+ defaultValue: import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean()]).optional(),
1708
1675
  /** If true, parameter is hidden by default behind "Show advanced options" */
1709
- advanced: import_zod23.z.boolean().optional()
1676
+ advanced: import_zod22.z.boolean().optional()
1710
1677
  });
1711
- var ScenarioAssertionLinkSchema = import_zod23.z.object({
1678
+ var ScenarioAssertionLinkSchema = import_zod22.z.object({
1712
1679
  /** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
1713
- assertionId: import_zod23.z.string(),
1680
+ assertionId: import_zod22.z.string(),
1714
1681
  /** Parameter values for this assertion in this scenario */
1715
- params: import_zod23.z.record(
1716
- import_zod23.z.string(),
1717
- import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean(), import_zod23.z.null()])
1682
+ params: import_zod22.z.record(
1683
+ import_zod22.z.string(),
1684
+ import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean(), import_zod22.z.null()])
1718
1685
  ).optional()
1719
1686
  });
1720
- var SkillWasCalledConfigSchema = import_zod23.z.object({
1687
+ var SkillWasCalledConfigSchema = import_zod22.z.object({
1721
1688
  /** Names of the skills that must have been called */
1722
- skillNames: import_zod23.z.array(import_zod23.z.string().min(1)).min(1)
1689
+ skillNames: import_zod22.z.array(import_zod22.z.string().min(1)).min(1)
1723
1690
  });
1724
- var CostConfigSchema = import_zod23.z.strictObject({
1691
+ var CostConfigSchema = import_zod22.z.strictObject({
1725
1692
  /** Maximum allowed cost in USD */
1726
- maxCostUsd: import_zod23.z.number().positive()
1693
+ maxCostUsd: import_zod22.z.number().positive()
1727
1694
  });
1728
- var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
1695
+ var ToolCalledWithParamConfigSchema = import_zod22.z.strictObject({
1729
1696
  /** Name of the tool that must have been called */
1730
- toolName: import_zod23.z.string().min(1),
1697
+ toolName: import_zod22.z.string().min(1),
1731
1698
  /** JSON string of key-value pairs for expected parameters (substring match) */
1732
- expectedParams: import_zod23.z.string().min(1)
1699
+ expectedParams: import_zod22.z.string().min(1),
1700
+ /** If true, the matching tool call must also have succeeded (step.success === true) */
1701
+ requireSuccess: import_zod22.z.boolean().optional()
1733
1702
  });
1734
- var BuildPassedConfigSchema = import_zod23.z.strictObject({
1703
+ var BuildPassedConfigSchema = import_zod22.z.strictObject({
1735
1704
  /** Command to run (default: "yarn build") */
1736
- command: import_zod23.z.string().optional(),
1705
+ command: import_zod22.z.string().optional(),
1737
1706
  /** Expected exit code (default: 0) */
1738
- expectedExitCode: import_zod23.z.number().int().optional()
1707
+ expectedExitCode: import_zod22.z.number().int().optional()
1739
1708
  });
1740
- var TimeConfigSchema = import_zod23.z.strictObject({
1709
+ var TimeConfigSchema = import_zod22.z.strictObject({
1741
1710
  /** Maximum allowed duration in milliseconds */
1742
- maxDurationMs: import_zod23.z.number().int().positive()
1711
+ maxDurationMs: import_zod22.z.number().int().positive()
1743
1712
  });
1744
- var LlmJudgeConfigSchema = import_zod23.z.object({
1713
+ var LlmJudgeConfigSchema = import_zod22.z.object({
1745
1714
  /**
1746
1715
  * Prompt template with placeholders:
1747
1716
  * - {{output}}: agent's final output
@@ -1752,19 +1721,45 @@ var LlmJudgeConfigSchema = import_zod23.z.object({
1752
1721
  * - {{trace}}: step-by-step trace of tool calls
1753
1722
  * - Custom parameters defined in the parameters array
1754
1723
  */
1755
- prompt: import_zod23.z.string().min(1),
1724
+ prompt: import_zod22.z.string().min(1),
1756
1725
  /** Minimum score to pass (0-10, default 7) */
1757
- minScore: import_zod23.z.number().int().min(0).max(10).optional(),
1726
+ minScore: import_zod22.z.number().int().min(0).max(10).optional(),
1758
1727
  /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
1759
- model: import_zod23.z.string().optional(),
1728
+ model: import_zod22.z.string().optional(),
1760
1729
  /** Max output tokens */
1761
- maxTokens: import_zod23.z.number().int().optional(),
1730
+ maxTokens: import_zod22.z.number().int().optional(),
1762
1731
  /** Temperature (0-1) */
1763
- temperature: import_zod23.z.number().min(0).max(1).optional(),
1732
+ temperature: import_zod22.z.number().min(0).max(1).optional(),
1764
1733
  /** User-defined parameters for this assertion */
1765
- parameters: import_zod23.z.array(AssertionParameterSchema).optional()
1734
+ parameters: import_zod22.z.array(AssertionParameterSchema).optional()
1735
+ });
1736
+ var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
1737
+ type: import_zod22.z.literal("skill_was_called")
1738
+ });
1739
+ var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
1740
+ type: import_zod22.z.literal("tool_called_with_param")
1766
1741
  });
1767
- var AssertionConfigSchema = import_zod23.z.union([
1742
+ var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
1743
+ type: import_zod22.z.literal("build_passed")
1744
+ });
1745
+ var CostAssertionSchema = CostConfigSchema.extend({
1746
+ type: import_zod22.z.literal("cost")
1747
+ });
1748
+ var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
1749
+ type: import_zod22.z.literal("llm_judge")
1750
+ });
1751
+ var TimeAssertionSchema = TimeConfigSchema.extend({
1752
+ type: import_zod22.z.literal("time_limit")
1753
+ });
1754
+ var AssertionSchema = import_zod22.z.union([
1755
+ SkillWasCalledAssertionSchema,
1756
+ ToolCalledWithParamAssertionSchema,
1757
+ BuildPassedAssertionSchema,
1758
+ TimeAssertionSchema,
1759
+ CostAssertionSchema,
1760
+ LlmJudgeAssertionSchema
1761
+ ]);
1762
+ var AssertionConfigSchema = import_zod22.z.union([
1768
1763
  LlmJudgeConfigSchema,
1769
1764
  // requires prompt - check first
1770
1765
  SkillWasCalledConfigSchema,
@@ -1777,7 +1772,7 @@ var AssertionConfigSchema = import_zod23.z.union([
1777
1772
  // requires maxCostUsd, uses strictObject
1778
1773
  BuildPassedConfigSchema,
1779
1774
  // all optional, uses strictObject to reject unknown keys
1780
- import_zod23.z.object({})
1775
+ import_zod22.z.object({})
1781
1776
  // fallback empty config
1782
1777
  ]);
1783
1778
  var CustomAssertionSchema = TenantEntitySchema.extend({
@@ -1828,25 +1823,25 @@ function getLlmJudgeConfig(assertion) {
1828
1823
  }
1829
1824
 
1830
1825
  // src/scenario/test-scenario.ts
1831
- var ExpectedFileSchema = import_zod24.z.object({
1826
+ var ExpectedFileSchema = import_zod23.z.object({
1832
1827
  /** Relative path where the file should be created */
1833
- path: import_zod24.z.string(),
1828
+ path: import_zod23.z.string(),
1834
1829
  /** Optional expected content */
1835
- content: import_zod24.z.string().optional()
1830
+ content: import_zod23.z.string().optional()
1836
1831
  });
1837
1832
  var TestScenarioSchema = TenantEntitySchema.extend({
1838
1833
  /** The prompt sent to the agent to trigger the task */
1839
- triggerPrompt: import_zod24.z.string().min(10),
1834
+ triggerPrompt: import_zod23.z.string().min(10),
1840
1835
  /** ID of the template to use for this scenario (null = no template) */
1841
- templateId: import_zod24.z.string().nullish(),
1836
+ templateId: import_zod23.z.string().nullish(),
1842
1837
  /** Inline assertions to evaluate for this scenario (legacy) */
1843
- assertions: import_zod24.z.array(AssertionSchema).optional(),
1838
+ assertions: import_zod23.z.array(AssertionSchema).optional(),
1844
1839
  /** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
1845
- assertionIds: import_zod24.z.array(import_zod24.z.string()).optional(),
1840
+ assertionIds: import_zod23.z.array(import_zod23.z.string()).optional(),
1846
1841
  /** Linked assertions with per-scenario parameter values */
1847
- assertionLinks: import_zod24.z.array(ScenarioAssertionLinkSchema).optional(),
1842
+ assertionLinks: import_zod23.z.array(ScenarioAssertionLinkSchema).optional(),
1848
1843
  /** Tags for categorisation and filtering */
1849
- tags: import_zod24.z.array(import_zod24.z.string()).optional()
1844
+ tags: import_zod23.z.array(import_zod23.z.string()).optional()
1850
1845
  });
1851
1846
  var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
1852
1847
  id: true,
@@ -1857,10 +1852,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
1857
1852
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
1858
1853
 
1859
1854
  // src/suite/test-suite.ts
1860
- var import_zod25 = require("zod");
1855
+ var import_zod24 = require("zod");
1861
1856
  var TestSuiteSchema = TenantEntitySchema.extend({
1862
1857
  /** IDs of test scenarios in this suite */
1863
- scenarioIds: import_zod25.z.array(import_zod25.z.string())
1858
+ scenarioIds: import_zod24.z.array(import_zod24.z.string())
1864
1859
  });
1865
1860
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1866
1861
  id: true,
@@ -1871,21 +1866,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1871
1866
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
1872
1867
 
1873
1868
  // src/evaluation/metrics.ts
1874
- var import_zod26 = require("zod");
1875
- var TokenUsageSchema = import_zod26.z.object({
1876
- prompt: import_zod26.z.number(),
1877
- completion: import_zod26.z.number(),
1878
- total: import_zod26.z.number()
1879
- });
1880
- var EvalMetricsSchema = import_zod26.z.object({
1881
- totalAssertions: import_zod26.z.number(),
1882
- passed: import_zod26.z.number(),
1883
- failed: import_zod26.z.number(),
1884
- skipped: import_zod26.z.number(),
1885
- errors: import_zod26.z.number(),
1886
- passRate: import_zod26.z.number(),
1887
- avgDuration: import_zod26.z.number(),
1888
- totalDuration: import_zod26.z.number()
1869
+ var import_zod25 = require("zod");
1870
+ var TokenUsageSchema = import_zod25.z.object({
1871
+ prompt: import_zod25.z.number(),
1872
+ completion: import_zod25.z.number(),
1873
+ total: import_zod25.z.number()
1874
+ });
1875
+ var EvalMetricsSchema = import_zod25.z.object({
1876
+ totalAssertions: import_zod25.z.number(),
1877
+ passed: import_zod25.z.number(),
1878
+ failed: import_zod25.z.number(),
1879
+ skipped: import_zod25.z.number(),
1880
+ errors: import_zod25.z.number(),
1881
+ passRate: import_zod25.z.number(),
1882
+ avgDuration: import_zod25.z.number(),
1883
+ totalDuration: import_zod25.z.number()
1889
1884
  });
1890
1885
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1891
1886
  EvalStatus2["PENDING"] = "pending";
@@ -1895,7 +1890,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1895
1890
  EvalStatus2["CANCELLED"] = "cancelled";
1896
1891
  return EvalStatus2;
1897
1892
  })(EvalStatus || {});
1898
- var EvalStatusSchema = import_zod26.z.enum(EvalStatus);
1893
+ var EvalStatusSchema = import_zod25.z.enum(EvalStatus);
1899
1894
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1900
1895
  LLMStepType2["COMPLETION"] = "completion";
1901
1896
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -1903,52 +1898,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1903
1898
  LLMStepType2["THINKING"] = "thinking";
1904
1899
  return LLMStepType2;
1905
1900
  })(LLMStepType || {});
1906
- var LLMTraceStepSchema = import_zod26.z.object({
1907
- id: import_zod26.z.string(),
1908
- stepNumber: import_zod26.z.number(),
1909
- type: import_zod26.z.enum(LLMStepType),
1910
- model: import_zod26.z.string(),
1911
- provider: import_zod26.z.string(),
1912
- startedAt: import_zod26.z.string(),
1913
- durationMs: import_zod26.z.number(),
1901
+ var LLMTraceStepSchema = import_zod25.z.object({
1902
+ id: import_zod25.z.string(),
1903
+ stepNumber: import_zod25.z.number(),
1904
+ type: import_zod25.z.enum(LLMStepType),
1905
+ model: import_zod25.z.string(),
1906
+ provider: import_zod25.z.string(),
1907
+ startedAt: import_zod25.z.string(),
1908
+ durationMs: import_zod25.z.number(),
1914
1909
  tokenUsage: TokenUsageSchema,
1915
- costUsd: import_zod26.z.number(),
1916
- toolName: import_zod26.z.string().optional(),
1917
- toolArguments: import_zod26.z.string().optional(),
1918
- inputPreview: import_zod26.z.string().optional(),
1919
- outputPreview: import_zod26.z.string().optional(),
1920
- success: import_zod26.z.boolean(),
1921
- error: import_zod26.z.string().optional()
1922
- });
1923
- var LLMBreakdownStatsSchema = import_zod26.z.object({
1924
- count: import_zod26.z.number(),
1925
- durationMs: import_zod26.z.number(),
1926
- tokens: import_zod26.z.number(),
1927
- costUsd: import_zod26.z.number()
1928
- });
1929
- var LLMTraceSummarySchema = import_zod26.z.object({
1930
- totalSteps: import_zod26.z.number(),
1931
- totalDurationMs: import_zod26.z.number(),
1910
+ costUsd: import_zod25.z.number(),
1911
+ toolName: import_zod25.z.string().optional(),
1912
+ toolArguments: import_zod25.z.string().optional(),
1913
+ inputPreview: import_zod25.z.string().optional(),
1914
+ outputPreview: import_zod25.z.string().optional(),
1915
+ success: import_zod25.z.boolean(),
1916
+ error: import_zod25.z.string().optional()
1917
+ });
1918
+ var LLMBreakdownStatsSchema = import_zod25.z.object({
1919
+ count: import_zod25.z.number(),
1920
+ durationMs: import_zod25.z.number(),
1921
+ tokens: import_zod25.z.number(),
1922
+ costUsd: import_zod25.z.number()
1923
+ });
1924
+ var LLMTraceSummarySchema = import_zod25.z.object({
1925
+ totalSteps: import_zod25.z.number(),
1926
+ totalDurationMs: import_zod25.z.number(),
1932
1927
  totalTokens: TokenUsageSchema,
1933
- totalCostUsd: import_zod26.z.number(),
1934
- stepTypeBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema).optional(),
1935
- modelBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema),
1936
- modelsUsed: import_zod26.z.array(import_zod26.z.string())
1937
- });
1938
- var LLMTraceSchema = import_zod26.z.object({
1939
- id: import_zod26.z.string(),
1940
- steps: import_zod26.z.array(LLMTraceStepSchema),
1928
+ totalCostUsd: import_zod25.z.number(),
1929
+ stepTypeBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema).optional(),
1930
+ modelBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema),
1931
+ modelsUsed: import_zod25.z.array(import_zod25.z.string())
1932
+ });
1933
+ var LLMTraceSchema = import_zod25.z.object({
1934
+ id: import_zod25.z.string(),
1935
+ steps: import_zod25.z.array(LLMTraceStepSchema),
1941
1936
  summary: LLMTraceSummarySchema
1942
1937
  });
1943
1938
 
1944
1939
  // src/evaluation/eval-result.ts
1945
- var import_zod30 = require("zod");
1940
+ var import_zod29 = require("zod");
1946
1941
 
1947
1942
  // src/evaluation/eval-run.ts
1948
- var import_zod28 = require("zod");
1943
+ var import_zod27 = require("zod");
1949
1944
 
1950
1945
  // src/evaluation/live-trace.ts
1951
- var import_zod27 = require("zod");
1946
+ var import_zod26 = require("zod");
1952
1947
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1953
1948
  LiveTraceEventType2["THINKING"] = "thinking";
1954
1949
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -1962,37 +1957,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1962
1957
  LiveTraceEventType2["USER"] = "user";
1963
1958
  return LiveTraceEventType2;
1964
1959
  })(LiveTraceEventType || {});
1965
- var LiveTraceEventSchema = import_zod27.z.object({
1960
+ var LiveTraceEventSchema = import_zod26.z.object({
1966
1961
  /** The evaluation run ID */
1967
- evalRunId: import_zod27.z.string(),
1962
+ evalRunId: import_zod26.z.string(),
1968
1963
  /** The scenario ID being executed */
1969
- scenarioId: import_zod27.z.string(),
1964
+ scenarioId: import_zod26.z.string(),
1970
1965
  /** The scenario name for display */
1971
- scenarioName: import_zod27.z.string(),
1966
+ scenarioName: import_zod26.z.string(),
1972
1967
  /** The target ID (skill, agent, etc.) */
1973
- targetId: import_zod27.z.string(),
1968
+ targetId: import_zod26.z.string(),
1974
1969
  /** The target name for display */
1975
- targetName: import_zod27.z.string(),
1970
+ targetName: import_zod26.z.string(),
1976
1971
  /** Step number in the current scenario execution */
1977
- stepNumber: import_zod27.z.number(),
1972
+ stepNumber: import_zod26.z.number(),
1978
1973
  /** Type of trace event */
1979
- type: import_zod27.z.enum(LiveTraceEventType),
1974
+ type: import_zod26.z.enum(LiveTraceEventType),
1980
1975
  /** Tool name if this is a tool_use event */
1981
- toolName: import_zod27.z.string().optional(),
1976
+ toolName: import_zod26.z.string().optional(),
1982
1977
  /** Tool arguments preview (truncated JSON) */
1983
- toolArgs: import_zod27.z.string().optional(),
1978
+ toolArgs: import_zod26.z.string().optional(),
1984
1979
  /** Output preview (truncated text) */
1985
- outputPreview: import_zod27.z.string().optional(),
1980
+ outputPreview: import_zod26.z.string().optional(),
1986
1981
  /** File path for file operations */
1987
- filePath: import_zod27.z.string().optional(),
1982
+ filePath: import_zod26.z.string().optional(),
1988
1983
  /** Elapsed time in milliseconds for progress events */
1989
- elapsedMs: import_zod27.z.number().optional(),
1984
+ elapsedMs: import_zod26.z.number().optional(),
1990
1985
  /** Thinking/reasoning text from Claude */
1991
- thinking: import_zod27.z.string().optional(),
1986
+ thinking: import_zod26.z.string().optional(),
1992
1987
  /** Timestamp when this event occurred */
1993
- timestamp: import_zod27.z.string(),
1988
+ timestamp: import_zod26.z.string(),
1994
1989
  /** Whether this is the final event for this scenario */
1995
- isComplete: import_zod27.z.boolean()
1990
+ isComplete: import_zod26.z.boolean()
1996
1991
  });
1997
1992
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
1998
1993
  function parseTraceEventLine(line) {
@@ -2020,14 +2015,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
2020
2015
  TriggerType2["MANUAL"] = "MANUAL";
2021
2016
  return TriggerType2;
2022
2017
  })(TriggerType || {});
2023
- var TriggerMetadataSchema = import_zod28.z.object({
2024
- version: import_zod28.z.string().optional(),
2025
- resourceUpdated: import_zod28.z.array(import_zod28.z.string()).optional()
2018
+ var TriggerMetadataSchema = import_zod27.z.object({
2019
+ version: import_zod27.z.string().optional(),
2020
+ resourceUpdated: import_zod27.z.array(import_zod27.z.string()).optional()
2026
2021
  });
2027
- var TriggerSchema = import_zod28.z.object({
2028
- id: import_zod28.z.string(),
2022
+ var TriggerSchema = import_zod27.z.object({
2023
+ id: import_zod27.z.string(),
2029
2024
  metadata: TriggerMetadataSchema.optional(),
2030
- type: import_zod28.z.enum(TriggerType)
2025
+ type: import_zod27.z.enum(TriggerType)
2031
2026
  });
2032
2027
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
2033
2028
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -2045,28 +2040,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
2045
2040
  FailureSeverity2["LOW"] = "low";
2046
2041
  return FailureSeverity2;
2047
2042
  })(FailureSeverity || {});
2048
- var DiffLineTypeSchema = import_zod28.z.enum(["added", "removed", "unchanged"]);
2049
- var DiffLineSchema = import_zod28.z.object({
2043
+ var DiffLineTypeSchema = import_zod27.z.enum(["added", "removed", "unchanged"]);
2044
+ var DiffLineSchema = import_zod27.z.object({
2050
2045
  type: DiffLineTypeSchema,
2051
- content: import_zod28.z.string(),
2052
- lineNumber: import_zod28.z.number()
2053
- });
2054
- var DiffContentSchema = import_zod28.z.object({
2055
- path: import_zod28.z.string(),
2056
- expected: import_zod28.z.string(),
2057
- actual: import_zod28.z.string(),
2058
- diffLines: import_zod28.z.array(DiffLineSchema),
2059
- renamedFrom: import_zod28.z.string().optional()
2060
- });
2061
- var CommandExecutionSchema = import_zod28.z.object({
2062
- command: import_zod28.z.string(),
2063
- exitCode: import_zod28.z.number(),
2064
- output: import_zod28.z.string().optional(),
2065
- duration: import_zod28.z.number()
2066
- });
2067
- var FileModificationSchema = import_zod28.z.object({
2068
- path: import_zod28.z.string(),
2069
- action: import_zod28.z.enum(["created", "modified", "deleted"])
2046
+ content: import_zod27.z.string(),
2047
+ lineNumber: import_zod27.z.number()
2048
+ });
2049
+ var DiffContentSchema = import_zod27.z.object({
2050
+ path: import_zod27.z.string(),
2051
+ expected: import_zod27.z.string(),
2052
+ actual: import_zod27.z.string(),
2053
+ diffLines: import_zod27.z.array(DiffLineSchema),
2054
+ renamedFrom: import_zod27.z.string().optional()
2055
+ });
2056
+ var CommandExecutionSchema = import_zod27.z.object({
2057
+ command: import_zod27.z.string(),
2058
+ exitCode: import_zod27.z.number(),
2059
+ output: import_zod27.z.string().optional(),
2060
+ duration: import_zod27.z.number()
2061
+ });
2062
+ var FileModificationSchema = import_zod27.z.object({
2063
+ path: import_zod27.z.string(),
2064
+ action: import_zod27.z.enum(["created", "modified", "deleted"])
2070
2065
  });
2071
2066
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
2072
2067
  TemplateFileStatus2["NEW"] = "new";
@@ -2074,87 +2069,87 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
2074
2069
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
2075
2070
  return TemplateFileStatus2;
2076
2071
  })(TemplateFileStatus || {});
2077
- var TemplateFileSchema = import_zod28.z.object({
2072
+ var TemplateFileSchema = import_zod27.z.object({
2078
2073
  /** Relative path within the template */
2079
- path: import_zod28.z.string(),
2074
+ path: import_zod27.z.string(),
2080
2075
  /** Full file content after execution */
2081
- content: import_zod28.z.string(),
2076
+ content: import_zod27.z.string(),
2082
2077
  /** File status (new, modified, unchanged) */
2083
- status: import_zod28.z.enum(["new", "modified", "unchanged"])
2084
- });
2085
- var ApiCallSchema = import_zod28.z.object({
2086
- endpoint: import_zod28.z.string(),
2087
- tokensUsed: import_zod28.z.number(),
2088
- duration: import_zod28.z.number()
2089
- });
2090
- var ExecutionTraceSchema = import_zod28.z.object({
2091
- commands: import_zod28.z.array(CommandExecutionSchema),
2092
- filesModified: import_zod28.z.array(FileModificationSchema),
2093
- apiCalls: import_zod28.z.array(ApiCallSchema),
2094
- totalDuration: import_zod28.z.number()
2095
- });
2096
- var FailureAnalysisSchema = import_zod28.z.object({
2097
- category: import_zod28.z.enum(FailureCategory),
2098
- severity: import_zod28.z.enum(FailureSeverity),
2099
- summary: import_zod28.z.string(),
2100
- details: import_zod28.z.string(),
2101
- rootCause: import_zod28.z.string(),
2102
- suggestedFix: import_zod28.z.string(),
2103
- relatedAssertions: import_zod28.z.array(import_zod28.z.string()),
2104
- codeSnippet: import_zod28.z.string().optional(),
2105
- similarIssues: import_zod28.z.array(import_zod28.z.string()).optional(),
2106
- patternId: import_zod28.z.string().optional(),
2078
+ status: import_zod27.z.enum(["new", "modified", "unchanged"])
2079
+ });
2080
+ var ApiCallSchema = import_zod27.z.object({
2081
+ endpoint: import_zod27.z.string(),
2082
+ tokensUsed: import_zod27.z.number(),
2083
+ duration: import_zod27.z.number()
2084
+ });
2085
+ var ExecutionTraceSchema = import_zod27.z.object({
2086
+ commands: import_zod27.z.array(CommandExecutionSchema),
2087
+ filesModified: import_zod27.z.array(FileModificationSchema),
2088
+ apiCalls: import_zod27.z.array(ApiCallSchema),
2089
+ totalDuration: import_zod27.z.number()
2090
+ });
2091
+ var FailureAnalysisSchema = import_zod27.z.object({
2092
+ category: import_zod27.z.enum(FailureCategory),
2093
+ severity: import_zod27.z.enum(FailureSeverity),
2094
+ summary: import_zod27.z.string(),
2095
+ details: import_zod27.z.string(),
2096
+ rootCause: import_zod27.z.string(),
2097
+ suggestedFix: import_zod27.z.string(),
2098
+ relatedAssertions: import_zod27.z.array(import_zod27.z.string()),
2099
+ codeSnippet: import_zod27.z.string().optional(),
2100
+ similarIssues: import_zod27.z.array(import_zod27.z.string()).optional(),
2101
+ patternId: import_zod27.z.string().optional(),
2107
2102
  // Extended fields for detailed debugging
2108
2103
  diff: DiffContentSchema.optional(),
2109
2104
  executionTrace: ExecutionTraceSchema.optional()
2110
2105
  });
2111
2106
  var EvalRunSchema = TenantEntitySchema.extend({
2112
2107
  /** Agent ID for this run */
2113
- agentId: import_zod28.z.string().optional(),
2108
+ agentId: import_zod27.z.string().optional(),
2114
2109
  /** Preset ID that originated this run (optional) */
2115
- presetId: import_zod28.z.string().optional(),
2110
+ presetId: import_zod27.z.string().optional(),
2116
2111
  /** Skill IDs for this run */
2117
- skillIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2112
+ skillIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2118
2113
  /** Map of skillId to skillVersionId for this run */
2119
- skillVersions: import_zod28.z.record(import_zod28.z.string(), import_zod28.z.string()).optional(),
2114
+ skillVersions: import_zod27.z.record(import_zod27.z.string(), import_zod27.z.string()).optional(),
2120
2115
  /** Scenario IDs to run (always present — resolved server-side from tags when needed) */
2121
- scenarioIds: import_zod28.z.array(import_zod28.z.string()),
2116
+ scenarioIds: import_zod27.z.array(import_zod27.z.string()),
2122
2117
  /** Current status */
2123
2118
  status: EvalStatusSchema,
2124
2119
  /** Progress percentage (0-100) */
2125
- progress: import_zod28.z.number(),
2120
+ progress: import_zod27.z.number(),
2126
2121
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
2127
- results: import_zod28.z.array(import_zod28.z.lazy(() => EvalRunResultSchema)),
2122
+ results: import_zod27.z.array(import_zod27.z.lazy(() => EvalRunResultSchema)),
2128
2123
  /** Aggregated metrics across all results */
2129
2124
  aggregateMetrics: EvalMetricsSchema,
2130
2125
  /** Failure analyses */
2131
- failureAnalyses: import_zod28.z.array(FailureAnalysisSchema).optional(),
2126
+ failureAnalyses: import_zod27.z.array(FailureAnalysisSchema).optional(),
2132
2127
  /** Aggregated LLM trace summary */
2133
2128
  llmTraceSummary: LLMTraceSummarySchema.optional(),
2134
2129
  /** What triggered this run */
2135
2130
  trigger: TriggerSchema.optional(),
2136
2131
  /** When the run started (set when evaluation is triggered) */
2137
- startedAt: import_zod28.z.string().optional(),
2132
+ startedAt: import_zod27.z.string().optional(),
2138
2133
  /** When the run completed */
2139
- completedAt: import_zod28.z.string().optional(),
2134
+ completedAt: import_zod27.z.string().optional(),
2140
2135
  /** Live trace events captured during execution (for playback on results page) */
2141
- liveTraceEvents: import_zod28.z.array(LiveTraceEventSchema).optional(),
2136
+ liveTraceEvents: import_zod27.z.array(LiveTraceEventSchema).optional(),
2142
2137
  /** Remote job ID for tracking execution in Dev Machines */
2143
- jobId: import_zod28.z.string().optional(),
2138
+ jobId: import_zod27.z.string().optional(),
2144
2139
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
2145
- jobStatus: import_zod28.z.string().optional(),
2140
+ jobStatus: import_zod27.z.string().optional(),
2146
2141
  /** Remote job error message if the job failed */
2147
- jobError: import_zod28.z.string().optional(),
2142
+ jobError: import_zod27.z.string().optional(),
2148
2143
  /** Timestamp of the last job status check */
2149
- jobStatusCheckedAt: import_zod28.z.string().optional(),
2144
+ jobStatusCheckedAt: import_zod27.z.string().optional(),
2150
2145
  /** MCP server IDs to enable for this run (optional) */
2151
- mcpIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2146
+ mcpIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2152
2147
  /** Sub-agent IDs to enable for this run (optional) */
2153
- subAgentIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2148
+ subAgentIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2154
2149
  /** Rule IDs to enable for this run (optional) */
2155
- ruleIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2150
+ ruleIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2156
2151
  /** Tags used to select scenarios for this run (for traceability) */
2157
- tags: import_zod28.z.array(import_zod28.z.string()).optional()
2152
+ tags: import_zod27.z.array(import_zod27.z.string()).optional()
2158
2153
  });
2159
2154
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
2160
2155
  id: true,
@@ -2169,60 +2164,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
2169
2164
  scenarioIds: true
2170
2165
  }).extend({
2171
2166
  /** Optional on input — backend resolves from tags when not provided */
2172
- scenarioIds: import_zod28.z.array(import_zod28.z.string()).optional()
2167
+ scenarioIds: import_zod27.z.array(import_zod27.z.string()).optional()
2173
2168
  }).refine(
2174
2169
  (data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
2175
2170
  { message: "Either scenarioIds or tags must be provided" }
2176
2171
  );
2177
- var EvaluationProgressSchema = import_zod28.z.object({
2178
- runId: import_zod28.z.string(),
2179
- targetId: import_zod28.z.string(),
2180
- totalScenarios: import_zod28.z.number(),
2181
- completedScenarios: import_zod28.z.number(),
2182
- scenarioProgress: import_zod28.z.array(
2183
- import_zod28.z.object({
2184
- scenarioId: import_zod28.z.string(),
2185
- currentStep: import_zod28.z.string(),
2186
- error: import_zod28.z.string().optional()
2172
+ var EvaluationProgressSchema = import_zod27.z.object({
2173
+ runId: import_zod27.z.string(),
2174
+ targetId: import_zod27.z.string(),
2175
+ totalScenarios: import_zod27.z.number(),
2176
+ completedScenarios: import_zod27.z.number(),
2177
+ scenarioProgress: import_zod27.z.array(
2178
+ import_zod27.z.object({
2179
+ scenarioId: import_zod27.z.string(),
2180
+ currentStep: import_zod27.z.string(),
2181
+ error: import_zod27.z.string().optional()
2187
2182
  })
2188
2183
  ),
2189
- createdAt: import_zod28.z.number()
2190
- });
2191
- var EvaluationLogSchema = import_zod28.z.object({
2192
- runId: import_zod28.z.string(),
2193
- scenarioId: import_zod28.z.string(),
2194
- log: import_zod28.z.object({
2195
- level: import_zod28.z.enum(["info", "error", "debug"]),
2196
- message: import_zod28.z.string().optional(),
2197
- args: import_zod28.z.array(import_zod28.z.any()).optional(),
2198
- error: import_zod28.z.string().optional()
2184
+ createdAt: import_zod27.z.number()
2185
+ });
2186
+ var EvaluationLogSchema = import_zod27.z.object({
2187
+ runId: import_zod27.z.string(),
2188
+ scenarioId: import_zod27.z.string(),
2189
+ log: import_zod27.z.object({
2190
+ level: import_zod27.z.enum(["info", "error", "debug"]),
2191
+ message: import_zod27.z.string().optional(),
2192
+ args: import_zod27.z.array(import_zod27.z.any()).optional(),
2193
+ error: import_zod27.z.string().optional()
2199
2194
  })
2200
2195
  });
2201
2196
  var LLM_TIMEOUT = 12e4;
2202
2197
 
2203
2198
  // src/evaluation/conversation.ts
2204
- var import_zod29 = require("zod");
2205
- var TextBlockSchema = import_zod29.z.object({
2206
- type: import_zod29.z.literal("text"),
2207
- text: import_zod29.z.string()
2208
- });
2209
- var ThinkingBlockSchema = import_zod29.z.object({
2210
- type: import_zod29.z.literal("thinking"),
2211
- thinking: import_zod29.z.string()
2212
- });
2213
- var ToolUseBlockSchema = import_zod29.z.object({
2214
- type: import_zod29.z.literal("tool_use"),
2215
- toolName: import_zod29.z.string(),
2216
- toolId: import_zod29.z.string(),
2217
- input: import_zod29.z.unknown()
2218
- });
2219
- var ToolResultBlockSchema = import_zod29.z.object({
2220
- type: import_zod29.z.literal("tool_result"),
2221
- toolUseId: import_zod29.z.string(),
2222
- content: import_zod29.z.string(),
2223
- isError: import_zod29.z.boolean().optional()
2224
- });
2225
- var ConversationBlockSchema = import_zod29.z.discriminatedUnion("type", [
2199
+ var import_zod28 = require("zod");
2200
+ var TextBlockSchema = import_zod28.z.object({
2201
+ type: import_zod28.z.literal("text"),
2202
+ text: import_zod28.z.string()
2203
+ });
2204
+ var ThinkingBlockSchema = import_zod28.z.object({
2205
+ type: import_zod28.z.literal("thinking"),
2206
+ thinking: import_zod28.z.string()
2207
+ });
2208
+ var ToolUseBlockSchema = import_zod28.z.object({
2209
+ type: import_zod28.z.literal("tool_use"),
2210
+ toolName: import_zod28.z.string(),
2211
+ toolId: import_zod28.z.string(),
2212
+ input: import_zod28.z.unknown()
2213
+ });
2214
+ var ToolResultBlockSchema = import_zod28.z.object({
2215
+ type: import_zod28.z.literal("tool_result"),
2216
+ toolUseId: import_zod28.z.string(),
2217
+ content: import_zod28.z.string(),
2218
+ isError: import_zod28.z.boolean().optional()
2219
+ });
2220
+ var ConversationBlockSchema = import_zod28.z.discriminatedUnion("type", [
2226
2221
  TextBlockSchema,
2227
2222
  ThinkingBlockSchema,
2228
2223
  ToolUseBlockSchema,
@@ -2233,18 +2228,18 @@ var ConversationMessageRoles = [
2233
2228
  "user",
2234
2229
  "system"
2235
2230
  ];
2236
- var ConversationMessageSchema = import_zod29.z.object({
2237
- role: import_zod29.z.enum(ConversationMessageRoles),
2238
- content: import_zod29.z.array(ConversationBlockSchema),
2239
- timestamp: import_zod29.z.string()
2231
+ var ConversationMessageSchema = import_zod28.z.object({
2232
+ role: import_zod28.z.enum(ConversationMessageRoles),
2233
+ content: import_zod28.z.array(ConversationBlockSchema),
2234
+ timestamp: import_zod28.z.string()
2240
2235
  });
2241
- var ScenarioConversationSchema = import_zod29.z.object({
2242
- id: import_zod29.z.string(),
2243
- projectId: import_zod29.z.string(),
2244
- evalRunId: import_zod29.z.string(),
2245
- resultId: import_zod29.z.string(),
2246
- messages: import_zod29.z.array(ConversationMessageSchema),
2247
- createdAt: import_zod29.z.string()
2236
+ var ScenarioConversationSchema = import_zod28.z.object({
2237
+ id: import_zod28.z.string(),
2238
+ projectId: import_zod28.z.string(),
2239
+ evalRunId: import_zod28.z.string(),
2240
+ resultId: import_zod28.z.string(),
2241
+ messages: import_zod28.z.array(ConversationMessageSchema),
2242
+ createdAt: import_zod28.z.string()
2248
2243
  });
2249
2244
 
2250
2245
  // src/evaluation/eval-result.ts
@@ -2255,100 +2250,100 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
2255
2250
  AssertionResultStatus2["ERROR"] = "error";
2256
2251
  return AssertionResultStatus2;
2257
2252
  })(AssertionResultStatus || {});
2258
- var AssertionResultSchema = import_zod30.z.object({
2259
- id: import_zod30.z.string(),
2260
- assertionId: import_zod30.z.string(),
2261
- assertionType: import_zod30.z.string(),
2262
- assertionName: import_zod30.z.string(),
2263
- status: import_zod30.z.enum(AssertionResultStatus),
2264
- message: import_zod30.z.string().optional(),
2265
- expected: import_zod30.z.string().optional(),
2266
- actual: import_zod30.z.string().optional(),
2267
- duration: import_zod30.z.number().optional(),
2268
- details: import_zod30.z.record(import_zod30.z.string(), import_zod30.z.unknown()).optional(),
2269
- llmTraceSteps: import_zod30.z.array(LLMTraceStepSchema).optional()
2270
- });
2271
- var EvalRunResultSchema = import_zod30.z.object({
2272
- id: import_zod30.z.string(),
2273
- targetId: import_zod30.z.string(),
2274
- targetName: import_zod30.z.string().optional(),
2253
+ var AssertionResultSchema = import_zod29.z.object({
2254
+ id: import_zod29.z.string(),
2255
+ assertionId: import_zod29.z.string(),
2256
+ assertionType: import_zod29.z.string(),
2257
+ assertionName: import_zod29.z.string(),
2258
+ status: import_zod29.z.enum(AssertionResultStatus),
2259
+ message: import_zod29.z.string().optional(),
2260
+ expected: import_zod29.z.string().optional(),
2261
+ actual: import_zod29.z.string().optional(),
2262
+ duration: import_zod29.z.number().optional(),
2263
+ details: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.unknown()).optional(),
2264
+ llmTraceSteps: import_zod29.z.array(LLMTraceStepSchema).optional()
2265
+ });
2266
+ var EvalRunResultSchema = import_zod29.z.object({
2267
+ id: import_zod29.z.string(),
2268
+ targetId: import_zod29.z.string(),
2269
+ targetName: import_zod29.z.string().optional(),
2275
2270
  /** SkillVersion ID used for this evaluation (for version tracking) */
2276
- skillVersionId: import_zod30.z.string().optional(),
2271
+ skillVersionId: import_zod29.z.string().optional(),
2277
2272
  /** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
2278
- skillVersion: import_zod30.z.string().optional(),
2279
- scenarioId: import_zod30.z.string(),
2280
- scenarioName: import_zod30.z.string(),
2273
+ skillVersion: import_zod29.z.string().optional(),
2274
+ scenarioId: import_zod29.z.string(),
2275
+ scenarioName: import_zod29.z.string(),
2281
2276
  modelConfig: ModelConfigSchema.optional(),
2282
- assertionResults: import_zod30.z.array(AssertionResultSchema),
2277
+ assertionResults: import_zod29.z.array(AssertionResultSchema),
2283
2278
  metrics: EvalMetricsSchema.optional(),
2284
- passed: import_zod30.z.number(),
2285
- failed: import_zod30.z.number(),
2286
- passRate: import_zod30.z.number(),
2287
- duration: import_zod30.z.number(),
2288
- outputText: import_zod30.z.string().optional(),
2289
- files: import_zod30.z.array(ExpectedFileSchema).optional(),
2290
- fileDiffs: import_zod30.z.array(DiffContentSchema).optional(),
2279
+ passed: import_zod29.z.number(),
2280
+ failed: import_zod29.z.number(),
2281
+ passRate: import_zod29.z.number(),
2282
+ duration: import_zod29.z.number(),
2283
+ outputText: import_zod29.z.string().optional(),
2284
+ files: import_zod29.z.array(ExpectedFileSchema).optional(),
2285
+ fileDiffs: import_zod29.z.array(DiffContentSchema).optional(),
2291
2286
  /** Full template files after execution with status indicators */
2292
- templateFiles: import_zod30.z.array(TemplateFileSchema).optional(),
2293
- startedAt: import_zod30.z.string().optional(),
2294
- completedAt: import_zod30.z.string().optional(),
2287
+ templateFiles: import_zod29.z.array(TemplateFileSchema).optional(),
2288
+ startedAt: import_zod29.z.string().optional(),
2289
+ completedAt: import_zod29.z.string().optional(),
2295
2290
  llmTrace: LLMTraceSchema.optional(),
2296
2291
  /** Full conversation messages (only present in transit; stripped before DB storage) */
2297
- conversation: import_zod30.z.array(ConversationMessageSchema).optional()
2298
- });
2299
- var PromptResultSchema = import_zod30.z.object({
2300
- text: import_zod30.z.string(),
2301
- files: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2302
- finishReason: import_zod30.z.string().optional(),
2303
- reasoning: import_zod30.z.string().optional(),
2304
- reasoningDetails: import_zod30.z.unknown().optional(),
2305
- toolCalls: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2306
- toolResults: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2307
- warnings: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2308
- sources: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2309
- steps: import_zod30.z.array(import_zod30.z.unknown()),
2310
- generationTimeMs: import_zod30.z.number(),
2311
- prompt: import_zod30.z.string(),
2312
- systemPrompt: import_zod30.z.string(),
2313
- usage: import_zod30.z.object({
2314
- totalTokens: import_zod30.z.number().optional(),
2315
- totalMicrocentsSpent: import_zod30.z.number().optional()
2292
+ conversation: import_zod29.z.array(ConversationMessageSchema).optional()
2293
+ });
2294
+ var PromptResultSchema = import_zod29.z.object({
2295
+ text: import_zod29.z.string(),
2296
+ files: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2297
+ finishReason: import_zod29.z.string().optional(),
2298
+ reasoning: import_zod29.z.string().optional(),
2299
+ reasoningDetails: import_zod29.z.unknown().optional(),
2300
+ toolCalls: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2301
+ toolResults: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2302
+ warnings: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2303
+ sources: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2304
+ steps: import_zod29.z.array(import_zod29.z.unknown()),
2305
+ generationTimeMs: import_zod29.z.number(),
2306
+ prompt: import_zod29.z.string(),
2307
+ systemPrompt: import_zod29.z.string(),
2308
+ usage: import_zod29.z.object({
2309
+ totalTokens: import_zod29.z.number().optional(),
2310
+ totalMicrocentsSpent: import_zod29.z.number().optional()
2316
2311
  })
2317
2312
  });
2318
- var EvaluationResultSchema = import_zod30.z.object({
2319
- id: import_zod30.z.string(),
2320
- runId: import_zod30.z.string(),
2321
- timestamp: import_zod30.z.number(),
2313
+ var EvaluationResultSchema = import_zod29.z.object({
2314
+ id: import_zod29.z.string(),
2315
+ runId: import_zod29.z.string(),
2316
+ timestamp: import_zod29.z.number(),
2322
2317
  promptResult: PromptResultSchema,
2323
- testResults: import_zod30.z.array(import_zod30.z.unknown()),
2324
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
2325
- feedback: import_zod30.z.string().optional(),
2326
- score: import_zod30.z.number(),
2327
- suiteId: import_zod30.z.string().optional()
2328
- });
2329
- var LeanEvaluationResultSchema = import_zod30.z.object({
2330
- id: import_zod30.z.string(),
2331
- runId: import_zod30.z.string(),
2332
- timestamp: import_zod30.z.number(),
2333
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
2334
- scenarioId: import_zod30.z.string(),
2335
- scenarioVersion: import_zod30.z.number().optional(),
2336
- targetId: import_zod30.z.string(),
2337
- targetVersion: import_zod30.z.number().optional(),
2338
- suiteId: import_zod30.z.string().optional(),
2339
- score: import_zod30.z.number(),
2340
- time: import_zod30.z.number().optional(),
2341
- microcentsSpent: import_zod30.z.number().optional()
2318
+ testResults: import_zod29.z.array(import_zod29.z.unknown()),
2319
+ tags: import_zod29.z.array(import_zod29.z.string()).optional(),
2320
+ feedback: import_zod29.z.string().optional(),
2321
+ score: import_zod29.z.number(),
2322
+ suiteId: import_zod29.z.string().optional()
2323
+ });
2324
+ var LeanEvaluationResultSchema = import_zod29.z.object({
2325
+ id: import_zod29.z.string(),
2326
+ runId: import_zod29.z.string(),
2327
+ timestamp: import_zod29.z.number(),
2328
+ tags: import_zod29.z.array(import_zod29.z.string()).optional(),
2329
+ scenarioId: import_zod29.z.string(),
2330
+ scenarioVersion: import_zod29.z.number().optional(),
2331
+ targetId: import_zod29.z.string(),
2332
+ targetVersion: import_zod29.z.number().optional(),
2333
+ suiteId: import_zod29.z.string().optional(),
2334
+ score: import_zod29.z.number(),
2335
+ time: import_zod29.z.number().optional(),
2336
+ microcentsSpent: import_zod29.z.number().optional()
2342
2337
  });
2343
2338
 
2344
2339
  // src/project/project.ts
2345
- var import_zod31 = require("zod");
2340
+ var import_zod30 = require("zod");
2346
2341
  var ProjectSchema = BaseEntitySchema.extend({
2347
- appId: import_zod31.z.string().optional().describe("The ID of the app in Dev Center"),
2348
- appSecret: import_zod31.z.string().optional().describe("The secret of the app in Dev Center"),
2349
- useWixAuth: import_zod31.z.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
2350
- useBase44Auth: import_zod31.z.boolean().optional().describe("Enable Base44 auth for evaluations"),
2351
- scenarioTags: import_zod31.z.array(import_zod31.z.string()).optional().describe("Project-level tag vocabulary for scenarios")
2342
+ appId: import_zod30.z.string().optional().describe("The ID of the app in Dev Center"),
2343
+ appSecret: import_zod30.z.string().optional().describe("The secret of the app in Dev Center"),
2344
+ useWixAuth: import_zod30.z.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
2345
+ useBase44Auth: import_zod30.z.boolean().optional().describe("Enable Base44 auth for evaluations"),
2346
+ scenarioTags: import_zod30.z.array(import_zod30.z.string()).optional().describe("Project-level tag vocabulary for scenarios")
2352
2347
  });
2353
2348
  var CreateProjectInputSchema = ProjectSchema.omit({
2354
2349
  id: true,
@@ -2401,7 +2396,7 @@ var SYSTEM_ASSERTIONS = {
2401
2396
  [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
2402
2397
  id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
2403
2398
  name: "Tool Called With Param",
2404
- description: "Check that a tool was called with expected parameters",
2399
+ description: "Check that a tool was called with expected parameters (tool name is substring matched)",
2405
2400
  type: "tool_called_with_param",
2406
2401
  parameters: [
2407
2402
  {
@@ -2415,6 +2410,14 @@ var SYSTEM_ASSERTIONS = {
2415
2410
  label: "Expected Parameters (JSON, substring match)",
2416
2411
  type: "string",
2417
2412
  required: true
2413
+ },
2414
+ {
2415
+ name: "requireSuccess",
2416
+ label: "Require Successful Call",
2417
+ type: "boolean",
2418
+ required: false,
2419
+ defaultValue: false,
2420
+ advanced: true
2418
2421
  }
2419
2422
  ]
2420
2423
  },
@@ -2547,6 +2550,9 @@ function getSystemAssertion(id) {
2547
2550
  BuildCheckTestSchema,
2548
2551
  BuildPassedAssertionSchema,
2549
2552
  BuildPassedConfigSchema,
2553
+ BulkImportResultItemSchema,
2554
+ BulkImportResultSchema,
2555
+ BulkImportSkillsInputSchema,
2550
2556
  ClaudeModel,
2551
2557
  ClaudeModelSchema,
2552
2558
  CommandExecutionSchema,