@wix/evalforge-types 0.53.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1609,85 +1609,34 @@ var TestSchema = import_zod20.z.discriminatedUnion("type", [
1609
1609
  PlaywrightNLTestSchema
1610
1610
  ]);
1611
1611
 
1612
- // src/scenario/assertions.ts
1613
- var import_zod21 = require("zod");
1614
- var SkillWasCalledAssertionSchema = import_zod21.z.object({
1615
- type: import_zod21.z.literal("skill_was_called"),
1616
- /** Names of the skills that must have been called (matched against trace Skill tool args) */
1617
- skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
1618
- });
1619
- var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
1620
- type: import_zod21.z.literal("tool_called_with_param"),
1621
- /** Name of the tool that must have been called */
1622
- toolName: import_zod21.z.string().min(1),
1623
- /** JSON string of key-value pairs for expected parameters (substring match) */
1624
- expectedParams: import_zod21.z.string().min(1)
1625
- });
1626
- var BuildPassedAssertionSchema = import_zod21.z.object({
1627
- type: import_zod21.z.literal("build_passed"),
1628
- /** Command to run (default: "yarn build") */
1629
- command: import_zod21.z.string().optional(),
1630
- /** Expected exit code (default: 0) */
1631
- expectedExitCode: import_zod21.z.number().int().optional()
1632
- });
1633
- var CostAssertionSchema = import_zod21.z.object({
1634
- type: import_zod21.z.literal("cost"),
1635
- /** Maximum allowed cost in USD */
1636
- maxCostUsd: import_zod21.z.number().positive()
1637
- });
1638
- var LlmJudgeAssertionSchema = import_zod21.z.object({
1639
- type: import_zod21.z.literal("llm_judge"),
1640
- /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
1641
- prompt: import_zod21.z.string(),
1642
- /** Minimum score to pass (0-10, default 7) */
1643
- minScore: import_zod21.z.number().int().min(0).max(10).optional(),
1644
- /** Model for the judge (e.g. claude-3-5-haiku) */
1645
- model: import_zod21.z.string().optional(),
1646
- maxTokens: import_zod21.z.number().int().optional(),
1647
- temperature: import_zod21.z.number().min(0).max(1).optional()
1648
- });
1649
- var TimeAssertionSchema = import_zod21.z.object({
1650
- type: import_zod21.z.literal("time_limit"),
1651
- /** Maximum allowed duration in milliseconds */
1652
- maxDurationMs: import_zod21.z.number().int().positive()
1653
- });
1654
- var AssertionSchema = import_zod21.z.union([
1655
- SkillWasCalledAssertionSchema,
1656
- ToolCalledWithParamAssertionSchema,
1657
- BuildPassedAssertionSchema,
1658
- TimeAssertionSchema,
1659
- CostAssertionSchema,
1660
- LlmJudgeAssertionSchema
1661
- ]);
1662
-
1663
1612
  // src/scenario/environment.ts
1664
- var import_zod22 = require("zod");
1665
- var LocalProjectConfigSchema = import_zod22.z.object({
1613
+ var import_zod21 = require("zod");
1614
+ var LocalProjectConfigSchema = import_zod21.z.object({
1666
1615
  /** Template ID to use for the local project */
1667
- templateId: import_zod22.z.string().optional(),
1616
+ templateId: import_zod21.z.string().optional(),
1668
1617
  /** Files to create in the project */
1669
- files: import_zod22.z.array(
1670
- import_zod22.z.object({
1671
- path: import_zod22.z.string().min(1),
1672
- content: import_zod22.z.string().min(1)
1618
+ files: import_zod21.z.array(
1619
+ import_zod21.z.object({
1620
+ path: import_zod21.z.string().min(1),
1621
+ content: import_zod21.z.string().min(1)
1673
1622
  })
1674
1623
  ).optional()
1675
1624
  });
1676
- var MetaSiteConfigSchema = import_zod22.z.object({
1677
- configurations: import_zod22.z.array(
1678
- import_zod22.z.object({
1679
- name: import_zod22.z.string().min(1),
1680
- apiCalls: import_zod22.z.array(
1681
- import_zod22.z.object({
1682
- url: import_zod22.z.string().url(),
1683
- method: import_zod22.z.enum(["POST", "PUT"]),
1684
- body: import_zod22.z.string()
1625
+ var MetaSiteConfigSchema = import_zod21.z.object({
1626
+ configurations: import_zod21.z.array(
1627
+ import_zod21.z.object({
1628
+ name: import_zod21.z.string().min(1),
1629
+ apiCalls: import_zod21.z.array(
1630
+ import_zod21.z.object({
1631
+ url: import_zod21.z.string().url(),
1632
+ method: import_zod21.z.enum(["POST", "PUT"]),
1633
+ body: import_zod21.z.string()
1685
1634
  })
1686
1635
  )
1687
1636
  })
1688
1637
  ).optional()
1689
1638
  });
1690
- var EnvironmentSchema = import_zod22.z.object({
1639
+ var EnvironmentSchema = import_zod21.z.object({
1691
1640
  /** Local project configuration */
1692
1641
  localProject: LocalProjectConfigSchema.optional(),
1693
1642
  /** Meta site configuration */
@@ -1695,11 +1644,11 @@ var EnvironmentSchema = import_zod22.z.object({
1695
1644
  });
1696
1645
 
1697
1646
  // src/scenario/test-scenario.ts
1698
- var import_zod24 = require("zod");
1647
+ var import_zod23 = require("zod");
1699
1648
 
1700
1649
  // src/assertion/assertion.ts
1701
- var import_zod23 = require("zod");
1702
- var AssertionTypeSchema = import_zod23.z.enum([
1650
+ var import_zod22 = require("zod");
1651
+ var AssertionTypeSchema = import_zod22.z.enum([
1703
1652
  "skill_was_called",
1704
1653
  "tool_called_with_param",
1705
1654
  "build_passed",
@@ -1707,59 +1656,61 @@ var AssertionTypeSchema = import_zod23.z.enum([
1707
1656
  "cost",
1708
1657
  "llm_judge"
1709
1658
  ]);
1710
- var AssertionParameterTypeSchema = import_zod23.z.enum([
1659
+ var AssertionParameterTypeSchema = import_zod22.z.enum([
1711
1660
  "string",
1712
1661
  "number",
1713
1662
  "boolean"
1714
1663
  ]);
1715
- var AssertionParameterSchema = import_zod23.z.object({
1664
+ var AssertionParameterSchema = import_zod22.z.object({
1716
1665
  /** Parameter name (used as key in params object) */
1717
- name: import_zod23.z.string().min(1),
1666
+ name: import_zod22.z.string().min(1),
1718
1667
  /** Display label for the parameter */
1719
- label: import_zod23.z.string().min(1),
1668
+ label: import_zod22.z.string().min(1),
1720
1669
  /** Parameter type */
1721
1670
  type: AssertionParameterTypeSchema,
1722
1671
  /** Whether this parameter is required */
1723
- required: import_zod23.z.boolean(),
1672
+ required: import_zod22.z.boolean(),
1724
1673
  /** Default value (optional, used when not provided) */
1725
- defaultValue: import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean()]).optional(),
1674
+ defaultValue: import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean()]).optional(),
1726
1675
  /** If true, parameter is hidden by default behind "Show advanced options" */
1727
- advanced: import_zod23.z.boolean().optional()
1676
+ advanced: import_zod22.z.boolean().optional()
1728
1677
  });
1729
- var ScenarioAssertionLinkSchema = import_zod23.z.object({
1678
+ var ScenarioAssertionLinkSchema = import_zod22.z.object({
1730
1679
  /** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
1731
- assertionId: import_zod23.z.string(),
1680
+ assertionId: import_zod22.z.string(),
1732
1681
  /** Parameter values for this assertion in this scenario */
1733
- params: import_zod23.z.record(
1734
- import_zod23.z.string(),
1735
- import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean(), import_zod23.z.null()])
1682
+ params: import_zod22.z.record(
1683
+ import_zod22.z.string(),
1684
+ import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean(), import_zod22.z.null()])
1736
1685
  ).optional()
1737
1686
  });
1738
- var SkillWasCalledConfigSchema = import_zod23.z.object({
1687
+ var SkillWasCalledConfigSchema = import_zod22.z.object({
1739
1688
  /** Names of the skills that must have been called */
1740
- skillNames: import_zod23.z.array(import_zod23.z.string().min(1)).min(1)
1689
+ skillNames: import_zod22.z.array(import_zod22.z.string().min(1)).min(1)
1741
1690
  });
1742
- var CostConfigSchema = import_zod23.z.strictObject({
1691
+ var CostConfigSchema = import_zod22.z.strictObject({
1743
1692
  /** Maximum allowed cost in USD */
1744
- maxCostUsd: import_zod23.z.number().positive()
1693
+ maxCostUsd: import_zod22.z.number().positive()
1745
1694
  });
1746
- var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
1695
+ var ToolCalledWithParamConfigSchema = import_zod22.z.strictObject({
1747
1696
  /** Name of the tool that must have been called */
1748
- toolName: import_zod23.z.string().min(1),
1697
+ toolName: import_zod22.z.string().min(1),
1749
1698
  /** JSON string of key-value pairs for expected parameters (substring match) */
1750
- expectedParams: import_zod23.z.string().min(1)
1699
+ expectedParams: import_zod22.z.string().min(1),
1700
+ /** If true, the matching tool call must also have succeeded (step.success === true) */
1701
+ requireSuccess: import_zod22.z.boolean().optional()
1751
1702
  });
1752
- var BuildPassedConfigSchema = import_zod23.z.strictObject({
1703
+ var BuildPassedConfigSchema = import_zod22.z.strictObject({
1753
1704
  /** Command to run (default: "yarn build") */
1754
- command: import_zod23.z.string().optional(),
1705
+ command: import_zod22.z.string().optional(),
1755
1706
  /** Expected exit code (default: 0) */
1756
- expectedExitCode: import_zod23.z.number().int().optional()
1707
+ expectedExitCode: import_zod22.z.number().int().optional()
1757
1708
  });
1758
- var TimeConfigSchema = import_zod23.z.strictObject({
1709
+ var TimeConfigSchema = import_zod22.z.strictObject({
1759
1710
  /** Maximum allowed duration in milliseconds */
1760
- maxDurationMs: import_zod23.z.number().int().positive()
1711
+ maxDurationMs: import_zod22.z.number().int().positive()
1761
1712
  });
1762
- var LlmJudgeConfigSchema = import_zod23.z.object({
1713
+ var LlmJudgeConfigSchema = import_zod22.z.object({
1763
1714
  /**
1764
1715
  * Prompt template with placeholders:
1765
1716
  * - {{output}}: agent's final output
@@ -1770,19 +1721,45 @@ var LlmJudgeConfigSchema = import_zod23.z.object({
1770
1721
  * - {{trace}}: step-by-step trace of tool calls
1771
1722
  * - Custom parameters defined in the parameters array
1772
1723
  */
1773
- prompt: import_zod23.z.string().min(1),
1724
+ prompt: import_zod22.z.string().min(1),
1774
1725
  /** Minimum score to pass (0-10, default 7) */
1775
- minScore: import_zod23.z.number().int().min(0).max(10).optional(),
1726
+ minScore: import_zod22.z.number().int().min(0).max(10).optional(),
1776
1727
  /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
1777
- model: import_zod23.z.string().optional(),
1728
+ model: import_zod22.z.string().optional(),
1778
1729
  /** Max output tokens */
1779
- maxTokens: import_zod23.z.number().int().optional(),
1730
+ maxTokens: import_zod22.z.number().int().optional(),
1780
1731
  /** Temperature (0-1) */
1781
- temperature: import_zod23.z.number().min(0).max(1).optional(),
1732
+ temperature: import_zod22.z.number().min(0).max(1).optional(),
1782
1733
  /** User-defined parameters for this assertion */
1783
- parameters: import_zod23.z.array(AssertionParameterSchema).optional()
1734
+ parameters: import_zod22.z.array(AssertionParameterSchema).optional()
1735
+ });
1736
+ var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
1737
+ type: import_zod22.z.literal("skill_was_called")
1738
+ });
1739
+ var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
1740
+ type: import_zod22.z.literal("tool_called_with_param")
1741
+ });
1742
+ var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
1743
+ type: import_zod22.z.literal("build_passed")
1744
+ });
1745
+ var CostAssertionSchema = CostConfigSchema.extend({
1746
+ type: import_zod22.z.literal("cost")
1747
+ });
1748
+ var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
1749
+ type: import_zod22.z.literal("llm_judge")
1784
1750
  });
1785
- var AssertionConfigSchema = import_zod23.z.union([
1751
+ var TimeAssertionSchema = TimeConfigSchema.extend({
1752
+ type: import_zod22.z.literal("time_limit")
1753
+ });
1754
+ var AssertionSchema = import_zod22.z.union([
1755
+ SkillWasCalledAssertionSchema,
1756
+ ToolCalledWithParamAssertionSchema,
1757
+ BuildPassedAssertionSchema,
1758
+ TimeAssertionSchema,
1759
+ CostAssertionSchema,
1760
+ LlmJudgeAssertionSchema
1761
+ ]);
1762
+ var AssertionConfigSchema = import_zod22.z.union([
1786
1763
  LlmJudgeConfigSchema,
1787
1764
  // requires prompt - check first
1788
1765
  SkillWasCalledConfigSchema,
@@ -1795,7 +1772,7 @@ var AssertionConfigSchema = import_zod23.z.union([
1795
1772
  // requires maxCostUsd, uses strictObject
1796
1773
  BuildPassedConfigSchema,
1797
1774
  // all optional, uses strictObject to reject unknown keys
1798
- import_zod23.z.object({})
1775
+ import_zod22.z.object({})
1799
1776
  // fallback empty config
1800
1777
  ]);
1801
1778
  var CustomAssertionSchema = TenantEntitySchema.extend({
@@ -1846,25 +1823,25 @@ function getLlmJudgeConfig(assertion) {
1846
1823
  }
1847
1824
 
1848
1825
  // src/scenario/test-scenario.ts
1849
- var ExpectedFileSchema = import_zod24.z.object({
1826
+ var ExpectedFileSchema = import_zod23.z.object({
1850
1827
  /** Relative path where the file should be created */
1851
- path: import_zod24.z.string(),
1828
+ path: import_zod23.z.string(),
1852
1829
  /** Optional expected content */
1853
- content: import_zod24.z.string().optional()
1830
+ content: import_zod23.z.string().optional()
1854
1831
  });
1855
1832
  var TestScenarioSchema = TenantEntitySchema.extend({
1856
1833
  /** The prompt sent to the agent to trigger the task */
1857
- triggerPrompt: import_zod24.z.string().min(10),
1834
+ triggerPrompt: import_zod23.z.string().min(10),
1858
1835
  /** ID of the template to use for this scenario (null = no template) */
1859
- templateId: import_zod24.z.string().nullish(),
1836
+ templateId: import_zod23.z.string().nullish(),
1860
1837
  /** Inline assertions to evaluate for this scenario (legacy) */
1861
- assertions: import_zod24.z.array(AssertionSchema).optional(),
1838
+ assertions: import_zod23.z.array(AssertionSchema).optional(),
1862
1839
  /** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
1863
- assertionIds: import_zod24.z.array(import_zod24.z.string()).optional(),
1840
+ assertionIds: import_zod23.z.array(import_zod23.z.string()).optional(),
1864
1841
  /** Linked assertions with per-scenario parameter values */
1865
- assertionLinks: import_zod24.z.array(ScenarioAssertionLinkSchema).optional(),
1842
+ assertionLinks: import_zod23.z.array(ScenarioAssertionLinkSchema).optional(),
1866
1843
  /** Tags for categorisation and filtering */
1867
- tags: import_zod24.z.array(import_zod24.z.string()).optional()
1844
+ tags: import_zod23.z.array(import_zod23.z.string()).optional()
1868
1845
  });
1869
1846
  var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
1870
1847
  id: true,
@@ -1875,10 +1852,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
1875
1852
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
1876
1853
 
1877
1854
  // src/suite/test-suite.ts
1878
- var import_zod25 = require("zod");
1855
+ var import_zod24 = require("zod");
1879
1856
  var TestSuiteSchema = TenantEntitySchema.extend({
1880
1857
  /** IDs of test scenarios in this suite */
1881
- scenarioIds: import_zod25.z.array(import_zod25.z.string())
1858
+ scenarioIds: import_zod24.z.array(import_zod24.z.string())
1882
1859
  });
1883
1860
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1884
1861
  id: true,
@@ -1889,21 +1866,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1889
1866
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
1890
1867
 
1891
1868
  // src/evaluation/metrics.ts
1892
- var import_zod26 = require("zod");
1893
- var TokenUsageSchema = import_zod26.z.object({
1894
- prompt: import_zod26.z.number(),
1895
- completion: import_zod26.z.number(),
1896
- total: import_zod26.z.number()
1897
- });
1898
- var EvalMetricsSchema = import_zod26.z.object({
1899
- totalAssertions: import_zod26.z.number(),
1900
- passed: import_zod26.z.number(),
1901
- failed: import_zod26.z.number(),
1902
- skipped: import_zod26.z.number(),
1903
- errors: import_zod26.z.number(),
1904
- passRate: import_zod26.z.number(),
1905
- avgDuration: import_zod26.z.number(),
1906
- totalDuration: import_zod26.z.number()
1869
+ var import_zod25 = require("zod");
1870
+ var TokenUsageSchema = import_zod25.z.object({
1871
+ prompt: import_zod25.z.number(),
1872
+ completion: import_zod25.z.number(),
1873
+ total: import_zod25.z.number()
1874
+ });
1875
+ var EvalMetricsSchema = import_zod25.z.object({
1876
+ totalAssertions: import_zod25.z.number(),
1877
+ passed: import_zod25.z.number(),
1878
+ failed: import_zod25.z.number(),
1879
+ skipped: import_zod25.z.number(),
1880
+ errors: import_zod25.z.number(),
1881
+ passRate: import_zod25.z.number(),
1882
+ avgDuration: import_zod25.z.number(),
1883
+ totalDuration: import_zod25.z.number()
1907
1884
  });
1908
1885
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1909
1886
  EvalStatus2["PENDING"] = "pending";
@@ -1913,7 +1890,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1913
1890
  EvalStatus2["CANCELLED"] = "cancelled";
1914
1891
  return EvalStatus2;
1915
1892
  })(EvalStatus || {});
1916
- var EvalStatusSchema = import_zod26.z.enum(EvalStatus);
1893
+ var EvalStatusSchema = import_zod25.z.enum(EvalStatus);
1917
1894
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1918
1895
  LLMStepType2["COMPLETION"] = "completion";
1919
1896
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -1921,52 +1898,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1921
1898
  LLMStepType2["THINKING"] = "thinking";
1922
1899
  return LLMStepType2;
1923
1900
  })(LLMStepType || {});
1924
- var LLMTraceStepSchema = import_zod26.z.object({
1925
- id: import_zod26.z.string(),
1926
- stepNumber: import_zod26.z.number(),
1927
- type: import_zod26.z.enum(LLMStepType),
1928
- model: import_zod26.z.string(),
1929
- provider: import_zod26.z.string(),
1930
- startedAt: import_zod26.z.string(),
1931
- durationMs: import_zod26.z.number(),
1901
+ var LLMTraceStepSchema = import_zod25.z.object({
1902
+ id: import_zod25.z.string(),
1903
+ stepNumber: import_zod25.z.number(),
1904
+ type: import_zod25.z.enum(LLMStepType),
1905
+ model: import_zod25.z.string(),
1906
+ provider: import_zod25.z.string(),
1907
+ startedAt: import_zod25.z.string(),
1908
+ durationMs: import_zod25.z.number(),
1932
1909
  tokenUsage: TokenUsageSchema,
1933
- costUsd: import_zod26.z.number(),
1934
- toolName: import_zod26.z.string().optional(),
1935
- toolArguments: import_zod26.z.string().optional(),
1936
- inputPreview: import_zod26.z.string().optional(),
1937
- outputPreview: import_zod26.z.string().optional(),
1938
- success: import_zod26.z.boolean(),
1939
- error: import_zod26.z.string().optional()
1940
- });
1941
- var LLMBreakdownStatsSchema = import_zod26.z.object({
1942
- count: import_zod26.z.number(),
1943
- durationMs: import_zod26.z.number(),
1944
- tokens: import_zod26.z.number(),
1945
- costUsd: import_zod26.z.number()
1946
- });
1947
- var LLMTraceSummarySchema = import_zod26.z.object({
1948
- totalSteps: import_zod26.z.number(),
1949
- totalDurationMs: import_zod26.z.number(),
1910
+ costUsd: import_zod25.z.number(),
1911
+ toolName: import_zod25.z.string().optional(),
1912
+ toolArguments: import_zod25.z.string().optional(),
1913
+ inputPreview: import_zod25.z.string().optional(),
1914
+ outputPreview: import_zod25.z.string().optional(),
1915
+ success: import_zod25.z.boolean(),
1916
+ error: import_zod25.z.string().optional()
1917
+ });
1918
+ var LLMBreakdownStatsSchema = import_zod25.z.object({
1919
+ count: import_zod25.z.number(),
1920
+ durationMs: import_zod25.z.number(),
1921
+ tokens: import_zod25.z.number(),
1922
+ costUsd: import_zod25.z.number()
1923
+ });
1924
+ var LLMTraceSummarySchema = import_zod25.z.object({
1925
+ totalSteps: import_zod25.z.number(),
1926
+ totalDurationMs: import_zod25.z.number(),
1950
1927
  totalTokens: TokenUsageSchema,
1951
- totalCostUsd: import_zod26.z.number(),
1952
- stepTypeBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema).optional(),
1953
- modelBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema),
1954
- modelsUsed: import_zod26.z.array(import_zod26.z.string())
1955
- });
1956
- var LLMTraceSchema = import_zod26.z.object({
1957
- id: import_zod26.z.string(),
1958
- steps: import_zod26.z.array(LLMTraceStepSchema),
1928
+ totalCostUsd: import_zod25.z.number(),
1929
+ stepTypeBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema).optional(),
1930
+ modelBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema),
1931
+ modelsUsed: import_zod25.z.array(import_zod25.z.string())
1932
+ });
1933
+ var LLMTraceSchema = import_zod25.z.object({
1934
+ id: import_zod25.z.string(),
1935
+ steps: import_zod25.z.array(LLMTraceStepSchema),
1959
1936
  summary: LLMTraceSummarySchema
1960
1937
  });
1961
1938
 
1962
1939
  // src/evaluation/eval-result.ts
1963
- var import_zod30 = require("zod");
1940
+ var import_zod29 = require("zod");
1964
1941
 
1965
1942
  // src/evaluation/eval-run.ts
1966
- var import_zod28 = require("zod");
1943
+ var import_zod27 = require("zod");
1967
1944
 
1968
1945
  // src/evaluation/live-trace.ts
1969
- var import_zod27 = require("zod");
1946
+ var import_zod26 = require("zod");
1970
1947
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1971
1948
  LiveTraceEventType2["THINKING"] = "thinking";
1972
1949
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -1980,37 +1957,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1980
1957
  LiveTraceEventType2["USER"] = "user";
1981
1958
  return LiveTraceEventType2;
1982
1959
  })(LiveTraceEventType || {});
1983
- var LiveTraceEventSchema = import_zod27.z.object({
1960
+ var LiveTraceEventSchema = import_zod26.z.object({
1984
1961
  /** The evaluation run ID */
1985
- evalRunId: import_zod27.z.string(),
1962
+ evalRunId: import_zod26.z.string(),
1986
1963
  /** The scenario ID being executed */
1987
- scenarioId: import_zod27.z.string(),
1964
+ scenarioId: import_zod26.z.string(),
1988
1965
  /** The scenario name for display */
1989
- scenarioName: import_zod27.z.string(),
1966
+ scenarioName: import_zod26.z.string(),
1990
1967
  /** The target ID (skill, agent, etc.) */
1991
- targetId: import_zod27.z.string(),
1968
+ targetId: import_zod26.z.string(),
1992
1969
  /** The target name for display */
1993
- targetName: import_zod27.z.string(),
1970
+ targetName: import_zod26.z.string(),
1994
1971
  /** Step number in the current scenario execution */
1995
- stepNumber: import_zod27.z.number(),
1972
+ stepNumber: import_zod26.z.number(),
1996
1973
  /** Type of trace event */
1997
- type: import_zod27.z.enum(LiveTraceEventType),
1974
+ type: import_zod26.z.enum(LiveTraceEventType),
1998
1975
  /** Tool name if this is a tool_use event */
1999
- toolName: import_zod27.z.string().optional(),
1976
+ toolName: import_zod26.z.string().optional(),
2000
1977
  /** Tool arguments preview (truncated JSON) */
2001
- toolArgs: import_zod27.z.string().optional(),
1978
+ toolArgs: import_zod26.z.string().optional(),
2002
1979
  /** Output preview (truncated text) */
2003
- outputPreview: import_zod27.z.string().optional(),
1980
+ outputPreview: import_zod26.z.string().optional(),
2004
1981
  /** File path for file operations */
2005
- filePath: import_zod27.z.string().optional(),
1982
+ filePath: import_zod26.z.string().optional(),
2006
1983
  /** Elapsed time in milliseconds for progress events */
2007
- elapsedMs: import_zod27.z.number().optional(),
1984
+ elapsedMs: import_zod26.z.number().optional(),
2008
1985
  /** Thinking/reasoning text from Claude */
2009
- thinking: import_zod27.z.string().optional(),
1986
+ thinking: import_zod26.z.string().optional(),
2010
1987
  /** Timestamp when this event occurred */
2011
- timestamp: import_zod27.z.string(),
1988
+ timestamp: import_zod26.z.string(),
2012
1989
  /** Whether this is the final event for this scenario */
2013
- isComplete: import_zod27.z.boolean()
1990
+ isComplete: import_zod26.z.boolean()
2014
1991
  });
2015
1992
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
2016
1993
  function parseTraceEventLine(line) {
@@ -2038,14 +2015,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
2038
2015
  TriggerType2["MANUAL"] = "MANUAL";
2039
2016
  return TriggerType2;
2040
2017
  })(TriggerType || {});
2041
- var TriggerMetadataSchema = import_zod28.z.object({
2042
- version: import_zod28.z.string().optional(),
2043
- resourceUpdated: import_zod28.z.array(import_zod28.z.string()).optional()
2018
+ var TriggerMetadataSchema = import_zod27.z.object({
2019
+ version: import_zod27.z.string().optional(),
2020
+ resourceUpdated: import_zod27.z.array(import_zod27.z.string()).optional()
2044
2021
  });
2045
- var TriggerSchema = import_zod28.z.object({
2046
- id: import_zod28.z.string(),
2022
+ var TriggerSchema = import_zod27.z.object({
2023
+ id: import_zod27.z.string(),
2047
2024
  metadata: TriggerMetadataSchema.optional(),
2048
- type: import_zod28.z.enum(TriggerType)
2025
+ type: import_zod27.z.enum(TriggerType)
2049
2026
  });
2050
2027
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
2051
2028
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -2063,28 +2040,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
2063
2040
  FailureSeverity2["LOW"] = "low";
2064
2041
  return FailureSeverity2;
2065
2042
  })(FailureSeverity || {});
2066
- var DiffLineTypeSchema = import_zod28.z.enum(["added", "removed", "unchanged"]);
2067
- var DiffLineSchema = import_zod28.z.object({
2043
+ var DiffLineTypeSchema = import_zod27.z.enum(["added", "removed", "unchanged"]);
2044
+ var DiffLineSchema = import_zod27.z.object({
2068
2045
  type: DiffLineTypeSchema,
2069
- content: import_zod28.z.string(),
2070
- lineNumber: import_zod28.z.number()
2071
- });
2072
- var DiffContentSchema = import_zod28.z.object({
2073
- path: import_zod28.z.string(),
2074
- expected: import_zod28.z.string(),
2075
- actual: import_zod28.z.string(),
2076
- diffLines: import_zod28.z.array(DiffLineSchema),
2077
- renamedFrom: import_zod28.z.string().optional()
2078
- });
2079
- var CommandExecutionSchema = import_zod28.z.object({
2080
- command: import_zod28.z.string(),
2081
- exitCode: import_zod28.z.number(),
2082
- output: import_zod28.z.string().optional(),
2083
- duration: import_zod28.z.number()
2084
- });
2085
- var FileModificationSchema = import_zod28.z.object({
2086
- path: import_zod28.z.string(),
2087
- action: import_zod28.z.enum(["created", "modified", "deleted"])
2046
+ content: import_zod27.z.string(),
2047
+ lineNumber: import_zod27.z.number()
2048
+ });
2049
+ var DiffContentSchema = import_zod27.z.object({
2050
+ path: import_zod27.z.string(),
2051
+ expected: import_zod27.z.string(),
2052
+ actual: import_zod27.z.string(),
2053
+ diffLines: import_zod27.z.array(DiffLineSchema),
2054
+ renamedFrom: import_zod27.z.string().optional()
2055
+ });
2056
+ var CommandExecutionSchema = import_zod27.z.object({
2057
+ command: import_zod27.z.string(),
2058
+ exitCode: import_zod27.z.number(),
2059
+ output: import_zod27.z.string().optional(),
2060
+ duration: import_zod27.z.number()
2061
+ });
2062
+ var FileModificationSchema = import_zod27.z.object({
2063
+ path: import_zod27.z.string(),
2064
+ action: import_zod27.z.enum(["created", "modified", "deleted"])
2088
2065
  });
2089
2066
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
2090
2067
  TemplateFileStatus2["NEW"] = "new";
@@ -2092,87 +2069,87 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
2092
2069
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
2093
2070
  return TemplateFileStatus2;
2094
2071
  })(TemplateFileStatus || {});
2095
- var TemplateFileSchema = import_zod28.z.object({
2072
+ var TemplateFileSchema = import_zod27.z.object({
2096
2073
  /** Relative path within the template */
2097
- path: import_zod28.z.string(),
2074
+ path: import_zod27.z.string(),
2098
2075
  /** Full file content after execution */
2099
- content: import_zod28.z.string(),
2076
+ content: import_zod27.z.string(),
2100
2077
  /** File status (new, modified, unchanged) */
2101
- status: import_zod28.z.enum(["new", "modified", "unchanged"])
2102
- });
2103
- var ApiCallSchema = import_zod28.z.object({
2104
- endpoint: import_zod28.z.string(),
2105
- tokensUsed: import_zod28.z.number(),
2106
- duration: import_zod28.z.number()
2107
- });
2108
- var ExecutionTraceSchema = import_zod28.z.object({
2109
- commands: import_zod28.z.array(CommandExecutionSchema),
2110
- filesModified: import_zod28.z.array(FileModificationSchema),
2111
- apiCalls: import_zod28.z.array(ApiCallSchema),
2112
- totalDuration: import_zod28.z.number()
2113
- });
2114
- var FailureAnalysisSchema = import_zod28.z.object({
2115
- category: import_zod28.z.enum(FailureCategory),
2116
- severity: import_zod28.z.enum(FailureSeverity),
2117
- summary: import_zod28.z.string(),
2118
- details: import_zod28.z.string(),
2119
- rootCause: import_zod28.z.string(),
2120
- suggestedFix: import_zod28.z.string(),
2121
- relatedAssertions: import_zod28.z.array(import_zod28.z.string()),
2122
- codeSnippet: import_zod28.z.string().optional(),
2123
- similarIssues: import_zod28.z.array(import_zod28.z.string()).optional(),
2124
- patternId: import_zod28.z.string().optional(),
2078
+ status: import_zod27.z.enum(["new", "modified", "unchanged"])
2079
+ });
2080
+ var ApiCallSchema = import_zod27.z.object({
2081
+ endpoint: import_zod27.z.string(),
2082
+ tokensUsed: import_zod27.z.number(),
2083
+ duration: import_zod27.z.number()
2084
+ });
2085
+ var ExecutionTraceSchema = import_zod27.z.object({
2086
+ commands: import_zod27.z.array(CommandExecutionSchema),
2087
+ filesModified: import_zod27.z.array(FileModificationSchema),
2088
+ apiCalls: import_zod27.z.array(ApiCallSchema),
2089
+ totalDuration: import_zod27.z.number()
2090
+ });
2091
+ var FailureAnalysisSchema = import_zod27.z.object({
2092
+ category: import_zod27.z.enum(FailureCategory),
2093
+ severity: import_zod27.z.enum(FailureSeverity),
2094
+ summary: import_zod27.z.string(),
2095
+ details: import_zod27.z.string(),
2096
+ rootCause: import_zod27.z.string(),
2097
+ suggestedFix: import_zod27.z.string(),
2098
+ relatedAssertions: import_zod27.z.array(import_zod27.z.string()),
2099
+ codeSnippet: import_zod27.z.string().optional(),
2100
+ similarIssues: import_zod27.z.array(import_zod27.z.string()).optional(),
2101
+ patternId: import_zod27.z.string().optional(),
2125
2102
  // Extended fields for detailed debugging
2126
2103
  diff: DiffContentSchema.optional(),
2127
2104
  executionTrace: ExecutionTraceSchema.optional()
2128
2105
  });
2129
2106
  var EvalRunSchema = TenantEntitySchema.extend({
2130
2107
  /** Agent ID for this run */
2131
- agentId: import_zod28.z.string().optional(),
2108
+ agentId: import_zod27.z.string().optional(),
2132
2109
  /** Preset ID that originated this run (optional) */
2133
- presetId: import_zod28.z.string().optional(),
2110
+ presetId: import_zod27.z.string().optional(),
2134
2111
  /** Skill IDs for this run */
2135
- skillIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2112
+ skillIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2136
2113
  /** Map of skillId to skillVersionId for this run */
2137
- skillVersions: import_zod28.z.record(import_zod28.z.string(), import_zod28.z.string()).optional(),
2114
+ skillVersions: import_zod27.z.record(import_zod27.z.string(), import_zod27.z.string()).optional(),
2138
2115
  /** Scenario IDs to run (always present — resolved server-side from tags when needed) */
2139
- scenarioIds: import_zod28.z.array(import_zod28.z.string()),
2116
+ scenarioIds: import_zod27.z.array(import_zod27.z.string()),
2140
2117
  /** Current status */
2141
2118
  status: EvalStatusSchema,
2142
2119
  /** Progress percentage (0-100) */
2143
- progress: import_zod28.z.number(),
2120
+ progress: import_zod27.z.number(),
2144
2121
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
2145
- results: import_zod28.z.array(import_zod28.z.lazy(() => EvalRunResultSchema)),
2122
+ results: import_zod27.z.array(import_zod27.z.lazy(() => EvalRunResultSchema)),
2146
2123
  /** Aggregated metrics across all results */
2147
2124
  aggregateMetrics: EvalMetricsSchema,
2148
2125
  /** Failure analyses */
2149
- failureAnalyses: import_zod28.z.array(FailureAnalysisSchema).optional(),
2126
+ failureAnalyses: import_zod27.z.array(FailureAnalysisSchema).optional(),
2150
2127
  /** Aggregated LLM trace summary */
2151
2128
  llmTraceSummary: LLMTraceSummarySchema.optional(),
2152
2129
  /** What triggered this run */
2153
2130
  trigger: TriggerSchema.optional(),
2154
2131
  /** When the run started (set when evaluation is triggered) */
2155
- startedAt: import_zod28.z.string().optional(),
2132
+ startedAt: import_zod27.z.string().optional(),
2156
2133
  /** When the run completed */
2157
- completedAt: import_zod28.z.string().optional(),
2134
+ completedAt: import_zod27.z.string().optional(),
2158
2135
  /** Live trace events captured during execution (for playback on results page) */
2159
- liveTraceEvents: import_zod28.z.array(LiveTraceEventSchema).optional(),
2136
+ liveTraceEvents: import_zod27.z.array(LiveTraceEventSchema).optional(),
2160
2137
  /** Remote job ID for tracking execution in Dev Machines */
2161
- jobId: import_zod28.z.string().optional(),
2138
+ jobId: import_zod27.z.string().optional(),
2162
2139
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
2163
- jobStatus: import_zod28.z.string().optional(),
2140
+ jobStatus: import_zod27.z.string().optional(),
2164
2141
  /** Remote job error message if the job failed */
2165
- jobError: import_zod28.z.string().optional(),
2142
+ jobError: import_zod27.z.string().optional(),
2166
2143
  /** Timestamp of the last job status check */
2167
- jobStatusCheckedAt: import_zod28.z.string().optional(),
2144
+ jobStatusCheckedAt: import_zod27.z.string().optional(),
2168
2145
  /** MCP server IDs to enable for this run (optional) */
2169
- mcpIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2146
+ mcpIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2170
2147
  /** Sub-agent IDs to enable for this run (optional) */
2171
- subAgentIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2148
+ subAgentIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2172
2149
  /** Rule IDs to enable for this run (optional) */
2173
- ruleIds: import_zod28.z.array(import_zod28.z.string()).optional(),
2150
+ ruleIds: import_zod27.z.array(import_zod27.z.string()).optional(),
2174
2151
  /** Tags used to select scenarios for this run (for traceability) */
2175
- tags: import_zod28.z.array(import_zod28.z.string()).optional()
2152
+ tags: import_zod27.z.array(import_zod27.z.string()).optional()
2176
2153
  });
2177
2154
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
2178
2155
  id: true,
@@ -2187,60 +2164,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
2187
2164
  scenarioIds: true
2188
2165
  }).extend({
2189
2166
  /** Optional on input — backend resolves from tags when not provided */
2190
- scenarioIds: import_zod28.z.array(import_zod28.z.string()).optional()
2167
+ scenarioIds: import_zod27.z.array(import_zod27.z.string()).optional()
2191
2168
  }).refine(
2192
2169
  (data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
2193
2170
  { message: "Either scenarioIds or tags must be provided" }
2194
2171
  );
2195
- var EvaluationProgressSchema = import_zod28.z.object({
2196
- runId: import_zod28.z.string(),
2197
- targetId: import_zod28.z.string(),
2198
- totalScenarios: import_zod28.z.number(),
2199
- completedScenarios: import_zod28.z.number(),
2200
- scenarioProgress: import_zod28.z.array(
2201
- import_zod28.z.object({
2202
- scenarioId: import_zod28.z.string(),
2203
- currentStep: import_zod28.z.string(),
2204
- error: import_zod28.z.string().optional()
2172
+ var EvaluationProgressSchema = import_zod27.z.object({
2173
+ runId: import_zod27.z.string(),
2174
+ targetId: import_zod27.z.string(),
2175
+ totalScenarios: import_zod27.z.number(),
2176
+ completedScenarios: import_zod27.z.number(),
2177
+ scenarioProgress: import_zod27.z.array(
2178
+ import_zod27.z.object({
2179
+ scenarioId: import_zod27.z.string(),
2180
+ currentStep: import_zod27.z.string(),
2181
+ error: import_zod27.z.string().optional()
2205
2182
  })
2206
2183
  ),
2207
- createdAt: import_zod28.z.number()
2208
- });
2209
- var EvaluationLogSchema = import_zod28.z.object({
2210
- runId: import_zod28.z.string(),
2211
- scenarioId: import_zod28.z.string(),
2212
- log: import_zod28.z.object({
2213
- level: import_zod28.z.enum(["info", "error", "debug"]),
2214
- message: import_zod28.z.string().optional(),
2215
- args: import_zod28.z.array(import_zod28.z.any()).optional(),
2216
- error: import_zod28.z.string().optional()
2184
+ createdAt: import_zod27.z.number()
2185
+ });
2186
+ var EvaluationLogSchema = import_zod27.z.object({
2187
+ runId: import_zod27.z.string(),
2188
+ scenarioId: import_zod27.z.string(),
2189
+ log: import_zod27.z.object({
2190
+ level: import_zod27.z.enum(["info", "error", "debug"]),
2191
+ message: import_zod27.z.string().optional(),
2192
+ args: import_zod27.z.array(import_zod27.z.any()).optional(),
2193
+ error: import_zod27.z.string().optional()
2217
2194
  })
2218
2195
  });
2219
2196
  var LLM_TIMEOUT = 12e4;
2220
2197
 
2221
2198
  // src/evaluation/conversation.ts
2222
- var import_zod29 = require("zod");
2223
- var TextBlockSchema = import_zod29.z.object({
2224
- type: import_zod29.z.literal("text"),
2225
- text: import_zod29.z.string()
2226
- });
2227
- var ThinkingBlockSchema = import_zod29.z.object({
2228
- type: import_zod29.z.literal("thinking"),
2229
- thinking: import_zod29.z.string()
2230
- });
2231
- var ToolUseBlockSchema = import_zod29.z.object({
2232
- type: import_zod29.z.literal("tool_use"),
2233
- toolName: import_zod29.z.string(),
2234
- toolId: import_zod29.z.string(),
2235
- input: import_zod29.z.unknown()
2236
- });
2237
- var ToolResultBlockSchema = import_zod29.z.object({
2238
- type: import_zod29.z.literal("tool_result"),
2239
- toolUseId: import_zod29.z.string(),
2240
- content: import_zod29.z.string(),
2241
- isError: import_zod29.z.boolean().optional()
2242
- });
2243
- var ConversationBlockSchema = import_zod29.z.discriminatedUnion("type", [
2199
+ var import_zod28 = require("zod");
2200
+ var TextBlockSchema = import_zod28.z.object({
2201
+ type: import_zod28.z.literal("text"),
2202
+ text: import_zod28.z.string()
2203
+ });
2204
+ var ThinkingBlockSchema = import_zod28.z.object({
2205
+ type: import_zod28.z.literal("thinking"),
2206
+ thinking: import_zod28.z.string()
2207
+ });
2208
+ var ToolUseBlockSchema = import_zod28.z.object({
2209
+ type: import_zod28.z.literal("tool_use"),
2210
+ toolName: import_zod28.z.string(),
2211
+ toolId: import_zod28.z.string(),
2212
+ input: import_zod28.z.unknown()
2213
+ });
2214
+ var ToolResultBlockSchema = import_zod28.z.object({
2215
+ type: import_zod28.z.literal("tool_result"),
2216
+ toolUseId: import_zod28.z.string(),
2217
+ content: import_zod28.z.string(),
2218
+ isError: import_zod28.z.boolean().optional()
2219
+ });
2220
+ var ConversationBlockSchema = import_zod28.z.discriminatedUnion("type", [
2244
2221
  TextBlockSchema,
2245
2222
  ThinkingBlockSchema,
2246
2223
  ToolUseBlockSchema,
@@ -2251,18 +2228,18 @@ var ConversationMessageRoles = [
2251
2228
  "user",
2252
2229
  "system"
2253
2230
  ];
2254
- var ConversationMessageSchema = import_zod29.z.object({
2255
- role: import_zod29.z.enum(ConversationMessageRoles),
2256
- content: import_zod29.z.array(ConversationBlockSchema),
2257
- timestamp: import_zod29.z.string()
2231
+ var ConversationMessageSchema = import_zod28.z.object({
2232
+ role: import_zod28.z.enum(ConversationMessageRoles),
2233
+ content: import_zod28.z.array(ConversationBlockSchema),
2234
+ timestamp: import_zod28.z.string()
2258
2235
  });
2259
- var ScenarioConversationSchema = import_zod29.z.object({
2260
- id: import_zod29.z.string(),
2261
- projectId: import_zod29.z.string(),
2262
- evalRunId: import_zod29.z.string(),
2263
- resultId: import_zod29.z.string(),
2264
- messages: import_zod29.z.array(ConversationMessageSchema),
2265
- createdAt: import_zod29.z.string()
2236
+ var ScenarioConversationSchema = import_zod28.z.object({
2237
+ id: import_zod28.z.string(),
2238
+ projectId: import_zod28.z.string(),
2239
+ evalRunId: import_zod28.z.string(),
2240
+ resultId: import_zod28.z.string(),
2241
+ messages: import_zod28.z.array(ConversationMessageSchema),
2242
+ createdAt: import_zod28.z.string()
2266
2243
  });
2267
2244
 
2268
2245
  // src/evaluation/eval-result.ts
@@ -2273,100 +2250,100 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
2273
2250
  AssertionResultStatus2["ERROR"] = "error";
2274
2251
  return AssertionResultStatus2;
2275
2252
  })(AssertionResultStatus || {});
2276
- var AssertionResultSchema = import_zod30.z.object({
2277
- id: import_zod30.z.string(),
2278
- assertionId: import_zod30.z.string(),
2279
- assertionType: import_zod30.z.string(),
2280
- assertionName: import_zod30.z.string(),
2281
- status: import_zod30.z.enum(AssertionResultStatus),
2282
- message: import_zod30.z.string().optional(),
2283
- expected: import_zod30.z.string().optional(),
2284
- actual: import_zod30.z.string().optional(),
2285
- duration: import_zod30.z.number().optional(),
2286
- details: import_zod30.z.record(import_zod30.z.string(), import_zod30.z.unknown()).optional(),
2287
- llmTraceSteps: import_zod30.z.array(LLMTraceStepSchema).optional()
2288
- });
2289
- var EvalRunResultSchema = import_zod30.z.object({
2290
- id: import_zod30.z.string(),
2291
- targetId: import_zod30.z.string(),
2292
- targetName: import_zod30.z.string().optional(),
2253
+ var AssertionResultSchema = import_zod29.z.object({
2254
+ id: import_zod29.z.string(),
2255
+ assertionId: import_zod29.z.string(),
2256
+ assertionType: import_zod29.z.string(),
2257
+ assertionName: import_zod29.z.string(),
2258
+ status: import_zod29.z.enum(AssertionResultStatus),
2259
+ message: import_zod29.z.string().optional(),
2260
+ expected: import_zod29.z.string().optional(),
2261
+ actual: import_zod29.z.string().optional(),
2262
+ duration: import_zod29.z.number().optional(),
2263
+ details: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.unknown()).optional(),
2264
+ llmTraceSteps: import_zod29.z.array(LLMTraceStepSchema).optional()
2265
+ });
2266
+ var EvalRunResultSchema = import_zod29.z.object({
2267
+ id: import_zod29.z.string(),
2268
+ targetId: import_zod29.z.string(),
2269
+ targetName: import_zod29.z.string().optional(),
2293
2270
  /** SkillVersion ID used for this evaluation (for version tracking) */
2294
- skillVersionId: import_zod30.z.string().optional(),
2271
+ skillVersionId: import_zod29.z.string().optional(),
2295
2272
  /** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
2296
- skillVersion: import_zod30.z.string().optional(),
2297
- scenarioId: import_zod30.z.string(),
2298
- scenarioName: import_zod30.z.string(),
2273
+ skillVersion: import_zod29.z.string().optional(),
2274
+ scenarioId: import_zod29.z.string(),
2275
+ scenarioName: import_zod29.z.string(),
2299
2276
  modelConfig: ModelConfigSchema.optional(),
2300
- assertionResults: import_zod30.z.array(AssertionResultSchema),
2277
+ assertionResults: import_zod29.z.array(AssertionResultSchema),
2301
2278
  metrics: EvalMetricsSchema.optional(),
2302
- passed: import_zod30.z.number(),
2303
- failed: import_zod30.z.number(),
2304
- passRate: import_zod30.z.number(),
2305
- duration: import_zod30.z.number(),
2306
- outputText: import_zod30.z.string().optional(),
2307
- files: import_zod30.z.array(ExpectedFileSchema).optional(),
2308
- fileDiffs: import_zod30.z.array(DiffContentSchema).optional(),
2279
+ passed: import_zod29.z.number(),
2280
+ failed: import_zod29.z.number(),
2281
+ passRate: import_zod29.z.number(),
2282
+ duration: import_zod29.z.number(),
2283
+ outputText: import_zod29.z.string().optional(),
2284
+ files: import_zod29.z.array(ExpectedFileSchema).optional(),
2285
+ fileDiffs: import_zod29.z.array(DiffContentSchema).optional(),
2309
2286
  /** Full template files after execution with status indicators */
2310
- templateFiles: import_zod30.z.array(TemplateFileSchema).optional(),
2311
- startedAt: import_zod30.z.string().optional(),
2312
- completedAt: import_zod30.z.string().optional(),
2287
+ templateFiles: import_zod29.z.array(TemplateFileSchema).optional(),
2288
+ startedAt: import_zod29.z.string().optional(),
2289
+ completedAt: import_zod29.z.string().optional(),
2313
2290
  llmTrace: LLMTraceSchema.optional(),
2314
2291
  /** Full conversation messages (only present in transit; stripped before DB storage) */
2315
- conversation: import_zod30.z.array(ConversationMessageSchema).optional()
2316
- });
2317
- var PromptResultSchema = import_zod30.z.object({
2318
- text: import_zod30.z.string(),
2319
- files: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2320
- finishReason: import_zod30.z.string().optional(),
2321
- reasoning: import_zod30.z.string().optional(),
2322
- reasoningDetails: import_zod30.z.unknown().optional(),
2323
- toolCalls: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2324
- toolResults: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2325
- warnings: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2326
- sources: import_zod30.z.array(import_zod30.z.unknown()).optional(),
2327
- steps: import_zod30.z.array(import_zod30.z.unknown()),
2328
- generationTimeMs: import_zod30.z.number(),
2329
- prompt: import_zod30.z.string(),
2330
- systemPrompt: import_zod30.z.string(),
2331
- usage: import_zod30.z.object({
2332
- totalTokens: import_zod30.z.number().optional(),
2333
- totalMicrocentsSpent: import_zod30.z.number().optional()
2292
+ conversation: import_zod29.z.array(ConversationMessageSchema).optional()
2293
+ });
2294
+ var PromptResultSchema = import_zod29.z.object({
2295
+ text: import_zod29.z.string(),
2296
+ files: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2297
+ finishReason: import_zod29.z.string().optional(),
2298
+ reasoning: import_zod29.z.string().optional(),
2299
+ reasoningDetails: import_zod29.z.unknown().optional(),
2300
+ toolCalls: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2301
+ toolResults: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2302
+ warnings: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2303
+ sources: import_zod29.z.array(import_zod29.z.unknown()).optional(),
2304
+ steps: import_zod29.z.array(import_zod29.z.unknown()),
2305
+ generationTimeMs: import_zod29.z.number(),
2306
+ prompt: import_zod29.z.string(),
2307
+ systemPrompt: import_zod29.z.string(),
2308
+ usage: import_zod29.z.object({
2309
+ totalTokens: import_zod29.z.number().optional(),
2310
+ totalMicrocentsSpent: import_zod29.z.number().optional()
2334
2311
  })
2335
2312
  });
2336
- var EvaluationResultSchema = import_zod30.z.object({
2337
- id: import_zod30.z.string(),
2338
- runId: import_zod30.z.string(),
2339
- timestamp: import_zod30.z.number(),
2313
+ var EvaluationResultSchema = import_zod29.z.object({
2314
+ id: import_zod29.z.string(),
2315
+ runId: import_zod29.z.string(),
2316
+ timestamp: import_zod29.z.number(),
2340
2317
  promptResult: PromptResultSchema,
2341
- testResults: import_zod30.z.array(import_zod30.z.unknown()),
2342
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
2343
- feedback: import_zod30.z.string().optional(),
2344
- score: import_zod30.z.number(),
2345
- suiteId: import_zod30.z.string().optional()
2346
- });
2347
- var LeanEvaluationResultSchema = import_zod30.z.object({
2348
- id: import_zod30.z.string(),
2349
- runId: import_zod30.z.string(),
2350
- timestamp: import_zod30.z.number(),
2351
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
2352
- scenarioId: import_zod30.z.string(),
2353
- scenarioVersion: import_zod30.z.number().optional(),
2354
- targetId: import_zod30.z.string(),
2355
- targetVersion: import_zod30.z.number().optional(),
2356
- suiteId: import_zod30.z.string().optional(),
2357
- score: import_zod30.z.number(),
2358
- time: import_zod30.z.number().optional(),
2359
- microcentsSpent: import_zod30.z.number().optional()
2318
+ testResults: import_zod29.z.array(import_zod29.z.unknown()),
2319
+ tags: import_zod29.z.array(import_zod29.z.string()).optional(),
2320
+ feedback: import_zod29.z.string().optional(),
2321
+ score: import_zod29.z.number(),
2322
+ suiteId: import_zod29.z.string().optional()
2323
+ });
2324
+ var LeanEvaluationResultSchema = import_zod29.z.object({
2325
+ id: import_zod29.z.string(),
2326
+ runId: import_zod29.z.string(),
2327
+ timestamp: import_zod29.z.number(),
2328
+ tags: import_zod29.z.array(import_zod29.z.string()).optional(),
2329
+ scenarioId: import_zod29.z.string(),
2330
+ scenarioVersion: import_zod29.z.number().optional(),
2331
+ targetId: import_zod29.z.string(),
2332
+ targetVersion: import_zod29.z.number().optional(),
2333
+ suiteId: import_zod29.z.string().optional(),
2334
+ score: import_zod29.z.number(),
2335
+ time: import_zod29.z.number().optional(),
2336
+ microcentsSpent: import_zod29.z.number().optional()
2360
2337
  });
2361
2338
 
2362
2339
  // src/project/project.ts
2363
- var import_zod31 = require("zod");
2340
+ var import_zod30 = require("zod");
2364
2341
  var ProjectSchema = BaseEntitySchema.extend({
2365
- appId: import_zod31.z.string().optional().describe("The ID of the app in Dev Center"),
2366
- appSecret: import_zod31.z.string().optional().describe("The secret of the app in Dev Center"),
2367
- useWixAuth: import_zod31.z.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
2368
- useBase44Auth: import_zod31.z.boolean().optional().describe("Enable Base44 auth for evaluations"),
2369
- scenarioTags: import_zod31.z.array(import_zod31.z.string()).optional().describe("Project-level tag vocabulary for scenarios")
2342
+ appId: import_zod30.z.string().optional().describe("The ID of the app in Dev Center"),
2343
+ appSecret: import_zod30.z.string().optional().describe("The secret of the app in Dev Center"),
2344
+ useWixAuth: import_zod30.z.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
2345
+ useBase44Auth: import_zod30.z.boolean().optional().describe("Enable Base44 auth for evaluations"),
2346
+ scenarioTags: import_zod30.z.array(import_zod30.z.string()).optional().describe("Project-level tag vocabulary for scenarios")
2370
2347
  });
2371
2348
  var CreateProjectInputSchema = ProjectSchema.omit({
2372
2349
  id: true,
@@ -2419,7 +2396,7 @@ var SYSTEM_ASSERTIONS = {
2419
2396
  [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
2420
2397
  id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
2421
2398
  name: "Tool Called With Param",
2422
- description: "Check that a tool was called with expected parameters",
2399
+ description: "Check that a tool was called with expected parameters (tool name is substring matched)",
2423
2400
  type: "tool_called_with_param",
2424
2401
  parameters: [
2425
2402
  {
@@ -2433,6 +2410,14 @@ var SYSTEM_ASSERTIONS = {
2433
2410
  label: "Expected Parameters (JSON, substring match)",
2434
2411
  type: "string",
2435
2412
  required: true
2413
+ },
2414
+ {
2415
+ name: "requireSuccess",
2416
+ label: "Require Successful Call",
2417
+ type: "boolean",
2418
+ required: false,
2419
+ defaultValue: false,
2420
+ advanced: true
2436
2421
  }
2437
2422
  ]
2438
2423
  },