@wix/evalforge-types 0.53.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +376 -391
- package/build/index.js.map +4 -4
- package/build/index.mjs +376 -391
- package/build/index.mjs.map +4 -4
- package/build/types/assertion/assertion.d.ts +106 -4
- package/build/types/scenario/index.d.ts +0 -1
- package/build/types/scenario/test-scenario.d.ts +69 -30
- package/package.json +2 -2
- package/build/types/scenario/assertions.d.ts +0 -98
package/build/index.js
CHANGED
|
@@ -1609,85 +1609,34 @@ var TestSchema = import_zod20.z.discriminatedUnion("type", [
|
|
|
1609
1609
|
PlaywrightNLTestSchema
|
|
1610
1610
|
]);
|
|
1611
1611
|
|
|
1612
|
-
// src/scenario/assertions.ts
|
|
1613
|
-
var import_zod21 = require("zod");
|
|
1614
|
-
var SkillWasCalledAssertionSchema = import_zod21.z.object({
|
|
1615
|
-
type: import_zod21.z.literal("skill_was_called"),
|
|
1616
|
-
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
1617
|
-
skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
|
|
1618
|
-
});
|
|
1619
|
-
var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
|
|
1620
|
-
type: import_zod21.z.literal("tool_called_with_param"),
|
|
1621
|
-
/** Name of the tool that must have been called */
|
|
1622
|
-
toolName: import_zod21.z.string().min(1),
|
|
1623
|
-
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1624
|
-
expectedParams: import_zod21.z.string().min(1)
|
|
1625
|
-
});
|
|
1626
|
-
var BuildPassedAssertionSchema = import_zod21.z.object({
|
|
1627
|
-
type: import_zod21.z.literal("build_passed"),
|
|
1628
|
-
/** Command to run (default: "yarn build") */
|
|
1629
|
-
command: import_zod21.z.string().optional(),
|
|
1630
|
-
/** Expected exit code (default: 0) */
|
|
1631
|
-
expectedExitCode: import_zod21.z.number().int().optional()
|
|
1632
|
-
});
|
|
1633
|
-
var CostAssertionSchema = import_zod21.z.object({
|
|
1634
|
-
type: import_zod21.z.literal("cost"),
|
|
1635
|
-
/** Maximum allowed cost in USD */
|
|
1636
|
-
maxCostUsd: import_zod21.z.number().positive()
|
|
1637
|
-
});
|
|
1638
|
-
var LlmJudgeAssertionSchema = import_zod21.z.object({
|
|
1639
|
-
type: import_zod21.z.literal("llm_judge"),
|
|
1640
|
-
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1641
|
-
prompt: import_zod21.z.string(),
|
|
1642
|
-
/** Minimum score to pass (0-10, default 7) */
|
|
1643
|
-
minScore: import_zod21.z.number().int().min(0).max(10).optional(),
|
|
1644
|
-
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
1645
|
-
model: import_zod21.z.string().optional(),
|
|
1646
|
-
maxTokens: import_zod21.z.number().int().optional(),
|
|
1647
|
-
temperature: import_zod21.z.number().min(0).max(1).optional()
|
|
1648
|
-
});
|
|
1649
|
-
var TimeAssertionSchema = import_zod21.z.object({
|
|
1650
|
-
type: import_zod21.z.literal("time_limit"),
|
|
1651
|
-
/** Maximum allowed duration in milliseconds */
|
|
1652
|
-
maxDurationMs: import_zod21.z.number().int().positive()
|
|
1653
|
-
});
|
|
1654
|
-
var AssertionSchema = import_zod21.z.union([
|
|
1655
|
-
SkillWasCalledAssertionSchema,
|
|
1656
|
-
ToolCalledWithParamAssertionSchema,
|
|
1657
|
-
BuildPassedAssertionSchema,
|
|
1658
|
-
TimeAssertionSchema,
|
|
1659
|
-
CostAssertionSchema,
|
|
1660
|
-
LlmJudgeAssertionSchema
|
|
1661
|
-
]);
|
|
1662
|
-
|
|
1663
1612
|
// src/scenario/environment.ts
|
|
1664
|
-
var
|
|
1665
|
-
var LocalProjectConfigSchema =
|
|
1613
|
+
var import_zod21 = require("zod");
|
|
1614
|
+
var LocalProjectConfigSchema = import_zod21.z.object({
|
|
1666
1615
|
/** Template ID to use for the local project */
|
|
1667
|
-
templateId:
|
|
1616
|
+
templateId: import_zod21.z.string().optional(),
|
|
1668
1617
|
/** Files to create in the project */
|
|
1669
|
-
files:
|
|
1670
|
-
|
|
1671
|
-
path:
|
|
1672
|
-
content:
|
|
1618
|
+
files: import_zod21.z.array(
|
|
1619
|
+
import_zod21.z.object({
|
|
1620
|
+
path: import_zod21.z.string().min(1),
|
|
1621
|
+
content: import_zod21.z.string().min(1)
|
|
1673
1622
|
})
|
|
1674
1623
|
).optional()
|
|
1675
1624
|
});
|
|
1676
|
-
var MetaSiteConfigSchema =
|
|
1677
|
-
configurations:
|
|
1678
|
-
|
|
1679
|
-
name:
|
|
1680
|
-
apiCalls:
|
|
1681
|
-
|
|
1682
|
-
url:
|
|
1683
|
-
method:
|
|
1684
|
-
body:
|
|
1625
|
+
var MetaSiteConfigSchema = import_zod21.z.object({
|
|
1626
|
+
configurations: import_zod21.z.array(
|
|
1627
|
+
import_zod21.z.object({
|
|
1628
|
+
name: import_zod21.z.string().min(1),
|
|
1629
|
+
apiCalls: import_zod21.z.array(
|
|
1630
|
+
import_zod21.z.object({
|
|
1631
|
+
url: import_zod21.z.string().url(),
|
|
1632
|
+
method: import_zod21.z.enum(["POST", "PUT"]),
|
|
1633
|
+
body: import_zod21.z.string()
|
|
1685
1634
|
})
|
|
1686
1635
|
)
|
|
1687
1636
|
})
|
|
1688
1637
|
).optional()
|
|
1689
1638
|
});
|
|
1690
|
-
var EnvironmentSchema =
|
|
1639
|
+
var EnvironmentSchema = import_zod21.z.object({
|
|
1691
1640
|
/** Local project configuration */
|
|
1692
1641
|
localProject: LocalProjectConfigSchema.optional(),
|
|
1693
1642
|
/** Meta site configuration */
|
|
@@ -1695,11 +1644,11 @@ var EnvironmentSchema = import_zod22.z.object({
|
|
|
1695
1644
|
});
|
|
1696
1645
|
|
|
1697
1646
|
// src/scenario/test-scenario.ts
|
|
1698
|
-
var
|
|
1647
|
+
var import_zod23 = require("zod");
|
|
1699
1648
|
|
|
1700
1649
|
// src/assertion/assertion.ts
|
|
1701
|
-
var
|
|
1702
|
-
var AssertionTypeSchema =
|
|
1650
|
+
var import_zod22 = require("zod");
|
|
1651
|
+
var AssertionTypeSchema = import_zod22.z.enum([
|
|
1703
1652
|
"skill_was_called",
|
|
1704
1653
|
"tool_called_with_param",
|
|
1705
1654
|
"build_passed",
|
|
@@ -1707,59 +1656,61 @@ var AssertionTypeSchema = import_zod23.z.enum([
|
|
|
1707
1656
|
"cost",
|
|
1708
1657
|
"llm_judge"
|
|
1709
1658
|
]);
|
|
1710
|
-
var AssertionParameterTypeSchema =
|
|
1659
|
+
var AssertionParameterTypeSchema = import_zod22.z.enum([
|
|
1711
1660
|
"string",
|
|
1712
1661
|
"number",
|
|
1713
1662
|
"boolean"
|
|
1714
1663
|
]);
|
|
1715
|
-
var AssertionParameterSchema =
|
|
1664
|
+
var AssertionParameterSchema = import_zod22.z.object({
|
|
1716
1665
|
/** Parameter name (used as key in params object) */
|
|
1717
|
-
name:
|
|
1666
|
+
name: import_zod22.z.string().min(1),
|
|
1718
1667
|
/** Display label for the parameter */
|
|
1719
|
-
label:
|
|
1668
|
+
label: import_zod22.z.string().min(1),
|
|
1720
1669
|
/** Parameter type */
|
|
1721
1670
|
type: AssertionParameterTypeSchema,
|
|
1722
1671
|
/** Whether this parameter is required */
|
|
1723
|
-
required:
|
|
1672
|
+
required: import_zod22.z.boolean(),
|
|
1724
1673
|
/** Default value (optional, used when not provided) */
|
|
1725
|
-
defaultValue:
|
|
1674
|
+
defaultValue: import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean()]).optional(),
|
|
1726
1675
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
1727
|
-
advanced:
|
|
1676
|
+
advanced: import_zod22.z.boolean().optional()
|
|
1728
1677
|
});
|
|
1729
|
-
var ScenarioAssertionLinkSchema =
|
|
1678
|
+
var ScenarioAssertionLinkSchema = import_zod22.z.object({
|
|
1730
1679
|
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
1731
|
-
assertionId:
|
|
1680
|
+
assertionId: import_zod22.z.string(),
|
|
1732
1681
|
/** Parameter values for this assertion in this scenario */
|
|
1733
|
-
params:
|
|
1734
|
-
|
|
1735
|
-
|
|
1682
|
+
params: import_zod22.z.record(
|
|
1683
|
+
import_zod22.z.string(),
|
|
1684
|
+
import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean(), import_zod22.z.null()])
|
|
1736
1685
|
).optional()
|
|
1737
1686
|
});
|
|
1738
|
-
var SkillWasCalledConfigSchema =
|
|
1687
|
+
var SkillWasCalledConfigSchema = import_zod22.z.object({
|
|
1739
1688
|
/** Names of the skills that must have been called */
|
|
1740
|
-
skillNames:
|
|
1689
|
+
skillNames: import_zod22.z.array(import_zod22.z.string().min(1)).min(1)
|
|
1741
1690
|
});
|
|
1742
|
-
var CostConfigSchema =
|
|
1691
|
+
var CostConfigSchema = import_zod22.z.strictObject({
|
|
1743
1692
|
/** Maximum allowed cost in USD */
|
|
1744
|
-
maxCostUsd:
|
|
1693
|
+
maxCostUsd: import_zod22.z.number().positive()
|
|
1745
1694
|
});
|
|
1746
|
-
var ToolCalledWithParamConfigSchema =
|
|
1695
|
+
var ToolCalledWithParamConfigSchema = import_zod22.z.strictObject({
|
|
1747
1696
|
/** Name of the tool that must have been called */
|
|
1748
|
-
toolName:
|
|
1697
|
+
toolName: import_zod22.z.string().min(1),
|
|
1749
1698
|
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1750
|
-
expectedParams:
|
|
1699
|
+
expectedParams: import_zod22.z.string().min(1),
|
|
1700
|
+
/** If true, the matching tool call must also have succeeded (step.success === true) */
|
|
1701
|
+
requireSuccess: import_zod22.z.boolean().optional()
|
|
1751
1702
|
});
|
|
1752
|
-
var BuildPassedConfigSchema =
|
|
1703
|
+
var BuildPassedConfigSchema = import_zod22.z.strictObject({
|
|
1753
1704
|
/** Command to run (default: "yarn build") */
|
|
1754
|
-
command:
|
|
1705
|
+
command: import_zod22.z.string().optional(),
|
|
1755
1706
|
/** Expected exit code (default: 0) */
|
|
1756
|
-
expectedExitCode:
|
|
1707
|
+
expectedExitCode: import_zod22.z.number().int().optional()
|
|
1757
1708
|
});
|
|
1758
|
-
var TimeConfigSchema =
|
|
1709
|
+
var TimeConfigSchema = import_zod22.z.strictObject({
|
|
1759
1710
|
/** Maximum allowed duration in milliseconds */
|
|
1760
|
-
maxDurationMs:
|
|
1711
|
+
maxDurationMs: import_zod22.z.number().int().positive()
|
|
1761
1712
|
});
|
|
1762
|
-
var LlmJudgeConfigSchema =
|
|
1713
|
+
var LlmJudgeConfigSchema = import_zod22.z.object({
|
|
1763
1714
|
/**
|
|
1764
1715
|
* Prompt template with placeholders:
|
|
1765
1716
|
* - {{output}}: agent's final output
|
|
@@ -1770,19 +1721,45 @@ var LlmJudgeConfigSchema = import_zod23.z.object({
|
|
|
1770
1721
|
* - {{trace}}: step-by-step trace of tool calls
|
|
1771
1722
|
* - Custom parameters defined in the parameters array
|
|
1772
1723
|
*/
|
|
1773
|
-
prompt:
|
|
1724
|
+
prompt: import_zod22.z.string().min(1),
|
|
1774
1725
|
/** Minimum score to pass (0-10, default 7) */
|
|
1775
|
-
minScore:
|
|
1726
|
+
minScore: import_zod22.z.number().int().min(0).max(10).optional(),
|
|
1776
1727
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
1777
|
-
model:
|
|
1728
|
+
model: import_zod22.z.string().optional(),
|
|
1778
1729
|
/** Max output tokens */
|
|
1779
|
-
maxTokens:
|
|
1730
|
+
maxTokens: import_zod22.z.number().int().optional(),
|
|
1780
1731
|
/** Temperature (0-1) */
|
|
1781
|
-
temperature:
|
|
1732
|
+
temperature: import_zod22.z.number().min(0).max(1).optional(),
|
|
1782
1733
|
/** User-defined parameters for this assertion */
|
|
1783
|
-
parameters:
|
|
1734
|
+
parameters: import_zod22.z.array(AssertionParameterSchema).optional()
|
|
1735
|
+
});
|
|
1736
|
+
var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
|
|
1737
|
+
type: import_zod22.z.literal("skill_was_called")
|
|
1738
|
+
});
|
|
1739
|
+
var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
|
|
1740
|
+
type: import_zod22.z.literal("tool_called_with_param")
|
|
1741
|
+
});
|
|
1742
|
+
var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
|
|
1743
|
+
type: import_zod22.z.literal("build_passed")
|
|
1744
|
+
});
|
|
1745
|
+
var CostAssertionSchema = CostConfigSchema.extend({
|
|
1746
|
+
type: import_zod22.z.literal("cost")
|
|
1747
|
+
});
|
|
1748
|
+
var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
|
|
1749
|
+
type: import_zod22.z.literal("llm_judge")
|
|
1784
1750
|
});
|
|
1785
|
-
var
|
|
1751
|
+
var TimeAssertionSchema = TimeConfigSchema.extend({
|
|
1752
|
+
type: import_zod22.z.literal("time_limit")
|
|
1753
|
+
});
|
|
1754
|
+
var AssertionSchema = import_zod22.z.union([
|
|
1755
|
+
SkillWasCalledAssertionSchema,
|
|
1756
|
+
ToolCalledWithParamAssertionSchema,
|
|
1757
|
+
BuildPassedAssertionSchema,
|
|
1758
|
+
TimeAssertionSchema,
|
|
1759
|
+
CostAssertionSchema,
|
|
1760
|
+
LlmJudgeAssertionSchema
|
|
1761
|
+
]);
|
|
1762
|
+
var AssertionConfigSchema = import_zod22.z.union([
|
|
1786
1763
|
LlmJudgeConfigSchema,
|
|
1787
1764
|
// requires prompt - check first
|
|
1788
1765
|
SkillWasCalledConfigSchema,
|
|
@@ -1795,7 +1772,7 @@ var AssertionConfigSchema = import_zod23.z.union([
|
|
|
1795
1772
|
// requires maxCostUsd, uses strictObject
|
|
1796
1773
|
BuildPassedConfigSchema,
|
|
1797
1774
|
// all optional, uses strictObject to reject unknown keys
|
|
1798
|
-
|
|
1775
|
+
import_zod22.z.object({})
|
|
1799
1776
|
// fallback empty config
|
|
1800
1777
|
]);
|
|
1801
1778
|
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
@@ -1846,25 +1823,25 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1846
1823
|
}
|
|
1847
1824
|
|
|
1848
1825
|
// src/scenario/test-scenario.ts
|
|
1849
|
-
var ExpectedFileSchema =
|
|
1826
|
+
var ExpectedFileSchema = import_zod23.z.object({
|
|
1850
1827
|
/** Relative path where the file should be created */
|
|
1851
|
-
path:
|
|
1828
|
+
path: import_zod23.z.string(),
|
|
1852
1829
|
/** Optional expected content */
|
|
1853
|
-
content:
|
|
1830
|
+
content: import_zod23.z.string().optional()
|
|
1854
1831
|
});
|
|
1855
1832
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
1856
1833
|
/** The prompt sent to the agent to trigger the task */
|
|
1857
|
-
triggerPrompt:
|
|
1834
|
+
triggerPrompt: import_zod23.z.string().min(10),
|
|
1858
1835
|
/** ID of the template to use for this scenario (null = no template) */
|
|
1859
|
-
templateId:
|
|
1836
|
+
templateId: import_zod23.z.string().nullish(),
|
|
1860
1837
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
1861
|
-
assertions:
|
|
1838
|
+
assertions: import_zod23.z.array(AssertionSchema).optional(),
|
|
1862
1839
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
1863
|
-
assertionIds:
|
|
1840
|
+
assertionIds: import_zod23.z.array(import_zod23.z.string()).optional(),
|
|
1864
1841
|
/** Linked assertions with per-scenario parameter values */
|
|
1865
|
-
assertionLinks:
|
|
1842
|
+
assertionLinks: import_zod23.z.array(ScenarioAssertionLinkSchema).optional(),
|
|
1866
1843
|
/** Tags for categorisation and filtering */
|
|
1867
|
-
tags:
|
|
1844
|
+
tags: import_zod23.z.array(import_zod23.z.string()).optional()
|
|
1868
1845
|
});
|
|
1869
1846
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
1870
1847
|
id: true,
|
|
@@ -1875,10 +1852,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
1875
1852
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
1876
1853
|
|
|
1877
1854
|
// src/suite/test-suite.ts
|
|
1878
|
-
var
|
|
1855
|
+
var import_zod24 = require("zod");
|
|
1879
1856
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
1880
1857
|
/** IDs of test scenarios in this suite */
|
|
1881
|
-
scenarioIds:
|
|
1858
|
+
scenarioIds: import_zod24.z.array(import_zod24.z.string())
|
|
1882
1859
|
});
|
|
1883
1860
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
1884
1861
|
id: true,
|
|
@@ -1889,21 +1866,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
1889
1866
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
1890
1867
|
|
|
1891
1868
|
// src/evaluation/metrics.ts
|
|
1892
|
-
var
|
|
1893
|
-
var TokenUsageSchema =
|
|
1894
|
-
prompt:
|
|
1895
|
-
completion:
|
|
1896
|
-
total:
|
|
1897
|
-
});
|
|
1898
|
-
var EvalMetricsSchema =
|
|
1899
|
-
totalAssertions:
|
|
1900
|
-
passed:
|
|
1901
|
-
failed:
|
|
1902
|
-
skipped:
|
|
1903
|
-
errors:
|
|
1904
|
-
passRate:
|
|
1905
|
-
avgDuration:
|
|
1906
|
-
totalDuration:
|
|
1869
|
+
var import_zod25 = require("zod");
|
|
1870
|
+
var TokenUsageSchema = import_zod25.z.object({
|
|
1871
|
+
prompt: import_zod25.z.number(),
|
|
1872
|
+
completion: import_zod25.z.number(),
|
|
1873
|
+
total: import_zod25.z.number()
|
|
1874
|
+
});
|
|
1875
|
+
var EvalMetricsSchema = import_zod25.z.object({
|
|
1876
|
+
totalAssertions: import_zod25.z.number(),
|
|
1877
|
+
passed: import_zod25.z.number(),
|
|
1878
|
+
failed: import_zod25.z.number(),
|
|
1879
|
+
skipped: import_zod25.z.number(),
|
|
1880
|
+
errors: import_zod25.z.number(),
|
|
1881
|
+
passRate: import_zod25.z.number(),
|
|
1882
|
+
avgDuration: import_zod25.z.number(),
|
|
1883
|
+
totalDuration: import_zod25.z.number()
|
|
1907
1884
|
});
|
|
1908
1885
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
1909
1886
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -1913,7 +1890,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
1913
1890
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
1914
1891
|
return EvalStatus2;
|
|
1915
1892
|
})(EvalStatus || {});
|
|
1916
|
-
var EvalStatusSchema =
|
|
1893
|
+
var EvalStatusSchema = import_zod25.z.enum(EvalStatus);
|
|
1917
1894
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
1918
1895
|
LLMStepType2["COMPLETION"] = "completion";
|
|
1919
1896
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -1921,52 +1898,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
1921
1898
|
LLMStepType2["THINKING"] = "thinking";
|
|
1922
1899
|
return LLMStepType2;
|
|
1923
1900
|
})(LLMStepType || {});
|
|
1924
|
-
var LLMTraceStepSchema =
|
|
1925
|
-
id:
|
|
1926
|
-
stepNumber:
|
|
1927
|
-
type:
|
|
1928
|
-
model:
|
|
1929
|
-
provider:
|
|
1930
|
-
startedAt:
|
|
1931
|
-
durationMs:
|
|
1901
|
+
var LLMTraceStepSchema = import_zod25.z.object({
|
|
1902
|
+
id: import_zod25.z.string(),
|
|
1903
|
+
stepNumber: import_zod25.z.number(),
|
|
1904
|
+
type: import_zod25.z.enum(LLMStepType),
|
|
1905
|
+
model: import_zod25.z.string(),
|
|
1906
|
+
provider: import_zod25.z.string(),
|
|
1907
|
+
startedAt: import_zod25.z.string(),
|
|
1908
|
+
durationMs: import_zod25.z.number(),
|
|
1932
1909
|
tokenUsage: TokenUsageSchema,
|
|
1933
|
-
costUsd:
|
|
1934
|
-
toolName:
|
|
1935
|
-
toolArguments:
|
|
1936
|
-
inputPreview:
|
|
1937
|
-
outputPreview:
|
|
1938
|
-
success:
|
|
1939
|
-
error:
|
|
1940
|
-
});
|
|
1941
|
-
var LLMBreakdownStatsSchema =
|
|
1942
|
-
count:
|
|
1943
|
-
durationMs:
|
|
1944
|
-
tokens:
|
|
1945
|
-
costUsd:
|
|
1946
|
-
});
|
|
1947
|
-
var LLMTraceSummarySchema =
|
|
1948
|
-
totalSteps:
|
|
1949
|
-
totalDurationMs:
|
|
1910
|
+
costUsd: import_zod25.z.number(),
|
|
1911
|
+
toolName: import_zod25.z.string().optional(),
|
|
1912
|
+
toolArguments: import_zod25.z.string().optional(),
|
|
1913
|
+
inputPreview: import_zod25.z.string().optional(),
|
|
1914
|
+
outputPreview: import_zod25.z.string().optional(),
|
|
1915
|
+
success: import_zod25.z.boolean(),
|
|
1916
|
+
error: import_zod25.z.string().optional()
|
|
1917
|
+
});
|
|
1918
|
+
var LLMBreakdownStatsSchema = import_zod25.z.object({
|
|
1919
|
+
count: import_zod25.z.number(),
|
|
1920
|
+
durationMs: import_zod25.z.number(),
|
|
1921
|
+
tokens: import_zod25.z.number(),
|
|
1922
|
+
costUsd: import_zod25.z.number()
|
|
1923
|
+
});
|
|
1924
|
+
var LLMTraceSummarySchema = import_zod25.z.object({
|
|
1925
|
+
totalSteps: import_zod25.z.number(),
|
|
1926
|
+
totalDurationMs: import_zod25.z.number(),
|
|
1950
1927
|
totalTokens: TokenUsageSchema,
|
|
1951
|
-
totalCostUsd:
|
|
1952
|
-
stepTypeBreakdown:
|
|
1953
|
-
modelBreakdown:
|
|
1954
|
-
modelsUsed:
|
|
1955
|
-
});
|
|
1956
|
-
var LLMTraceSchema =
|
|
1957
|
-
id:
|
|
1958
|
-
steps:
|
|
1928
|
+
totalCostUsd: import_zod25.z.number(),
|
|
1929
|
+
stepTypeBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
1930
|
+
modelBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema),
|
|
1931
|
+
modelsUsed: import_zod25.z.array(import_zod25.z.string())
|
|
1932
|
+
});
|
|
1933
|
+
var LLMTraceSchema = import_zod25.z.object({
|
|
1934
|
+
id: import_zod25.z.string(),
|
|
1935
|
+
steps: import_zod25.z.array(LLMTraceStepSchema),
|
|
1959
1936
|
summary: LLMTraceSummarySchema
|
|
1960
1937
|
});
|
|
1961
1938
|
|
|
1962
1939
|
// src/evaluation/eval-result.ts
|
|
1963
|
-
var
|
|
1940
|
+
var import_zod29 = require("zod");
|
|
1964
1941
|
|
|
1965
1942
|
// src/evaluation/eval-run.ts
|
|
1966
|
-
var
|
|
1943
|
+
var import_zod27 = require("zod");
|
|
1967
1944
|
|
|
1968
1945
|
// src/evaluation/live-trace.ts
|
|
1969
|
-
var
|
|
1946
|
+
var import_zod26 = require("zod");
|
|
1970
1947
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
1971
1948
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
1972
1949
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -1980,37 +1957,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
1980
1957
|
LiveTraceEventType2["USER"] = "user";
|
|
1981
1958
|
return LiveTraceEventType2;
|
|
1982
1959
|
})(LiveTraceEventType || {});
|
|
1983
|
-
var LiveTraceEventSchema =
|
|
1960
|
+
var LiveTraceEventSchema = import_zod26.z.object({
|
|
1984
1961
|
/** The evaluation run ID */
|
|
1985
|
-
evalRunId:
|
|
1962
|
+
evalRunId: import_zod26.z.string(),
|
|
1986
1963
|
/** The scenario ID being executed */
|
|
1987
|
-
scenarioId:
|
|
1964
|
+
scenarioId: import_zod26.z.string(),
|
|
1988
1965
|
/** The scenario name for display */
|
|
1989
|
-
scenarioName:
|
|
1966
|
+
scenarioName: import_zod26.z.string(),
|
|
1990
1967
|
/** The target ID (skill, agent, etc.) */
|
|
1991
|
-
targetId:
|
|
1968
|
+
targetId: import_zod26.z.string(),
|
|
1992
1969
|
/** The target name for display */
|
|
1993
|
-
targetName:
|
|
1970
|
+
targetName: import_zod26.z.string(),
|
|
1994
1971
|
/** Step number in the current scenario execution */
|
|
1995
|
-
stepNumber:
|
|
1972
|
+
stepNumber: import_zod26.z.number(),
|
|
1996
1973
|
/** Type of trace event */
|
|
1997
|
-
type:
|
|
1974
|
+
type: import_zod26.z.enum(LiveTraceEventType),
|
|
1998
1975
|
/** Tool name if this is a tool_use event */
|
|
1999
|
-
toolName:
|
|
1976
|
+
toolName: import_zod26.z.string().optional(),
|
|
2000
1977
|
/** Tool arguments preview (truncated JSON) */
|
|
2001
|
-
toolArgs:
|
|
1978
|
+
toolArgs: import_zod26.z.string().optional(),
|
|
2002
1979
|
/** Output preview (truncated text) */
|
|
2003
|
-
outputPreview:
|
|
1980
|
+
outputPreview: import_zod26.z.string().optional(),
|
|
2004
1981
|
/** File path for file operations */
|
|
2005
|
-
filePath:
|
|
1982
|
+
filePath: import_zod26.z.string().optional(),
|
|
2006
1983
|
/** Elapsed time in milliseconds for progress events */
|
|
2007
|
-
elapsedMs:
|
|
1984
|
+
elapsedMs: import_zod26.z.number().optional(),
|
|
2008
1985
|
/** Thinking/reasoning text from Claude */
|
|
2009
|
-
thinking:
|
|
1986
|
+
thinking: import_zod26.z.string().optional(),
|
|
2010
1987
|
/** Timestamp when this event occurred */
|
|
2011
|
-
timestamp:
|
|
1988
|
+
timestamp: import_zod26.z.string(),
|
|
2012
1989
|
/** Whether this is the final event for this scenario */
|
|
2013
|
-
isComplete:
|
|
1990
|
+
isComplete: import_zod26.z.boolean()
|
|
2014
1991
|
});
|
|
2015
1992
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
2016
1993
|
function parseTraceEventLine(line) {
|
|
@@ -2038,14 +2015,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
2038
2015
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
2039
2016
|
return TriggerType2;
|
|
2040
2017
|
})(TriggerType || {});
|
|
2041
|
-
var TriggerMetadataSchema =
|
|
2042
|
-
version:
|
|
2043
|
-
resourceUpdated:
|
|
2018
|
+
var TriggerMetadataSchema = import_zod27.z.object({
|
|
2019
|
+
version: import_zod27.z.string().optional(),
|
|
2020
|
+
resourceUpdated: import_zod27.z.array(import_zod27.z.string()).optional()
|
|
2044
2021
|
});
|
|
2045
|
-
var TriggerSchema =
|
|
2046
|
-
id:
|
|
2022
|
+
var TriggerSchema = import_zod27.z.object({
|
|
2023
|
+
id: import_zod27.z.string(),
|
|
2047
2024
|
metadata: TriggerMetadataSchema.optional(),
|
|
2048
|
-
type:
|
|
2025
|
+
type: import_zod27.z.enum(TriggerType)
|
|
2049
2026
|
});
|
|
2050
2027
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
2051
2028
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -2063,28 +2040,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
2063
2040
|
FailureSeverity2["LOW"] = "low";
|
|
2064
2041
|
return FailureSeverity2;
|
|
2065
2042
|
})(FailureSeverity || {});
|
|
2066
|
-
var DiffLineTypeSchema =
|
|
2067
|
-
var DiffLineSchema =
|
|
2043
|
+
var DiffLineTypeSchema = import_zod27.z.enum(["added", "removed", "unchanged"]);
|
|
2044
|
+
var DiffLineSchema = import_zod27.z.object({
|
|
2068
2045
|
type: DiffLineTypeSchema,
|
|
2069
|
-
content:
|
|
2070
|
-
lineNumber:
|
|
2071
|
-
});
|
|
2072
|
-
var DiffContentSchema =
|
|
2073
|
-
path:
|
|
2074
|
-
expected:
|
|
2075
|
-
actual:
|
|
2076
|
-
diffLines:
|
|
2077
|
-
renamedFrom:
|
|
2078
|
-
});
|
|
2079
|
-
var CommandExecutionSchema =
|
|
2080
|
-
command:
|
|
2081
|
-
exitCode:
|
|
2082
|
-
output:
|
|
2083
|
-
duration:
|
|
2084
|
-
});
|
|
2085
|
-
var FileModificationSchema =
|
|
2086
|
-
path:
|
|
2087
|
-
action:
|
|
2046
|
+
content: import_zod27.z.string(),
|
|
2047
|
+
lineNumber: import_zod27.z.number()
|
|
2048
|
+
});
|
|
2049
|
+
var DiffContentSchema = import_zod27.z.object({
|
|
2050
|
+
path: import_zod27.z.string(),
|
|
2051
|
+
expected: import_zod27.z.string(),
|
|
2052
|
+
actual: import_zod27.z.string(),
|
|
2053
|
+
diffLines: import_zod27.z.array(DiffLineSchema),
|
|
2054
|
+
renamedFrom: import_zod27.z.string().optional()
|
|
2055
|
+
});
|
|
2056
|
+
var CommandExecutionSchema = import_zod27.z.object({
|
|
2057
|
+
command: import_zod27.z.string(),
|
|
2058
|
+
exitCode: import_zod27.z.number(),
|
|
2059
|
+
output: import_zod27.z.string().optional(),
|
|
2060
|
+
duration: import_zod27.z.number()
|
|
2061
|
+
});
|
|
2062
|
+
var FileModificationSchema = import_zod27.z.object({
|
|
2063
|
+
path: import_zod27.z.string(),
|
|
2064
|
+
action: import_zod27.z.enum(["created", "modified", "deleted"])
|
|
2088
2065
|
});
|
|
2089
2066
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
2090
2067
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -2092,87 +2069,87 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
2092
2069
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
2093
2070
|
return TemplateFileStatus2;
|
|
2094
2071
|
})(TemplateFileStatus || {});
|
|
2095
|
-
var TemplateFileSchema =
|
|
2072
|
+
var TemplateFileSchema = import_zod27.z.object({
|
|
2096
2073
|
/** Relative path within the template */
|
|
2097
|
-
path:
|
|
2074
|
+
path: import_zod27.z.string(),
|
|
2098
2075
|
/** Full file content after execution */
|
|
2099
|
-
content:
|
|
2076
|
+
content: import_zod27.z.string(),
|
|
2100
2077
|
/** File status (new, modified, unchanged) */
|
|
2101
|
-
status:
|
|
2102
|
-
});
|
|
2103
|
-
var ApiCallSchema =
|
|
2104
|
-
endpoint:
|
|
2105
|
-
tokensUsed:
|
|
2106
|
-
duration:
|
|
2107
|
-
});
|
|
2108
|
-
var ExecutionTraceSchema =
|
|
2109
|
-
commands:
|
|
2110
|
-
filesModified:
|
|
2111
|
-
apiCalls:
|
|
2112
|
-
totalDuration:
|
|
2113
|
-
});
|
|
2114
|
-
var FailureAnalysisSchema =
|
|
2115
|
-
category:
|
|
2116
|
-
severity:
|
|
2117
|
-
summary:
|
|
2118
|
-
details:
|
|
2119
|
-
rootCause:
|
|
2120
|
-
suggestedFix:
|
|
2121
|
-
relatedAssertions:
|
|
2122
|
-
codeSnippet:
|
|
2123
|
-
similarIssues:
|
|
2124
|
-
patternId:
|
|
2078
|
+
status: import_zod27.z.enum(["new", "modified", "unchanged"])
|
|
2079
|
+
});
|
|
2080
|
+
var ApiCallSchema = import_zod27.z.object({
|
|
2081
|
+
endpoint: import_zod27.z.string(),
|
|
2082
|
+
tokensUsed: import_zod27.z.number(),
|
|
2083
|
+
duration: import_zod27.z.number()
|
|
2084
|
+
});
|
|
2085
|
+
var ExecutionTraceSchema = import_zod27.z.object({
|
|
2086
|
+
commands: import_zod27.z.array(CommandExecutionSchema),
|
|
2087
|
+
filesModified: import_zod27.z.array(FileModificationSchema),
|
|
2088
|
+
apiCalls: import_zod27.z.array(ApiCallSchema),
|
|
2089
|
+
totalDuration: import_zod27.z.number()
|
|
2090
|
+
});
|
|
2091
|
+
var FailureAnalysisSchema = import_zod27.z.object({
|
|
2092
|
+
category: import_zod27.z.enum(FailureCategory),
|
|
2093
|
+
severity: import_zod27.z.enum(FailureSeverity),
|
|
2094
|
+
summary: import_zod27.z.string(),
|
|
2095
|
+
details: import_zod27.z.string(),
|
|
2096
|
+
rootCause: import_zod27.z.string(),
|
|
2097
|
+
suggestedFix: import_zod27.z.string(),
|
|
2098
|
+
relatedAssertions: import_zod27.z.array(import_zod27.z.string()),
|
|
2099
|
+
codeSnippet: import_zod27.z.string().optional(),
|
|
2100
|
+
similarIssues: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2101
|
+
patternId: import_zod27.z.string().optional(),
|
|
2125
2102
|
// Extended fields for detailed debugging
|
|
2126
2103
|
diff: DiffContentSchema.optional(),
|
|
2127
2104
|
executionTrace: ExecutionTraceSchema.optional()
|
|
2128
2105
|
});
|
|
2129
2106
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
2130
2107
|
/** Agent ID for this run */
|
|
2131
|
-
agentId:
|
|
2108
|
+
agentId: import_zod27.z.string().optional(),
|
|
2132
2109
|
/** Preset ID that originated this run (optional) */
|
|
2133
|
-
presetId:
|
|
2110
|
+
presetId: import_zod27.z.string().optional(),
|
|
2134
2111
|
/** Skill IDs for this run */
|
|
2135
|
-
skillIds:
|
|
2112
|
+
skillIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2136
2113
|
/** Map of skillId to skillVersionId for this run */
|
|
2137
|
-
skillVersions:
|
|
2114
|
+
skillVersions: import_zod27.z.record(import_zod27.z.string(), import_zod27.z.string()).optional(),
|
|
2138
2115
|
/** Scenario IDs to run (always present — resolved server-side from tags when needed) */
|
|
2139
|
-
scenarioIds:
|
|
2116
|
+
scenarioIds: import_zod27.z.array(import_zod27.z.string()),
|
|
2140
2117
|
/** Current status */
|
|
2141
2118
|
status: EvalStatusSchema,
|
|
2142
2119
|
/** Progress percentage (0-100) */
|
|
2143
|
-
progress:
|
|
2120
|
+
progress: import_zod27.z.number(),
|
|
2144
2121
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
2145
|
-
results:
|
|
2122
|
+
results: import_zod27.z.array(import_zod27.z.lazy(() => EvalRunResultSchema)),
|
|
2146
2123
|
/** Aggregated metrics across all results */
|
|
2147
2124
|
aggregateMetrics: EvalMetricsSchema,
|
|
2148
2125
|
/** Failure analyses */
|
|
2149
|
-
failureAnalyses:
|
|
2126
|
+
failureAnalyses: import_zod27.z.array(FailureAnalysisSchema).optional(),
|
|
2150
2127
|
/** Aggregated LLM trace summary */
|
|
2151
2128
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
2152
2129
|
/** What triggered this run */
|
|
2153
2130
|
trigger: TriggerSchema.optional(),
|
|
2154
2131
|
/** When the run started (set when evaluation is triggered) */
|
|
2155
|
-
startedAt:
|
|
2132
|
+
startedAt: import_zod27.z.string().optional(),
|
|
2156
2133
|
/** When the run completed */
|
|
2157
|
-
completedAt:
|
|
2134
|
+
completedAt: import_zod27.z.string().optional(),
|
|
2158
2135
|
/** Live trace events captured during execution (for playback on results page) */
|
|
2159
|
-
liveTraceEvents:
|
|
2136
|
+
liveTraceEvents: import_zod27.z.array(LiveTraceEventSchema).optional(),
|
|
2160
2137
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
2161
|
-
jobId:
|
|
2138
|
+
jobId: import_zod27.z.string().optional(),
|
|
2162
2139
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
2163
|
-
jobStatus:
|
|
2140
|
+
jobStatus: import_zod27.z.string().optional(),
|
|
2164
2141
|
/** Remote job error message if the job failed */
|
|
2165
|
-
jobError:
|
|
2142
|
+
jobError: import_zod27.z.string().optional(),
|
|
2166
2143
|
/** Timestamp of the last job status check */
|
|
2167
|
-
jobStatusCheckedAt:
|
|
2144
|
+
jobStatusCheckedAt: import_zod27.z.string().optional(),
|
|
2168
2145
|
/** MCP server IDs to enable for this run (optional) */
|
|
2169
|
-
mcpIds:
|
|
2146
|
+
mcpIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2170
2147
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
2171
|
-
subAgentIds:
|
|
2148
|
+
subAgentIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2172
2149
|
/** Rule IDs to enable for this run (optional) */
|
|
2173
|
-
ruleIds:
|
|
2150
|
+
ruleIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2174
2151
|
/** Tags used to select scenarios for this run (for traceability) */
|
|
2175
|
-
tags:
|
|
2152
|
+
tags: import_zod27.z.array(import_zod27.z.string()).optional()
|
|
2176
2153
|
});
|
|
2177
2154
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
2178
2155
|
id: true,
|
|
@@ -2187,60 +2164,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
2187
2164
|
scenarioIds: true
|
|
2188
2165
|
}).extend({
|
|
2189
2166
|
/** Optional on input — backend resolves from tags when not provided */
|
|
2190
|
-
scenarioIds:
|
|
2167
|
+
scenarioIds: import_zod27.z.array(import_zod27.z.string()).optional()
|
|
2191
2168
|
}).refine(
|
|
2192
2169
|
(data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
|
|
2193
2170
|
{ message: "Either scenarioIds or tags must be provided" }
|
|
2194
2171
|
);
|
|
2195
|
-
var EvaluationProgressSchema =
|
|
2196
|
-
runId:
|
|
2197
|
-
targetId:
|
|
2198
|
-
totalScenarios:
|
|
2199
|
-
completedScenarios:
|
|
2200
|
-
scenarioProgress:
|
|
2201
|
-
|
|
2202
|
-
scenarioId:
|
|
2203
|
-
currentStep:
|
|
2204
|
-
error:
|
|
2172
|
+
var EvaluationProgressSchema = import_zod27.z.object({
|
|
2173
|
+
runId: import_zod27.z.string(),
|
|
2174
|
+
targetId: import_zod27.z.string(),
|
|
2175
|
+
totalScenarios: import_zod27.z.number(),
|
|
2176
|
+
completedScenarios: import_zod27.z.number(),
|
|
2177
|
+
scenarioProgress: import_zod27.z.array(
|
|
2178
|
+
import_zod27.z.object({
|
|
2179
|
+
scenarioId: import_zod27.z.string(),
|
|
2180
|
+
currentStep: import_zod27.z.string(),
|
|
2181
|
+
error: import_zod27.z.string().optional()
|
|
2205
2182
|
})
|
|
2206
2183
|
),
|
|
2207
|
-
createdAt:
|
|
2208
|
-
});
|
|
2209
|
-
var EvaluationLogSchema =
|
|
2210
|
-
runId:
|
|
2211
|
-
scenarioId:
|
|
2212
|
-
log:
|
|
2213
|
-
level:
|
|
2214
|
-
message:
|
|
2215
|
-
args:
|
|
2216
|
-
error:
|
|
2184
|
+
createdAt: import_zod27.z.number()
|
|
2185
|
+
});
|
|
2186
|
+
var EvaluationLogSchema = import_zod27.z.object({
|
|
2187
|
+
runId: import_zod27.z.string(),
|
|
2188
|
+
scenarioId: import_zod27.z.string(),
|
|
2189
|
+
log: import_zod27.z.object({
|
|
2190
|
+
level: import_zod27.z.enum(["info", "error", "debug"]),
|
|
2191
|
+
message: import_zod27.z.string().optional(),
|
|
2192
|
+
args: import_zod27.z.array(import_zod27.z.any()).optional(),
|
|
2193
|
+
error: import_zod27.z.string().optional()
|
|
2217
2194
|
})
|
|
2218
2195
|
});
|
|
2219
2196
|
var LLM_TIMEOUT = 12e4;
|
|
2220
2197
|
|
|
2221
2198
|
// src/evaluation/conversation.ts
|
|
2222
|
-
var
|
|
2223
|
-
var TextBlockSchema =
|
|
2224
|
-
type:
|
|
2225
|
-
text:
|
|
2226
|
-
});
|
|
2227
|
-
var ThinkingBlockSchema =
|
|
2228
|
-
type:
|
|
2229
|
-
thinking:
|
|
2230
|
-
});
|
|
2231
|
-
var ToolUseBlockSchema =
|
|
2232
|
-
type:
|
|
2233
|
-
toolName:
|
|
2234
|
-
toolId:
|
|
2235
|
-
input:
|
|
2236
|
-
});
|
|
2237
|
-
var ToolResultBlockSchema =
|
|
2238
|
-
type:
|
|
2239
|
-
toolUseId:
|
|
2240
|
-
content:
|
|
2241
|
-
isError:
|
|
2242
|
-
});
|
|
2243
|
-
var ConversationBlockSchema =
|
|
2199
|
+
var import_zod28 = require("zod");
|
|
2200
|
+
var TextBlockSchema = import_zod28.z.object({
|
|
2201
|
+
type: import_zod28.z.literal("text"),
|
|
2202
|
+
text: import_zod28.z.string()
|
|
2203
|
+
});
|
|
2204
|
+
var ThinkingBlockSchema = import_zod28.z.object({
|
|
2205
|
+
type: import_zod28.z.literal("thinking"),
|
|
2206
|
+
thinking: import_zod28.z.string()
|
|
2207
|
+
});
|
|
2208
|
+
var ToolUseBlockSchema = import_zod28.z.object({
|
|
2209
|
+
type: import_zod28.z.literal("tool_use"),
|
|
2210
|
+
toolName: import_zod28.z.string(),
|
|
2211
|
+
toolId: import_zod28.z.string(),
|
|
2212
|
+
input: import_zod28.z.unknown()
|
|
2213
|
+
});
|
|
2214
|
+
var ToolResultBlockSchema = import_zod28.z.object({
|
|
2215
|
+
type: import_zod28.z.literal("tool_result"),
|
|
2216
|
+
toolUseId: import_zod28.z.string(),
|
|
2217
|
+
content: import_zod28.z.string(),
|
|
2218
|
+
isError: import_zod28.z.boolean().optional()
|
|
2219
|
+
});
|
|
2220
|
+
var ConversationBlockSchema = import_zod28.z.discriminatedUnion("type", [
|
|
2244
2221
|
TextBlockSchema,
|
|
2245
2222
|
ThinkingBlockSchema,
|
|
2246
2223
|
ToolUseBlockSchema,
|
|
@@ -2251,18 +2228,18 @@ var ConversationMessageRoles = [
|
|
|
2251
2228
|
"user",
|
|
2252
2229
|
"system"
|
|
2253
2230
|
];
|
|
2254
|
-
var ConversationMessageSchema =
|
|
2255
|
-
role:
|
|
2256
|
-
content:
|
|
2257
|
-
timestamp:
|
|
2231
|
+
var ConversationMessageSchema = import_zod28.z.object({
|
|
2232
|
+
role: import_zod28.z.enum(ConversationMessageRoles),
|
|
2233
|
+
content: import_zod28.z.array(ConversationBlockSchema),
|
|
2234
|
+
timestamp: import_zod28.z.string()
|
|
2258
2235
|
});
|
|
2259
|
-
var ScenarioConversationSchema =
|
|
2260
|
-
id:
|
|
2261
|
-
projectId:
|
|
2262
|
-
evalRunId:
|
|
2263
|
-
resultId:
|
|
2264
|
-
messages:
|
|
2265
|
-
createdAt:
|
|
2236
|
+
var ScenarioConversationSchema = import_zod28.z.object({
|
|
2237
|
+
id: import_zod28.z.string(),
|
|
2238
|
+
projectId: import_zod28.z.string(),
|
|
2239
|
+
evalRunId: import_zod28.z.string(),
|
|
2240
|
+
resultId: import_zod28.z.string(),
|
|
2241
|
+
messages: import_zod28.z.array(ConversationMessageSchema),
|
|
2242
|
+
createdAt: import_zod28.z.string()
|
|
2266
2243
|
});
|
|
2267
2244
|
|
|
2268
2245
|
// src/evaluation/eval-result.ts
|
|
@@ -2273,100 +2250,100 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
2273
2250
|
AssertionResultStatus2["ERROR"] = "error";
|
|
2274
2251
|
return AssertionResultStatus2;
|
|
2275
2252
|
})(AssertionResultStatus || {});
|
|
2276
|
-
var AssertionResultSchema =
|
|
2277
|
-
id:
|
|
2278
|
-
assertionId:
|
|
2279
|
-
assertionType:
|
|
2280
|
-
assertionName:
|
|
2281
|
-
status:
|
|
2282
|
-
message:
|
|
2283
|
-
expected:
|
|
2284
|
-
actual:
|
|
2285
|
-
duration:
|
|
2286
|
-
details:
|
|
2287
|
-
llmTraceSteps:
|
|
2288
|
-
});
|
|
2289
|
-
var EvalRunResultSchema =
|
|
2290
|
-
id:
|
|
2291
|
-
targetId:
|
|
2292
|
-
targetName:
|
|
2253
|
+
var AssertionResultSchema = import_zod29.z.object({
|
|
2254
|
+
id: import_zod29.z.string(),
|
|
2255
|
+
assertionId: import_zod29.z.string(),
|
|
2256
|
+
assertionType: import_zod29.z.string(),
|
|
2257
|
+
assertionName: import_zod29.z.string(),
|
|
2258
|
+
status: import_zod29.z.enum(AssertionResultStatus),
|
|
2259
|
+
message: import_zod29.z.string().optional(),
|
|
2260
|
+
expected: import_zod29.z.string().optional(),
|
|
2261
|
+
actual: import_zod29.z.string().optional(),
|
|
2262
|
+
duration: import_zod29.z.number().optional(),
|
|
2263
|
+
details: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.unknown()).optional(),
|
|
2264
|
+
llmTraceSteps: import_zod29.z.array(LLMTraceStepSchema).optional()
|
|
2265
|
+
});
|
|
2266
|
+
var EvalRunResultSchema = import_zod29.z.object({
|
|
2267
|
+
id: import_zod29.z.string(),
|
|
2268
|
+
targetId: import_zod29.z.string(),
|
|
2269
|
+
targetName: import_zod29.z.string().optional(),
|
|
2293
2270
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
2294
|
-
skillVersionId:
|
|
2271
|
+
skillVersionId: import_zod29.z.string().optional(),
|
|
2295
2272
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
2296
|
-
skillVersion:
|
|
2297
|
-
scenarioId:
|
|
2298
|
-
scenarioName:
|
|
2273
|
+
skillVersion: import_zod29.z.string().optional(),
|
|
2274
|
+
scenarioId: import_zod29.z.string(),
|
|
2275
|
+
scenarioName: import_zod29.z.string(),
|
|
2299
2276
|
modelConfig: ModelConfigSchema.optional(),
|
|
2300
|
-
assertionResults:
|
|
2277
|
+
assertionResults: import_zod29.z.array(AssertionResultSchema),
|
|
2301
2278
|
metrics: EvalMetricsSchema.optional(),
|
|
2302
|
-
passed:
|
|
2303
|
-
failed:
|
|
2304
|
-
passRate:
|
|
2305
|
-
duration:
|
|
2306
|
-
outputText:
|
|
2307
|
-
files:
|
|
2308
|
-
fileDiffs:
|
|
2279
|
+
passed: import_zod29.z.number(),
|
|
2280
|
+
failed: import_zod29.z.number(),
|
|
2281
|
+
passRate: import_zod29.z.number(),
|
|
2282
|
+
duration: import_zod29.z.number(),
|
|
2283
|
+
outputText: import_zod29.z.string().optional(),
|
|
2284
|
+
files: import_zod29.z.array(ExpectedFileSchema).optional(),
|
|
2285
|
+
fileDiffs: import_zod29.z.array(DiffContentSchema).optional(),
|
|
2309
2286
|
/** Full template files after execution with status indicators */
|
|
2310
|
-
templateFiles:
|
|
2311
|
-
startedAt:
|
|
2312
|
-
completedAt:
|
|
2287
|
+
templateFiles: import_zod29.z.array(TemplateFileSchema).optional(),
|
|
2288
|
+
startedAt: import_zod29.z.string().optional(),
|
|
2289
|
+
completedAt: import_zod29.z.string().optional(),
|
|
2313
2290
|
llmTrace: LLMTraceSchema.optional(),
|
|
2314
2291
|
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
2315
|
-
conversation:
|
|
2316
|
-
});
|
|
2317
|
-
var PromptResultSchema =
|
|
2318
|
-
text:
|
|
2319
|
-
files:
|
|
2320
|
-
finishReason:
|
|
2321
|
-
reasoning:
|
|
2322
|
-
reasoningDetails:
|
|
2323
|
-
toolCalls:
|
|
2324
|
-
toolResults:
|
|
2325
|
-
warnings:
|
|
2326
|
-
sources:
|
|
2327
|
-
steps:
|
|
2328
|
-
generationTimeMs:
|
|
2329
|
-
prompt:
|
|
2330
|
-
systemPrompt:
|
|
2331
|
-
usage:
|
|
2332
|
-
totalTokens:
|
|
2333
|
-
totalMicrocentsSpent:
|
|
2292
|
+
conversation: import_zod29.z.array(ConversationMessageSchema).optional()
|
|
2293
|
+
});
|
|
2294
|
+
var PromptResultSchema = import_zod29.z.object({
|
|
2295
|
+
text: import_zod29.z.string(),
|
|
2296
|
+
files: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2297
|
+
finishReason: import_zod29.z.string().optional(),
|
|
2298
|
+
reasoning: import_zod29.z.string().optional(),
|
|
2299
|
+
reasoningDetails: import_zod29.z.unknown().optional(),
|
|
2300
|
+
toolCalls: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2301
|
+
toolResults: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2302
|
+
warnings: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2303
|
+
sources: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2304
|
+
steps: import_zod29.z.array(import_zod29.z.unknown()),
|
|
2305
|
+
generationTimeMs: import_zod29.z.number(),
|
|
2306
|
+
prompt: import_zod29.z.string(),
|
|
2307
|
+
systemPrompt: import_zod29.z.string(),
|
|
2308
|
+
usage: import_zod29.z.object({
|
|
2309
|
+
totalTokens: import_zod29.z.number().optional(),
|
|
2310
|
+
totalMicrocentsSpent: import_zod29.z.number().optional()
|
|
2334
2311
|
})
|
|
2335
2312
|
});
|
|
2336
|
-
var EvaluationResultSchema =
|
|
2337
|
-
id:
|
|
2338
|
-
runId:
|
|
2339
|
-
timestamp:
|
|
2313
|
+
var EvaluationResultSchema = import_zod29.z.object({
|
|
2314
|
+
id: import_zod29.z.string(),
|
|
2315
|
+
runId: import_zod29.z.string(),
|
|
2316
|
+
timestamp: import_zod29.z.number(),
|
|
2340
2317
|
promptResult: PromptResultSchema,
|
|
2341
|
-
testResults:
|
|
2342
|
-
tags:
|
|
2343
|
-
feedback:
|
|
2344
|
-
score:
|
|
2345
|
-
suiteId:
|
|
2346
|
-
});
|
|
2347
|
-
var LeanEvaluationResultSchema =
|
|
2348
|
-
id:
|
|
2349
|
-
runId:
|
|
2350
|
-
timestamp:
|
|
2351
|
-
tags:
|
|
2352
|
-
scenarioId:
|
|
2353
|
-
scenarioVersion:
|
|
2354
|
-
targetId:
|
|
2355
|
-
targetVersion:
|
|
2356
|
-
suiteId:
|
|
2357
|
-
score:
|
|
2358
|
-
time:
|
|
2359
|
-
microcentsSpent:
|
|
2318
|
+
testResults: import_zod29.z.array(import_zod29.z.unknown()),
|
|
2319
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
2320
|
+
feedback: import_zod29.z.string().optional(),
|
|
2321
|
+
score: import_zod29.z.number(),
|
|
2322
|
+
suiteId: import_zod29.z.string().optional()
|
|
2323
|
+
});
|
|
2324
|
+
var LeanEvaluationResultSchema = import_zod29.z.object({
|
|
2325
|
+
id: import_zod29.z.string(),
|
|
2326
|
+
runId: import_zod29.z.string(),
|
|
2327
|
+
timestamp: import_zod29.z.number(),
|
|
2328
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
2329
|
+
scenarioId: import_zod29.z.string(),
|
|
2330
|
+
scenarioVersion: import_zod29.z.number().optional(),
|
|
2331
|
+
targetId: import_zod29.z.string(),
|
|
2332
|
+
targetVersion: import_zod29.z.number().optional(),
|
|
2333
|
+
suiteId: import_zod29.z.string().optional(),
|
|
2334
|
+
score: import_zod29.z.number(),
|
|
2335
|
+
time: import_zod29.z.number().optional(),
|
|
2336
|
+
microcentsSpent: import_zod29.z.number().optional()
|
|
2360
2337
|
});
|
|
2361
2338
|
|
|
2362
2339
|
// src/project/project.ts
|
|
2363
|
-
var
|
|
2340
|
+
var import_zod30 = require("zod");
|
|
2364
2341
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
2365
|
-
appId:
|
|
2366
|
-
appSecret:
|
|
2367
|
-
useWixAuth:
|
|
2368
|
-
useBase44Auth:
|
|
2369
|
-
scenarioTags:
|
|
2342
|
+
appId: import_zod30.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
2343
|
+
appSecret: import_zod30.z.string().optional().describe("The secret of the app in Dev Center"),
|
|
2344
|
+
useWixAuth: import_zod30.z.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
|
|
2345
|
+
useBase44Auth: import_zod30.z.boolean().optional().describe("Enable Base44 auth for evaluations"),
|
|
2346
|
+
scenarioTags: import_zod30.z.array(import_zod30.z.string()).optional().describe("Project-level tag vocabulary for scenarios")
|
|
2370
2347
|
});
|
|
2371
2348
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
2372
2349
|
id: true,
|
|
@@ -2419,7 +2396,7 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2419
2396
|
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
2420
2397
|
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
2421
2398
|
name: "Tool Called With Param",
|
|
2422
|
-
description: "Check that a tool was called with expected parameters",
|
|
2399
|
+
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
2423
2400
|
type: "tool_called_with_param",
|
|
2424
2401
|
parameters: [
|
|
2425
2402
|
{
|
|
@@ -2433,6 +2410,14 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2433
2410
|
label: "Expected Parameters (JSON, substring match)",
|
|
2434
2411
|
type: "string",
|
|
2435
2412
|
required: true
|
|
2413
|
+
},
|
|
2414
|
+
{
|
|
2415
|
+
name: "requireSuccess",
|
|
2416
|
+
label: "Require Successful Call",
|
|
2417
|
+
type: "boolean",
|
|
2418
|
+
required: false,
|
|
2419
|
+
defaultValue: false,
|
|
2420
|
+
advanced: true
|
|
2436
2421
|
}
|
|
2437
2422
|
]
|
|
2438
2423
|
},
|