@wix/evalforge-types 0.52.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +397 -391
- package/build/index.js.map +4 -4
- package/build/index.mjs +394 -391
- package/build/index.mjs.map +4 -4
- package/build/types/assertion/assertion.d.ts +106 -4
- package/build/types/scenario/index.d.ts +0 -1
- package/build/types/scenario/test-scenario.d.ts +69 -30
- package/build/types/target/skill.d.ts +46 -0
- package/package.json +2 -2
- package/build/types/scenario/assertions.d.ts +0 -98
package/build/index.js
CHANGED
|
@@ -46,6 +46,9 @@ __export(index_exports, {
|
|
|
46
46
|
BuildCheckTestSchema: () => BuildCheckTestSchema,
|
|
47
47
|
BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
|
|
48
48
|
BuildPassedConfigSchema: () => BuildPassedConfigSchema,
|
|
49
|
+
BulkImportResultItemSchema: () => BulkImportResultItemSchema,
|
|
50
|
+
BulkImportResultSchema: () => BulkImportResultSchema,
|
|
51
|
+
BulkImportSkillsInputSchema: () => BulkImportSkillsInputSchema,
|
|
49
52
|
ClaudeModel: () => ClaudeModel2,
|
|
50
53
|
ClaudeModelSchema: () => ClaudeModelSchema,
|
|
51
54
|
CommandExecutionSchema: () => CommandExecutionSchema,
|
|
@@ -1359,6 +1362,21 @@ var UpdateSkillInputSchema = SkillInputBaseSchema.partial().refine(
|
|
|
1359
1362
|
var SkillWithLatestVersionSchema = SkillSchema.extend({
|
|
1360
1363
|
latestVersion: SkillVersionSchema.optional()
|
|
1361
1364
|
});
|
|
1365
|
+
var BulkImportSkillsInputSchema = import_zod7.z.object({
|
|
1366
|
+
source: GitHubSourceSchema
|
|
1367
|
+
});
|
|
1368
|
+
var BulkImportResultItemSchema = import_zod7.z.object({
|
|
1369
|
+
name: import_zod7.z.string(),
|
|
1370
|
+
status: import_zod7.z.enum(["created", "skipped", "failed"]),
|
|
1371
|
+
skillId: import_zod7.z.string().optional(),
|
|
1372
|
+
reason: import_zod7.z.string().optional()
|
|
1373
|
+
});
|
|
1374
|
+
var BulkImportResultSchema = import_zod7.z.object({
|
|
1375
|
+
created: import_zod7.z.number(),
|
|
1376
|
+
skipped: import_zod7.z.number(),
|
|
1377
|
+
failed: import_zod7.z.number(),
|
|
1378
|
+
items: import_zod7.z.array(BulkImportResultItemSchema)
|
|
1379
|
+
});
|
|
1362
1380
|
|
|
1363
1381
|
// src/target/sub-agent.ts
|
|
1364
1382
|
var import_zod8 = require("zod");
|
|
@@ -1591,85 +1609,34 @@ var TestSchema = import_zod20.z.discriminatedUnion("type", [
|
|
|
1591
1609
|
PlaywrightNLTestSchema
|
|
1592
1610
|
]);
|
|
1593
1611
|
|
|
1594
|
-
// src/scenario/assertions.ts
|
|
1595
|
-
var import_zod21 = require("zod");
|
|
1596
|
-
var SkillWasCalledAssertionSchema = import_zod21.z.object({
|
|
1597
|
-
type: import_zod21.z.literal("skill_was_called"),
|
|
1598
|
-
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
1599
|
-
skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
|
|
1600
|
-
});
|
|
1601
|
-
var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
|
|
1602
|
-
type: import_zod21.z.literal("tool_called_with_param"),
|
|
1603
|
-
/** Name of the tool that must have been called */
|
|
1604
|
-
toolName: import_zod21.z.string().min(1),
|
|
1605
|
-
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1606
|
-
expectedParams: import_zod21.z.string().min(1)
|
|
1607
|
-
});
|
|
1608
|
-
var BuildPassedAssertionSchema = import_zod21.z.object({
|
|
1609
|
-
type: import_zod21.z.literal("build_passed"),
|
|
1610
|
-
/** Command to run (default: "yarn build") */
|
|
1611
|
-
command: import_zod21.z.string().optional(),
|
|
1612
|
-
/** Expected exit code (default: 0) */
|
|
1613
|
-
expectedExitCode: import_zod21.z.number().int().optional()
|
|
1614
|
-
});
|
|
1615
|
-
var CostAssertionSchema = import_zod21.z.object({
|
|
1616
|
-
type: import_zod21.z.literal("cost"),
|
|
1617
|
-
/** Maximum allowed cost in USD */
|
|
1618
|
-
maxCostUsd: import_zod21.z.number().positive()
|
|
1619
|
-
});
|
|
1620
|
-
var LlmJudgeAssertionSchema = import_zod21.z.object({
|
|
1621
|
-
type: import_zod21.z.literal("llm_judge"),
|
|
1622
|
-
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1623
|
-
prompt: import_zod21.z.string(),
|
|
1624
|
-
/** Minimum score to pass (0-10, default 7) */
|
|
1625
|
-
minScore: import_zod21.z.number().int().min(0).max(10).optional(),
|
|
1626
|
-
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
1627
|
-
model: import_zod21.z.string().optional(),
|
|
1628
|
-
maxTokens: import_zod21.z.number().int().optional(),
|
|
1629
|
-
temperature: import_zod21.z.number().min(0).max(1).optional()
|
|
1630
|
-
});
|
|
1631
|
-
var TimeAssertionSchema = import_zod21.z.object({
|
|
1632
|
-
type: import_zod21.z.literal("time_limit"),
|
|
1633
|
-
/** Maximum allowed duration in milliseconds */
|
|
1634
|
-
maxDurationMs: import_zod21.z.number().int().positive()
|
|
1635
|
-
});
|
|
1636
|
-
var AssertionSchema = import_zod21.z.union([
|
|
1637
|
-
SkillWasCalledAssertionSchema,
|
|
1638
|
-
ToolCalledWithParamAssertionSchema,
|
|
1639
|
-
BuildPassedAssertionSchema,
|
|
1640
|
-
TimeAssertionSchema,
|
|
1641
|
-
CostAssertionSchema,
|
|
1642
|
-
LlmJudgeAssertionSchema
|
|
1643
|
-
]);
|
|
1644
|
-
|
|
1645
1612
|
// src/scenario/environment.ts
|
|
1646
|
-
var
|
|
1647
|
-
var LocalProjectConfigSchema =
|
|
1613
|
+
var import_zod21 = require("zod");
|
|
1614
|
+
var LocalProjectConfigSchema = import_zod21.z.object({
|
|
1648
1615
|
/** Template ID to use for the local project */
|
|
1649
|
-
templateId:
|
|
1616
|
+
templateId: import_zod21.z.string().optional(),
|
|
1650
1617
|
/** Files to create in the project */
|
|
1651
|
-
files:
|
|
1652
|
-
|
|
1653
|
-
path:
|
|
1654
|
-
content:
|
|
1618
|
+
files: import_zod21.z.array(
|
|
1619
|
+
import_zod21.z.object({
|
|
1620
|
+
path: import_zod21.z.string().min(1),
|
|
1621
|
+
content: import_zod21.z.string().min(1)
|
|
1655
1622
|
})
|
|
1656
1623
|
).optional()
|
|
1657
1624
|
});
|
|
1658
|
-
var MetaSiteConfigSchema =
|
|
1659
|
-
configurations:
|
|
1660
|
-
|
|
1661
|
-
name:
|
|
1662
|
-
apiCalls:
|
|
1663
|
-
|
|
1664
|
-
url:
|
|
1665
|
-
method:
|
|
1666
|
-
body:
|
|
1625
|
+
var MetaSiteConfigSchema = import_zod21.z.object({
|
|
1626
|
+
configurations: import_zod21.z.array(
|
|
1627
|
+
import_zod21.z.object({
|
|
1628
|
+
name: import_zod21.z.string().min(1),
|
|
1629
|
+
apiCalls: import_zod21.z.array(
|
|
1630
|
+
import_zod21.z.object({
|
|
1631
|
+
url: import_zod21.z.string().url(),
|
|
1632
|
+
method: import_zod21.z.enum(["POST", "PUT"]),
|
|
1633
|
+
body: import_zod21.z.string()
|
|
1667
1634
|
})
|
|
1668
1635
|
)
|
|
1669
1636
|
})
|
|
1670
1637
|
).optional()
|
|
1671
1638
|
});
|
|
1672
|
-
var EnvironmentSchema =
|
|
1639
|
+
var EnvironmentSchema = import_zod21.z.object({
|
|
1673
1640
|
/** Local project configuration */
|
|
1674
1641
|
localProject: LocalProjectConfigSchema.optional(),
|
|
1675
1642
|
/** Meta site configuration */
|
|
@@ -1677,11 +1644,11 @@ var EnvironmentSchema = import_zod22.z.object({
|
|
|
1677
1644
|
});
|
|
1678
1645
|
|
|
1679
1646
|
// src/scenario/test-scenario.ts
|
|
1680
|
-
var
|
|
1647
|
+
var import_zod23 = require("zod");
|
|
1681
1648
|
|
|
1682
1649
|
// src/assertion/assertion.ts
|
|
1683
|
-
var
|
|
1684
|
-
var AssertionTypeSchema =
|
|
1650
|
+
var import_zod22 = require("zod");
|
|
1651
|
+
var AssertionTypeSchema = import_zod22.z.enum([
|
|
1685
1652
|
"skill_was_called",
|
|
1686
1653
|
"tool_called_with_param",
|
|
1687
1654
|
"build_passed",
|
|
@@ -1689,59 +1656,61 @@ var AssertionTypeSchema = import_zod23.z.enum([
|
|
|
1689
1656
|
"cost",
|
|
1690
1657
|
"llm_judge"
|
|
1691
1658
|
]);
|
|
1692
|
-
var AssertionParameterTypeSchema =
|
|
1659
|
+
var AssertionParameterTypeSchema = import_zod22.z.enum([
|
|
1693
1660
|
"string",
|
|
1694
1661
|
"number",
|
|
1695
1662
|
"boolean"
|
|
1696
1663
|
]);
|
|
1697
|
-
var AssertionParameterSchema =
|
|
1664
|
+
var AssertionParameterSchema = import_zod22.z.object({
|
|
1698
1665
|
/** Parameter name (used as key in params object) */
|
|
1699
|
-
name:
|
|
1666
|
+
name: import_zod22.z.string().min(1),
|
|
1700
1667
|
/** Display label for the parameter */
|
|
1701
|
-
label:
|
|
1668
|
+
label: import_zod22.z.string().min(1),
|
|
1702
1669
|
/** Parameter type */
|
|
1703
1670
|
type: AssertionParameterTypeSchema,
|
|
1704
1671
|
/** Whether this parameter is required */
|
|
1705
|
-
required:
|
|
1672
|
+
required: import_zod22.z.boolean(),
|
|
1706
1673
|
/** Default value (optional, used when not provided) */
|
|
1707
|
-
defaultValue:
|
|
1674
|
+
defaultValue: import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean()]).optional(),
|
|
1708
1675
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
1709
|
-
advanced:
|
|
1676
|
+
advanced: import_zod22.z.boolean().optional()
|
|
1710
1677
|
});
|
|
1711
|
-
var ScenarioAssertionLinkSchema =
|
|
1678
|
+
var ScenarioAssertionLinkSchema = import_zod22.z.object({
|
|
1712
1679
|
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
1713
|
-
assertionId:
|
|
1680
|
+
assertionId: import_zod22.z.string(),
|
|
1714
1681
|
/** Parameter values for this assertion in this scenario */
|
|
1715
|
-
params:
|
|
1716
|
-
|
|
1717
|
-
|
|
1682
|
+
params: import_zod22.z.record(
|
|
1683
|
+
import_zod22.z.string(),
|
|
1684
|
+
import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean(), import_zod22.z.null()])
|
|
1718
1685
|
).optional()
|
|
1719
1686
|
});
|
|
1720
|
-
var SkillWasCalledConfigSchema =
|
|
1687
|
+
var SkillWasCalledConfigSchema = import_zod22.z.object({
|
|
1721
1688
|
/** Names of the skills that must have been called */
|
|
1722
|
-
skillNames:
|
|
1689
|
+
skillNames: import_zod22.z.array(import_zod22.z.string().min(1)).min(1)
|
|
1723
1690
|
});
|
|
1724
|
-
var CostConfigSchema =
|
|
1691
|
+
var CostConfigSchema = import_zod22.z.strictObject({
|
|
1725
1692
|
/** Maximum allowed cost in USD */
|
|
1726
|
-
maxCostUsd:
|
|
1693
|
+
maxCostUsd: import_zod22.z.number().positive()
|
|
1727
1694
|
});
|
|
1728
|
-
var ToolCalledWithParamConfigSchema =
|
|
1695
|
+
var ToolCalledWithParamConfigSchema = import_zod22.z.strictObject({
|
|
1729
1696
|
/** Name of the tool that must have been called */
|
|
1730
|
-
toolName:
|
|
1697
|
+
toolName: import_zod22.z.string().min(1),
|
|
1731
1698
|
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1732
|
-
expectedParams:
|
|
1699
|
+
expectedParams: import_zod22.z.string().min(1),
|
|
1700
|
+
/** If true, the matching tool call must also have succeeded (step.success === true) */
|
|
1701
|
+
requireSuccess: import_zod22.z.boolean().optional()
|
|
1733
1702
|
});
|
|
1734
|
-
var BuildPassedConfigSchema =
|
|
1703
|
+
var BuildPassedConfigSchema = import_zod22.z.strictObject({
|
|
1735
1704
|
/** Command to run (default: "yarn build") */
|
|
1736
|
-
command:
|
|
1705
|
+
command: import_zod22.z.string().optional(),
|
|
1737
1706
|
/** Expected exit code (default: 0) */
|
|
1738
|
-
expectedExitCode:
|
|
1707
|
+
expectedExitCode: import_zod22.z.number().int().optional()
|
|
1739
1708
|
});
|
|
1740
|
-
var TimeConfigSchema =
|
|
1709
|
+
var TimeConfigSchema = import_zod22.z.strictObject({
|
|
1741
1710
|
/** Maximum allowed duration in milliseconds */
|
|
1742
|
-
maxDurationMs:
|
|
1711
|
+
maxDurationMs: import_zod22.z.number().int().positive()
|
|
1743
1712
|
});
|
|
1744
|
-
var LlmJudgeConfigSchema =
|
|
1713
|
+
var LlmJudgeConfigSchema = import_zod22.z.object({
|
|
1745
1714
|
/**
|
|
1746
1715
|
* Prompt template with placeholders:
|
|
1747
1716
|
* - {{output}}: agent's final output
|
|
@@ -1752,19 +1721,45 @@ var LlmJudgeConfigSchema = import_zod23.z.object({
|
|
|
1752
1721
|
* - {{trace}}: step-by-step trace of tool calls
|
|
1753
1722
|
* - Custom parameters defined in the parameters array
|
|
1754
1723
|
*/
|
|
1755
|
-
prompt:
|
|
1724
|
+
prompt: import_zod22.z.string().min(1),
|
|
1756
1725
|
/** Minimum score to pass (0-10, default 7) */
|
|
1757
|
-
minScore:
|
|
1726
|
+
minScore: import_zod22.z.number().int().min(0).max(10).optional(),
|
|
1758
1727
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
1759
|
-
model:
|
|
1728
|
+
model: import_zod22.z.string().optional(),
|
|
1760
1729
|
/** Max output tokens */
|
|
1761
|
-
maxTokens:
|
|
1730
|
+
maxTokens: import_zod22.z.number().int().optional(),
|
|
1762
1731
|
/** Temperature (0-1) */
|
|
1763
|
-
temperature:
|
|
1732
|
+
temperature: import_zod22.z.number().min(0).max(1).optional(),
|
|
1764
1733
|
/** User-defined parameters for this assertion */
|
|
1765
|
-
parameters:
|
|
1734
|
+
parameters: import_zod22.z.array(AssertionParameterSchema).optional()
|
|
1735
|
+
});
|
|
1736
|
+
var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
|
|
1737
|
+
type: import_zod22.z.literal("skill_was_called")
|
|
1738
|
+
});
|
|
1739
|
+
var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
|
|
1740
|
+
type: import_zod22.z.literal("tool_called_with_param")
|
|
1766
1741
|
});
|
|
1767
|
-
var
|
|
1742
|
+
var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
|
|
1743
|
+
type: import_zod22.z.literal("build_passed")
|
|
1744
|
+
});
|
|
1745
|
+
var CostAssertionSchema = CostConfigSchema.extend({
|
|
1746
|
+
type: import_zod22.z.literal("cost")
|
|
1747
|
+
});
|
|
1748
|
+
var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
|
|
1749
|
+
type: import_zod22.z.literal("llm_judge")
|
|
1750
|
+
});
|
|
1751
|
+
var TimeAssertionSchema = TimeConfigSchema.extend({
|
|
1752
|
+
type: import_zod22.z.literal("time_limit")
|
|
1753
|
+
});
|
|
1754
|
+
var AssertionSchema = import_zod22.z.union([
|
|
1755
|
+
SkillWasCalledAssertionSchema,
|
|
1756
|
+
ToolCalledWithParamAssertionSchema,
|
|
1757
|
+
BuildPassedAssertionSchema,
|
|
1758
|
+
TimeAssertionSchema,
|
|
1759
|
+
CostAssertionSchema,
|
|
1760
|
+
LlmJudgeAssertionSchema
|
|
1761
|
+
]);
|
|
1762
|
+
var AssertionConfigSchema = import_zod22.z.union([
|
|
1768
1763
|
LlmJudgeConfigSchema,
|
|
1769
1764
|
// requires prompt - check first
|
|
1770
1765
|
SkillWasCalledConfigSchema,
|
|
@@ -1777,7 +1772,7 @@ var AssertionConfigSchema = import_zod23.z.union([
|
|
|
1777
1772
|
// requires maxCostUsd, uses strictObject
|
|
1778
1773
|
BuildPassedConfigSchema,
|
|
1779
1774
|
// all optional, uses strictObject to reject unknown keys
|
|
1780
|
-
|
|
1775
|
+
import_zod22.z.object({})
|
|
1781
1776
|
// fallback empty config
|
|
1782
1777
|
]);
|
|
1783
1778
|
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
@@ -1828,25 +1823,25 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1828
1823
|
}
|
|
1829
1824
|
|
|
1830
1825
|
// src/scenario/test-scenario.ts
|
|
1831
|
-
var ExpectedFileSchema =
|
|
1826
|
+
var ExpectedFileSchema = import_zod23.z.object({
|
|
1832
1827
|
/** Relative path where the file should be created */
|
|
1833
|
-
path:
|
|
1828
|
+
path: import_zod23.z.string(),
|
|
1834
1829
|
/** Optional expected content */
|
|
1835
|
-
content:
|
|
1830
|
+
content: import_zod23.z.string().optional()
|
|
1836
1831
|
});
|
|
1837
1832
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
1838
1833
|
/** The prompt sent to the agent to trigger the task */
|
|
1839
|
-
triggerPrompt:
|
|
1834
|
+
triggerPrompt: import_zod23.z.string().min(10),
|
|
1840
1835
|
/** ID of the template to use for this scenario (null = no template) */
|
|
1841
|
-
templateId:
|
|
1836
|
+
templateId: import_zod23.z.string().nullish(),
|
|
1842
1837
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
1843
|
-
assertions:
|
|
1838
|
+
assertions: import_zod23.z.array(AssertionSchema).optional(),
|
|
1844
1839
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
1845
|
-
assertionIds:
|
|
1840
|
+
assertionIds: import_zod23.z.array(import_zod23.z.string()).optional(),
|
|
1846
1841
|
/** Linked assertions with per-scenario parameter values */
|
|
1847
|
-
assertionLinks:
|
|
1842
|
+
assertionLinks: import_zod23.z.array(ScenarioAssertionLinkSchema).optional(),
|
|
1848
1843
|
/** Tags for categorisation and filtering */
|
|
1849
|
-
tags:
|
|
1844
|
+
tags: import_zod23.z.array(import_zod23.z.string()).optional()
|
|
1850
1845
|
});
|
|
1851
1846
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
1852
1847
|
id: true,
|
|
@@ -1857,10 +1852,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
1857
1852
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
1858
1853
|
|
|
1859
1854
|
// src/suite/test-suite.ts
|
|
1860
|
-
var
|
|
1855
|
+
var import_zod24 = require("zod");
|
|
1861
1856
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
1862
1857
|
/** IDs of test scenarios in this suite */
|
|
1863
|
-
scenarioIds:
|
|
1858
|
+
scenarioIds: import_zod24.z.array(import_zod24.z.string())
|
|
1864
1859
|
});
|
|
1865
1860
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
1866
1861
|
id: true,
|
|
@@ -1871,21 +1866,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
1871
1866
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
1872
1867
|
|
|
1873
1868
|
// src/evaluation/metrics.ts
|
|
1874
|
-
var
|
|
1875
|
-
var TokenUsageSchema =
|
|
1876
|
-
prompt:
|
|
1877
|
-
completion:
|
|
1878
|
-
total:
|
|
1879
|
-
});
|
|
1880
|
-
var EvalMetricsSchema =
|
|
1881
|
-
totalAssertions:
|
|
1882
|
-
passed:
|
|
1883
|
-
failed:
|
|
1884
|
-
skipped:
|
|
1885
|
-
errors:
|
|
1886
|
-
passRate:
|
|
1887
|
-
avgDuration:
|
|
1888
|
-
totalDuration:
|
|
1869
|
+
var import_zod25 = require("zod");
|
|
1870
|
+
var TokenUsageSchema = import_zod25.z.object({
|
|
1871
|
+
prompt: import_zod25.z.number(),
|
|
1872
|
+
completion: import_zod25.z.number(),
|
|
1873
|
+
total: import_zod25.z.number()
|
|
1874
|
+
});
|
|
1875
|
+
var EvalMetricsSchema = import_zod25.z.object({
|
|
1876
|
+
totalAssertions: import_zod25.z.number(),
|
|
1877
|
+
passed: import_zod25.z.number(),
|
|
1878
|
+
failed: import_zod25.z.number(),
|
|
1879
|
+
skipped: import_zod25.z.number(),
|
|
1880
|
+
errors: import_zod25.z.number(),
|
|
1881
|
+
passRate: import_zod25.z.number(),
|
|
1882
|
+
avgDuration: import_zod25.z.number(),
|
|
1883
|
+
totalDuration: import_zod25.z.number()
|
|
1889
1884
|
});
|
|
1890
1885
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
1891
1886
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -1895,7 +1890,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
1895
1890
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
1896
1891
|
return EvalStatus2;
|
|
1897
1892
|
})(EvalStatus || {});
|
|
1898
|
-
var EvalStatusSchema =
|
|
1893
|
+
var EvalStatusSchema = import_zod25.z.enum(EvalStatus);
|
|
1899
1894
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
1900
1895
|
LLMStepType2["COMPLETION"] = "completion";
|
|
1901
1896
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -1903,52 +1898,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
1903
1898
|
LLMStepType2["THINKING"] = "thinking";
|
|
1904
1899
|
return LLMStepType2;
|
|
1905
1900
|
})(LLMStepType || {});
|
|
1906
|
-
var LLMTraceStepSchema =
|
|
1907
|
-
id:
|
|
1908
|
-
stepNumber:
|
|
1909
|
-
type:
|
|
1910
|
-
model:
|
|
1911
|
-
provider:
|
|
1912
|
-
startedAt:
|
|
1913
|
-
durationMs:
|
|
1901
|
+
var LLMTraceStepSchema = import_zod25.z.object({
|
|
1902
|
+
id: import_zod25.z.string(),
|
|
1903
|
+
stepNumber: import_zod25.z.number(),
|
|
1904
|
+
type: import_zod25.z.enum(LLMStepType),
|
|
1905
|
+
model: import_zod25.z.string(),
|
|
1906
|
+
provider: import_zod25.z.string(),
|
|
1907
|
+
startedAt: import_zod25.z.string(),
|
|
1908
|
+
durationMs: import_zod25.z.number(),
|
|
1914
1909
|
tokenUsage: TokenUsageSchema,
|
|
1915
|
-
costUsd:
|
|
1916
|
-
toolName:
|
|
1917
|
-
toolArguments:
|
|
1918
|
-
inputPreview:
|
|
1919
|
-
outputPreview:
|
|
1920
|
-
success:
|
|
1921
|
-
error:
|
|
1922
|
-
});
|
|
1923
|
-
var LLMBreakdownStatsSchema =
|
|
1924
|
-
count:
|
|
1925
|
-
durationMs:
|
|
1926
|
-
tokens:
|
|
1927
|
-
costUsd:
|
|
1928
|
-
});
|
|
1929
|
-
var LLMTraceSummarySchema =
|
|
1930
|
-
totalSteps:
|
|
1931
|
-
totalDurationMs:
|
|
1910
|
+
costUsd: import_zod25.z.number(),
|
|
1911
|
+
toolName: import_zod25.z.string().optional(),
|
|
1912
|
+
toolArguments: import_zod25.z.string().optional(),
|
|
1913
|
+
inputPreview: import_zod25.z.string().optional(),
|
|
1914
|
+
outputPreview: import_zod25.z.string().optional(),
|
|
1915
|
+
success: import_zod25.z.boolean(),
|
|
1916
|
+
error: import_zod25.z.string().optional()
|
|
1917
|
+
});
|
|
1918
|
+
var LLMBreakdownStatsSchema = import_zod25.z.object({
|
|
1919
|
+
count: import_zod25.z.number(),
|
|
1920
|
+
durationMs: import_zod25.z.number(),
|
|
1921
|
+
tokens: import_zod25.z.number(),
|
|
1922
|
+
costUsd: import_zod25.z.number()
|
|
1923
|
+
});
|
|
1924
|
+
var LLMTraceSummarySchema = import_zod25.z.object({
|
|
1925
|
+
totalSteps: import_zod25.z.number(),
|
|
1926
|
+
totalDurationMs: import_zod25.z.number(),
|
|
1932
1927
|
totalTokens: TokenUsageSchema,
|
|
1933
|
-
totalCostUsd:
|
|
1934
|
-
stepTypeBreakdown:
|
|
1935
|
-
modelBreakdown:
|
|
1936
|
-
modelsUsed:
|
|
1937
|
-
});
|
|
1938
|
-
var LLMTraceSchema =
|
|
1939
|
-
id:
|
|
1940
|
-
steps:
|
|
1928
|
+
totalCostUsd: import_zod25.z.number(),
|
|
1929
|
+
stepTypeBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
1930
|
+
modelBreakdown: import_zod25.z.record(import_zod25.z.string(), LLMBreakdownStatsSchema),
|
|
1931
|
+
modelsUsed: import_zod25.z.array(import_zod25.z.string())
|
|
1932
|
+
});
|
|
1933
|
+
var LLMTraceSchema = import_zod25.z.object({
|
|
1934
|
+
id: import_zod25.z.string(),
|
|
1935
|
+
steps: import_zod25.z.array(LLMTraceStepSchema),
|
|
1941
1936
|
summary: LLMTraceSummarySchema
|
|
1942
1937
|
});
|
|
1943
1938
|
|
|
1944
1939
|
// src/evaluation/eval-result.ts
|
|
1945
|
-
var
|
|
1940
|
+
var import_zod29 = require("zod");
|
|
1946
1941
|
|
|
1947
1942
|
// src/evaluation/eval-run.ts
|
|
1948
|
-
var
|
|
1943
|
+
var import_zod27 = require("zod");
|
|
1949
1944
|
|
|
1950
1945
|
// src/evaluation/live-trace.ts
|
|
1951
|
-
var
|
|
1946
|
+
var import_zod26 = require("zod");
|
|
1952
1947
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
1953
1948
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
1954
1949
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -1962,37 +1957,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
1962
1957
|
LiveTraceEventType2["USER"] = "user";
|
|
1963
1958
|
return LiveTraceEventType2;
|
|
1964
1959
|
})(LiveTraceEventType || {});
|
|
1965
|
-
var LiveTraceEventSchema =
|
|
1960
|
+
var LiveTraceEventSchema = import_zod26.z.object({
|
|
1966
1961
|
/** The evaluation run ID */
|
|
1967
|
-
evalRunId:
|
|
1962
|
+
evalRunId: import_zod26.z.string(),
|
|
1968
1963
|
/** The scenario ID being executed */
|
|
1969
|
-
scenarioId:
|
|
1964
|
+
scenarioId: import_zod26.z.string(),
|
|
1970
1965
|
/** The scenario name for display */
|
|
1971
|
-
scenarioName:
|
|
1966
|
+
scenarioName: import_zod26.z.string(),
|
|
1972
1967
|
/** The target ID (skill, agent, etc.) */
|
|
1973
|
-
targetId:
|
|
1968
|
+
targetId: import_zod26.z.string(),
|
|
1974
1969
|
/** The target name for display */
|
|
1975
|
-
targetName:
|
|
1970
|
+
targetName: import_zod26.z.string(),
|
|
1976
1971
|
/** Step number in the current scenario execution */
|
|
1977
|
-
stepNumber:
|
|
1972
|
+
stepNumber: import_zod26.z.number(),
|
|
1978
1973
|
/** Type of trace event */
|
|
1979
|
-
type:
|
|
1974
|
+
type: import_zod26.z.enum(LiveTraceEventType),
|
|
1980
1975
|
/** Tool name if this is a tool_use event */
|
|
1981
|
-
toolName:
|
|
1976
|
+
toolName: import_zod26.z.string().optional(),
|
|
1982
1977
|
/** Tool arguments preview (truncated JSON) */
|
|
1983
|
-
toolArgs:
|
|
1978
|
+
toolArgs: import_zod26.z.string().optional(),
|
|
1984
1979
|
/** Output preview (truncated text) */
|
|
1985
|
-
outputPreview:
|
|
1980
|
+
outputPreview: import_zod26.z.string().optional(),
|
|
1986
1981
|
/** File path for file operations */
|
|
1987
|
-
filePath:
|
|
1982
|
+
filePath: import_zod26.z.string().optional(),
|
|
1988
1983
|
/** Elapsed time in milliseconds for progress events */
|
|
1989
|
-
elapsedMs:
|
|
1984
|
+
elapsedMs: import_zod26.z.number().optional(),
|
|
1990
1985
|
/** Thinking/reasoning text from Claude */
|
|
1991
|
-
thinking:
|
|
1986
|
+
thinking: import_zod26.z.string().optional(),
|
|
1992
1987
|
/** Timestamp when this event occurred */
|
|
1993
|
-
timestamp:
|
|
1988
|
+
timestamp: import_zod26.z.string(),
|
|
1994
1989
|
/** Whether this is the final event for this scenario */
|
|
1995
|
-
isComplete:
|
|
1990
|
+
isComplete: import_zod26.z.boolean()
|
|
1996
1991
|
});
|
|
1997
1992
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
1998
1993
|
function parseTraceEventLine(line) {
|
|
@@ -2020,14 +2015,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
2020
2015
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
2021
2016
|
return TriggerType2;
|
|
2022
2017
|
})(TriggerType || {});
|
|
2023
|
-
var TriggerMetadataSchema =
|
|
2024
|
-
version:
|
|
2025
|
-
resourceUpdated:
|
|
2018
|
+
var TriggerMetadataSchema = import_zod27.z.object({
|
|
2019
|
+
version: import_zod27.z.string().optional(),
|
|
2020
|
+
resourceUpdated: import_zod27.z.array(import_zod27.z.string()).optional()
|
|
2026
2021
|
});
|
|
2027
|
-
var TriggerSchema =
|
|
2028
|
-
id:
|
|
2022
|
+
var TriggerSchema = import_zod27.z.object({
|
|
2023
|
+
id: import_zod27.z.string(),
|
|
2029
2024
|
metadata: TriggerMetadataSchema.optional(),
|
|
2030
|
-
type:
|
|
2025
|
+
type: import_zod27.z.enum(TriggerType)
|
|
2031
2026
|
});
|
|
2032
2027
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
2033
2028
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -2045,28 +2040,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
2045
2040
|
FailureSeverity2["LOW"] = "low";
|
|
2046
2041
|
return FailureSeverity2;
|
|
2047
2042
|
})(FailureSeverity || {});
|
|
2048
|
-
var DiffLineTypeSchema =
|
|
2049
|
-
var DiffLineSchema =
|
|
2043
|
+
var DiffLineTypeSchema = import_zod27.z.enum(["added", "removed", "unchanged"]);
|
|
2044
|
+
var DiffLineSchema = import_zod27.z.object({
|
|
2050
2045
|
type: DiffLineTypeSchema,
|
|
2051
|
-
content:
|
|
2052
|
-
lineNumber:
|
|
2053
|
-
});
|
|
2054
|
-
var DiffContentSchema =
|
|
2055
|
-
path:
|
|
2056
|
-
expected:
|
|
2057
|
-
actual:
|
|
2058
|
-
diffLines:
|
|
2059
|
-
renamedFrom:
|
|
2060
|
-
});
|
|
2061
|
-
var CommandExecutionSchema =
|
|
2062
|
-
command:
|
|
2063
|
-
exitCode:
|
|
2064
|
-
output:
|
|
2065
|
-
duration:
|
|
2066
|
-
});
|
|
2067
|
-
var FileModificationSchema =
|
|
2068
|
-
path:
|
|
2069
|
-
action:
|
|
2046
|
+
content: import_zod27.z.string(),
|
|
2047
|
+
lineNumber: import_zod27.z.number()
|
|
2048
|
+
});
|
|
2049
|
+
var DiffContentSchema = import_zod27.z.object({
|
|
2050
|
+
path: import_zod27.z.string(),
|
|
2051
|
+
expected: import_zod27.z.string(),
|
|
2052
|
+
actual: import_zod27.z.string(),
|
|
2053
|
+
diffLines: import_zod27.z.array(DiffLineSchema),
|
|
2054
|
+
renamedFrom: import_zod27.z.string().optional()
|
|
2055
|
+
});
|
|
2056
|
+
var CommandExecutionSchema = import_zod27.z.object({
|
|
2057
|
+
command: import_zod27.z.string(),
|
|
2058
|
+
exitCode: import_zod27.z.number(),
|
|
2059
|
+
output: import_zod27.z.string().optional(),
|
|
2060
|
+
duration: import_zod27.z.number()
|
|
2061
|
+
});
|
|
2062
|
+
var FileModificationSchema = import_zod27.z.object({
|
|
2063
|
+
path: import_zod27.z.string(),
|
|
2064
|
+
action: import_zod27.z.enum(["created", "modified", "deleted"])
|
|
2070
2065
|
});
|
|
2071
2066
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
2072
2067
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -2074,87 +2069,87 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
2074
2069
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
2075
2070
|
return TemplateFileStatus2;
|
|
2076
2071
|
})(TemplateFileStatus || {});
|
|
2077
|
-
var TemplateFileSchema =
|
|
2072
|
+
var TemplateFileSchema = import_zod27.z.object({
|
|
2078
2073
|
/** Relative path within the template */
|
|
2079
|
-
path:
|
|
2074
|
+
path: import_zod27.z.string(),
|
|
2080
2075
|
/** Full file content after execution */
|
|
2081
|
-
content:
|
|
2076
|
+
content: import_zod27.z.string(),
|
|
2082
2077
|
/** File status (new, modified, unchanged) */
|
|
2083
|
-
status:
|
|
2084
|
-
});
|
|
2085
|
-
var ApiCallSchema =
|
|
2086
|
-
endpoint:
|
|
2087
|
-
tokensUsed:
|
|
2088
|
-
duration:
|
|
2089
|
-
});
|
|
2090
|
-
var ExecutionTraceSchema =
|
|
2091
|
-
commands:
|
|
2092
|
-
filesModified:
|
|
2093
|
-
apiCalls:
|
|
2094
|
-
totalDuration:
|
|
2095
|
-
});
|
|
2096
|
-
var FailureAnalysisSchema =
|
|
2097
|
-
category:
|
|
2098
|
-
severity:
|
|
2099
|
-
summary:
|
|
2100
|
-
details:
|
|
2101
|
-
rootCause:
|
|
2102
|
-
suggestedFix:
|
|
2103
|
-
relatedAssertions:
|
|
2104
|
-
codeSnippet:
|
|
2105
|
-
similarIssues:
|
|
2106
|
-
patternId:
|
|
2078
|
+
status: import_zod27.z.enum(["new", "modified", "unchanged"])
|
|
2079
|
+
});
|
|
2080
|
+
var ApiCallSchema = import_zod27.z.object({
|
|
2081
|
+
endpoint: import_zod27.z.string(),
|
|
2082
|
+
tokensUsed: import_zod27.z.number(),
|
|
2083
|
+
duration: import_zod27.z.number()
|
|
2084
|
+
});
|
|
2085
|
+
var ExecutionTraceSchema = import_zod27.z.object({
|
|
2086
|
+
commands: import_zod27.z.array(CommandExecutionSchema),
|
|
2087
|
+
filesModified: import_zod27.z.array(FileModificationSchema),
|
|
2088
|
+
apiCalls: import_zod27.z.array(ApiCallSchema),
|
|
2089
|
+
totalDuration: import_zod27.z.number()
|
|
2090
|
+
});
|
|
2091
|
+
var FailureAnalysisSchema = import_zod27.z.object({
|
|
2092
|
+
category: import_zod27.z.enum(FailureCategory),
|
|
2093
|
+
severity: import_zod27.z.enum(FailureSeverity),
|
|
2094
|
+
summary: import_zod27.z.string(),
|
|
2095
|
+
details: import_zod27.z.string(),
|
|
2096
|
+
rootCause: import_zod27.z.string(),
|
|
2097
|
+
suggestedFix: import_zod27.z.string(),
|
|
2098
|
+
relatedAssertions: import_zod27.z.array(import_zod27.z.string()),
|
|
2099
|
+
codeSnippet: import_zod27.z.string().optional(),
|
|
2100
|
+
similarIssues: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2101
|
+
patternId: import_zod27.z.string().optional(),
|
|
2107
2102
|
// Extended fields for detailed debugging
|
|
2108
2103
|
diff: DiffContentSchema.optional(),
|
|
2109
2104
|
executionTrace: ExecutionTraceSchema.optional()
|
|
2110
2105
|
});
|
|
2111
2106
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
2112
2107
|
/** Agent ID for this run */
|
|
2113
|
-
agentId:
|
|
2108
|
+
agentId: import_zod27.z.string().optional(),
|
|
2114
2109
|
/** Preset ID that originated this run (optional) */
|
|
2115
|
-
presetId:
|
|
2110
|
+
presetId: import_zod27.z.string().optional(),
|
|
2116
2111
|
/** Skill IDs for this run */
|
|
2117
|
-
skillIds:
|
|
2112
|
+
skillIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2118
2113
|
/** Map of skillId to skillVersionId for this run */
|
|
2119
|
-
skillVersions:
|
|
2114
|
+
skillVersions: import_zod27.z.record(import_zod27.z.string(), import_zod27.z.string()).optional(),
|
|
2120
2115
|
/** Scenario IDs to run (always present — resolved server-side from tags when needed) */
|
|
2121
|
-
scenarioIds:
|
|
2116
|
+
scenarioIds: import_zod27.z.array(import_zod27.z.string()),
|
|
2122
2117
|
/** Current status */
|
|
2123
2118
|
status: EvalStatusSchema,
|
|
2124
2119
|
/** Progress percentage (0-100) */
|
|
2125
|
-
progress:
|
|
2120
|
+
progress: import_zod27.z.number(),
|
|
2126
2121
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
2127
|
-
results:
|
|
2122
|
+
results: import_zod27.z.array(import_zod27.z.lazy(() => EvalRunResultSchema)),
|
|
2128
2123
|
/** Aggregated metrics across all results */
|
|
2129
2124
|
aggregateMetrics: EvalMetricsSchema,
|
|
2130
2125
|
/** Failure analyses */
|
|
2131
|
-
failureAnalyses:
|
|
2126
|
+
failureAnalyses: import_zod27.z.array(FailureAnalysisSchema).optional(),
|
|
2132
2127
|
/** Aggregated LLM trace summary */
|
|
2133
2128
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
2134
2129
|
/** What triggered this run */
|
|
2135
2130
|
trigger: TriggerSchema.optional(),
|
|
2136
2131
|
/** When the run started (set when evaluation is triggered) */
|
|
2137
|
-
startedAt:
|
|
2132
|
+
startedAt: import_zod27.z.string().optional(),
|
|
2138
2133
|
/** When the run completed */
|
|
2139
|
-
completedAt:
|
|
2134
|
+
completedAt: import_zod27.z.string().optional(),
|
|
2140
2135
|
/** Live trace events captured during execution (for playback on results page) */
|
|
2141
|
-
liveTraceEvents:
|
|
2136
|
+
liveTraceEvents: import_zod27.z.array(LiveTraceEventSchema).optional(),
|
|
2142
2137
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
2143
|
-
jobId:
|
|
2138
|
+
jobId: import_zod27.z.string().optional(),
|
|
2144
2139
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
2145
|
-
jobStatus:
|
|
2140
|
+
jobStatus: import_zod27.z.string().optional(),
|
|
2146
2141
|
/** Remote job error message if the job failed */
|
|
2147
|
-
jobError:
|
|
2142
|
+
jobError: import_zod27.z.string().optional(),
|
|
2148
2143
|
/** Timestamp of the last job status check */
|
|
2149
|
-
jobStatusCheckedAt:
|
|
2144
|
+
jobStatusCheckedAt: import_zod27.z.string().optional(),
|
|
2150
2145
|
/** MCP server IDs to enable for this run (optional) */
|
|
2151
|
-
mcpIds:
|
|
2146
|
+
mcpIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2152
2147
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
2153
|
-
subAgentIds:
|
|
2148
|
+
subAgentIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2154
2149
|
/** Rule IDs to enable for this run (optional) */
|
|
2155
|
-
ruleIds:
|
|
2150
|
+
ruleIds: import_zod27.z.array(import_zod27.z.string()).optional(),
|
|
2156
2151
|
/** Tags used to select scenarios for this run (for traceability) */
|
|
2157
|
-
tags:
|
|
2152
|
+
tags: import_zod27.z.array(import_zod27.z.string()).optional()
|
|
2158
2153
|
});
|
|
2159
2154
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
2160
2155
|
id: true,
|
|
@@ -2169,60 +2164,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
2169
2164
|
scenarioIds: true
|
|
2170
2165
|
}).extend({
|
|
2171
2166
|
/** Optional on input — backend resolves from tags when not provided */
|
|
2172
|
-
scenarioIds:
|
|
2167
|
+
scenarioIds: import_zod27.z.array(import_zod27.z.string()).optional()
|
|
2173
2168
|
}).refine(
|
|
2174
2169
|
(data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
|
|
2175
2170
|
{ message: "Either scenarioIds or tags must be provided" }
|
|
2176
2171
|
);
|
|
2177
|
-
var EvaluationProgressSchema =
|
|
2178
|
-
runId:
|
|
2179
|
-
targetId:
|
|
2180
|
-
totalScenarios:
|
|
2181
|
-
completedScenarios:
|
|
2182
|
-
scenarioProgress:
|
|
2183
|
-
|
|
2184
|
-
scenarioId:
|
|
2185
|
-
currentStep:
|
|
2186
|
-
error:
|
|
2172
|
+
var EvaluationProgressSchema = import_zod27.z.object({
|
|
2173
|
+
runId: import_zod27.z.string(),
|
|
2174
|
+
targetId: import_zod27.z.string(),
|
|
2175
|
+
totalScenarios: import_zod27.z.number(),
|
|
2176
|
+
completedScenarios: import_zod27.z.number(),
|
|
2177
|
+
scenarioProgress: import_zod27.z.array(
|
|
2178
|
+
import_zod27.z.object({
|
|
2179
|
+
scenarioId: import_zod27.z.string(),
|
|
2180
|
+
currentStep: import_zod27.z.string(),
|
|
2181
|
+
error: import_zod27.z.string().optional()
|
|
2187
2182
|
})
|
|
2188
2183
|
),
|
|
2189
|
-
createdAt:
|
|
2190
|
-
});
|
|
2191
|
-
var EvaluationLogSchema =
|
|
2192
|
-
runId:
|
|
2193
|
-
scenarioId:
|
|
2194
|
-
log:
|
|
2195
|
-
level:
|
|
2196
|
-
message:
|
|
2197
|
-
args:
|
|
2198
|
-
error:
|
|
2184
|
+
createdAt: import_zod27.z.number()
|
|
2185
|
+
});
|
|
2186
|
+
var EvaluationLogSchema = import_zod27.z.object({
|
|
2187
|
+
runId: import_zod27.z.string(),
|
|
2188
|
+
scenarioId: import_zod27.z.string(),
|
|
2189
|
+
log: import_zod27.z.object({
|
|
2190
|
+
level: import_zod27.z.enum(["info", "error", "debug"]),
|
|
2191
|
+
message: import_zod27.z.string().optional(),
|
|
2192
|
+
args: import_zod27.z.array(import_zod27.z.any()).optional(),
|
|
2193
|
+
error: import_zod27.z.string().optional()
|
|
2199
2194
|
})
|
|
2200
2195
|
});
|
|
2201
2196
|
var LLM_TIMEOUT = 12e4;
|
|
2202
2197
|
|
|
2203
2198
|
// src/evaluation/conversation.ts
|
|
2204
|
-
var
|
|
2205
|
-
var TextBlockSchema =
|
|
2206
|
-
type:
|
|
2207
|
-
text:
|
|
2208
|
-
});
|
|
2209
|
-
var ThinkingBlockSchema =
|
|
2210
|
-
type:
|
|
2211
|
-
thinking:
|
|
2212
|
-
});
|
|
2213
|
-
var ToolUseBlockSchema =
|
|
2214
|
-
type:
|
|
2215
|
-
toolName:
|
|
2216
|
-
toolId:
|
|
2217
|
-
input:
|
|
2218
|
-
});
|
|
2219
|
-
var ToolResultBlockSchema =
|
|
2220
|
-
type:
|
|
2221
|
-
toolUseId:
|
|
2222
|
-
content:
|
|
2223
|
-
isError:
|
|
2224
|
-
});
|
|
2225
|
-
var ConversationBlockSchema =
|
|
2199
|
+
var import_zod28 = require("zod");
|
|
2200
|
+
var TextBlockSchema = import_zod28.z.object({
|
|
2201
|
+
type: import_zod28.z.literal("text"),
|
|
2202
|
+
text: import_zod28.z.string()
|
|
2203
|
+
});
|
|
2204
|
+
var ThinkingBlockSchema = import_zod28.z.object({
|
|
2205
|
+
type: import_zod28.z.literal("thinking"),
|
|
2206
|
+
thinking: import_zod28.z.string()
|
|
2207
|
+
});
|
|
2208
|
+
var ToolUseBlockSchema = import_zod28.z.object({
|
|
2209
|
+
type: import_zod28.z.literal("tool_use"),
|
|
2210
|
+
toolName: import_zod28.z.string(),
|
|
2211
|
+
toolId: import_zod28.z.string(),
|
|
2212
|
+
input: import_zod28.z.unknown()
|
|
2213
|
+
});
|
|
2214
|
+
var ToolResultBlockSchema = import_zod28.z.object({
|
|
2215
|
+
type: import_zod28.z.literal("tool_result"),
|
|
2216
|
+
toolUseId: import_zod28.z.string(),
|
|
2217
|
+
content: import_zod28.z.string(),
|
|
2218
|
+
isError: import_zod28.z.boolean().optional()
|
|
2219
|
+
});
|
|
2220
|
+
var ConversationBlockSchema = import_zod28.z.discriminatedUnion("type", [
|
|
2226
2221
|
TextBlockSchema,
|
|
2227
2222
|
ThinkingBlockSchema,
|
|
2228
2223
|
ToolUseBlockSchema,
|
|
@@ -2233,18 +2228,18 @@ var ConversationMessageRoles = [
|
|
|
2233
2228
|
"user",
|
|
2234
2229
|
"system"
|
|
2235
2230
|
];
|
|
2236
|
-
var ConversationMessageSchema =
|
|
2237
|
-
role:
|
|
2238
|
-
content:
|
|
2239
|
-
timestamp:
|
|
2231
|
+
var ConversationMessageSchema = import_zod28.z.object({
|
|
2232
|
+
role: import_zod28.z.enum(ConversationMessageRoles),
|
|
2233
|
+
content: import_zod28.z.array(ConversationBlockSchema),
|
|
2234
|
+
timestamp: import_zod28.z.string()
|
|
2240
2235
|
});
|
|
2241
|
-
var ScenarioConversationSchema =
|
|
2242
|
-
id:
|
|
2243
|
-
projectId:
|
|
2244
|
-
evalRunId:
|
|
2245
|
-
resultId:
|
|
2246
|
-
messages:
|
|
2247
|
-
createdAt:
|
|
2236
|
+
var ScenarioConversationSchema = import_zod28.z.object({
|
|
2237
|
+
id: import_zod28.z.string(),
|
|
2238
|
+
projectId: import_zod28.z.string(),
|
|
2239
|
+
evalRunId: import_zod28.z.string(),
|
|
2240
|
+
resultId: import_zod28.z.string(),
|
|
2241
|
+
messages: import_zod28.z.array(ConversationMessageSchema),
|
|
2242
|
+
createdAt: import_zod28.z.string()
|
|
2248
2243
|
});
|
|
2249
2244
|
|
|
2250
2245
|
// src/evaluation/eval-result.ts
|
|
@@ -2255,100 +2250,100 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
2255
2250
|
AssertionResultStatus2["ERROR"] = "error";
|
|
2256
2251
|
return AssertionResultStatus2;
|
|
2257
2252
|
})(AssertionResultStatus || {});
|
|
2258
|
-
var AssertionResultSchema =
|
|
2259
|
-
id:
|
|
2260
|
-
assertionId:
|
|
2261
|
-
assertionType:
|
|
2262
|
-
assertionName:
|
|
2263
|
-
status:
|
|
2264
|
-
message:
|
|
2265
|
-
expected:
|
|
2266
|
-
actual:
|
|
2267
|
-
duration:
|
|
2268
|
-
details:
|
|
2269
|
-
llmTraceSteps:
|
|
2270
|
-
});
|
|
2271
|
-
var EvalRunResultSchema =
|
|
2272
|
-
id:
|
|
2273
|
-
targetId:
|
|
2274
|
-
targetName:
|
|
2253
|
+
var AssertionResultSchema = import_zod29.z.object({
|
|
2254
|
+
id: import_zod29.z.string(),
|
|
2255
|
+
assertionId: import_zod29.z.string(),
|
|
2256
|
+
assertionType: import_zod29.z.string(),
|
|
2257
|
+
assertionName: import_zod29.z.string(),
|
|
2258
|
+
status: import_zod29.z.enum(AssertionResultStatus),
|
|
2259
|
+
message: import_zod29.z.string().optional(),
|
|
2260
|
+
expected: import_zod29.z.string().optional(),
|
|
2261
|
+
actual: import_zod29.z.string().optional(),
|
|
2262
|
+
duration: import_zod29.z.number().optional(),
|
|
2263
|
+
details: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.unknown()).optional(),
|
|
2264
|
+
llmTraceSteps: import_zod29.z.array(LLMTraceStepSchema).optional()
|
|
2265
|
+
});
|
|
2266
|
+
var EvalRunResultSchema = import_zod29.z.object({
|
|
2267
|
+
id: import_zod29.z.string(),
|
|
2268
|
+
targetId: import_zod29.z.string(),
|
|
2269
|
+
targetName: import_zod29.z.string().optional(),
|
|
2275
2270
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
2276
|
-
skillVersionId:
|
|
2271
|
+
skillVersionId: import_zod29.z.string().optional(),
|
|
2277
2272
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
2278
|
-
skillVersion:
|
|
2279
|
-
scenarioId:
|
|
2280
|
-
scenarioName:
|
|
2273
|
+
skillVersion: import_zod29.z.string().optional(),
|
|
2274
|
+
scenarioId: import_zod29.z.string(),
|
|
2275
|
+
scenarioName: import_zod29.z.string(),
|
|
2281
2276
|
modelConfig: ModelConfigSchema.optional(),
|
|
2282
|
-
assertionResults:
|
|
2277
|
+
assertionResults: import_zod29.z.array(AssertionResultSchema),
|
|
2283
2278
|
metrics: EvalMetricsSchema.optional(),
|
|
2284
|
-
passed:
|
|
2285
|
-
failed:
|
|
2286
|
-
passRate:
|
|
2287
|
-
duration:
|
|
2288
|
-
outputText:
|
|
2289
|
-
files:
|
|
2290
|
-
fileDiffs:
|
|
2279
|
+
passed: import_zod29.z.number(),
|
|
2280
|
+
failed: import_zod29.z.number(),
|
|
2281
|
+
passRate: import_zod29.z.number(),
|
|
2282
|
+
duration: import_zod29.z.number(),
|
|
2283
|
+
outputText: import_zod29.z.string().optional(),
|
|
2284
|
+
files: import_zod29.z.array(ExpectedFileSchema).optional(),
|
|
2285
|
+
fileDiffs: import_zod29.z.array(DiffContentSchema).optional(),
|
|
2291
2286
|
/** Full template files after execution with status indicators */
|
|
2292
|
-
templateFiles:
|
|
2293
|
-
startedAt:
|
|
2294
|
-
completedAt:
|
|
2287
|
+
templateFiles: import_zod29.z.array(TemplateFileSchema).optional(),
|
|
2288
|
+
startedAt: import_zod29.z.string().optional(),
|
|
2289
|
+
completedAt: import_zod29.z.string().optional(),
|
|
2295
2290
|
llmTrace: LLMTraceSchema.optional(),
|
|
2296
2291
|
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
2297
|
-
conversation:
|
|
2298
|
-
});
|
|
2299
|
-
var PromptResultSchema =
|
|
2300
|
-
text:
|
|
2301
|
-
files:
|
|
2302
|
-
finishReason:
|
|
2303
|
-
reasoning:
|
|
2304
|
-
reasoningDetails:
|
|
2305
|
-
toolCalls:
|
|
2306
|
-
toolResults:
|
|
2307
|
-
warnings:
|
|
2308
|
-
sources:
|
|
2309
|
-
steps:
|
|
2310
|
-
generationTimeMs:
|
|
2311
|
-
prompt:
|
|
2312
|
-
systemPrompt:
|
|
2313
|
-
usage:
|
|
2314
|
-
totalTokens:
|
|
2315
|
-
totalMicrocentsSpent:
|
|
2292
|
+
conversation: import_zod29.z.array(ConversationMessageSchema).optional()
|
|
2293
|
+
});
|
|
2294
|
+
var PromptResultSchema = import_zod29.z.object({
|
|
2295
|
+
text: import_zod29.z.string(),
|
|
2296
|
+
files: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2297
|
+
finishReason: import_zod29.z.string().optional(),
|
|
2298
|
+
reasoning: import_zod29.z.string().optional(),
|
|
2299
|
+
reasoningDetails: import_zod29.z.unknown().optional(),
|
|
2300
|
+
toolCalls: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2301
|
+
toolResults: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2302
|
+
warnings: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2303
|
+
sources: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2304
|
+
steps: import_zod29.z.array(import_zod29.z.unknown()),
|
|
2305
|
+
generationTimeMs: import_zod29.z.number(),
|
|
2306
|
+
prompt: import_zod29.z.string(),
|
|
2307
|
+
systemPrompt: import_zod29.z.string(),
|
|
2308
|
+
usage: import_zod29.z.object({
|
|
2309
|
+
totalTokens: import_zod29.z.number().optional(),
|
|
2310
|
+
totalMicrocentsSpent: import_zod29.z.number().optional()
|
|
2316
2311
|
})
|
|
2317
2312
|
});
|
|
2318
|
-
var EvaluationResultSchema =
|
|
2319
|
-
id:
|
|
2320
|
-
runId:
|
|
2321
|
-
timestamp:
|
|
2313
|
+
var EvaluationResultSchema = import_zod29.z.object({
|
|
2314
|
+
id: import_zod29.z.string(),
|
|
2315
|
+
runId: import_zod29.z.string(),
|
|
2316
|
+
timestamp: import_zod29.z.number(),
|
|
2322
2317
|
promptResult: PromptResultSchema,
|
|
2323
|
-
testResults:
|
|
2324
|
-
tags:
|
|
2325
|
-
feedback:
|
|
2326
|
-
score:
|
|
2327
|
-
suiteId:
|
|
2328
|
-
});
|
|
2329
|
-
var LeanEvaluationResultSchema =
|
|
2330
|
-
id:
|
|
2331
|
-
runId:
|
|
2332
|
-
timestamp:
|
|
2333
|
-
tags:
|
|
2334
|
-
scenarioId:
|
|
2335
|
-
scenarioVersion:
|
|
2336
|
-
targetId:
|
|
2337
|
-
targetVersion:
|
|
2338
|
-
suiteId:
|
|
2339
|
-
score:
|
|
2340
|
-
time:
|
|
2341
|
-
microcentsSpent:
|
|
2318
|
+
testResults: import_zod29.z.array(import_zod29.z.unknown()),
|
|
2319
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
2320
|
+
feedback: import_zod29.z.string().optional(),
|
|
2321
|
+
score: import_zod29.z.number(),
|
|
2322
|
+
suiteId: import_zod29.z.string().optional()
|
|
2323
|
+
});
|
|
2324
|
+
var LeanEvaluationResultSchema = import_zod29.z.object({
|
|
2325
|
+
id: import_zod29.z.string(),
|
|
2326
|
+
runId: import_zod29.z.string(),
|
|
2327
|
+
timestamp: import_zod29.z.number(),
|
|
2328
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
2329
|
+
scenarioId: import_zod29.z.string(),
|
|
2330
|
+
scenarioVersion: import_zod29.z.number().optional(),
|
|
2331
|
+
targetId: import_zod29.z.string(),
|
|
2332
|
+
targetVersion: import_zod29.z.number().optional(),
|
|
2333
|
+
suiteId: import_zod29.z.string().optional(),
|
|
2334
|
+
score: import_zod29.z.number(),
|
|
2335
|
+
time: import_zod29.z.number().optional(),
|
|
2336
|
+
microcentsSpent: import_zod29.z.number().optional()
|
|
2342
2337
|
});
|
|
2343
2338
|
|
|
2344
2339
|
// src/project/project.ts
|
|
2345
|
-
var
|
|
2340
|
+
var import_zod30 = require("zod");
|
|
2346
2341
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
2347
|
-
appId:
|
|
2348
|
-
appSecret:
|
|
2349
|
-
useWixAuth:
|
|
2350
|
-
useBase44Auth:
|
|
2351
|
-
scenarioTags:
|
|
2342
|
+
appId: import_zod30.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
2343
|
+
appSecret: import_zod30.z.string().optional().describe("The secret of the app in Dev Center"),
|
|
2344
|
+
useWixAuth: import_zod30.z.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
|
|
2345
|
+
useBase44Auth: import_zod30.z.boolean().optional().describe("Enable Base44 auth for evaluations"),
|
|
2346
|
+
scenarioTags: import_zod30.z.array(import_zod30.z.string()).optional().describe("Project-level tag vocabulary for scenarios")
|
|
2352
2347
|
});
|
|
2353
2348
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
2354
2349
|
id: true,
|
|
@@ -2401,7 +2396,7 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2401
2396
|
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
2402
2397
|
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
2403
2398
|
name: "Tool Called With Param",
|
|
2404
|
-
description: "Check that a tool was called with expected parameters",
|
|
2399
|
+
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
2405
2400
|
type: "tool_called_with_param",
|
|
2406
2401
|
parameters: [
|
|
2407
2402
|
{
|
|
@@ -2415,6 +2410,14 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2415
2410
|
label: "Expected Parameters (JSON, substring match)",
|
|
2416
2411
|
type: "string",
|
|
2417
2412
|
required: true
|
|
2413
|
+
},
|
|
2414
|
+
{
|
|
2415
|
+
name: "requireSuccess",
|
|
2416
|
+
label: "Require Successful Call",
|
|
2417
|
+
type: "boolean",
|
|
2418
|
+
required: false,
|
|
2419
|
+
defaultValue: false,
|
|
2420
|
+
advanced: true
|
|
2418
2421
|
}
|
|
2419
2422
|
]
|
|
2420
2423
|
},
|
|
@@ -2547,6 +2550,9 @@ function getSystemAssertion(id) {
|
|
|
2547
2550
|
BuildCheckTestSchema,
|
|
2548
2551
|
BuildPassedAssertionSchema,
|
|
2549
2552
|
BuildPassedConfigSchema,
|
|
2553
|
+
BulkImportResultItemSchema,
|
|
2554
|
+
BulkImportResultSchema,
|
|
2555
|
+
BulkImportSkillsInputSchema,
|
|
2550
2556
|
ClaudeModel,
|
|
2551
2557
|
ClaudeModelSchema,
|
|
2552
2558
|
CommandExecutionSchema,
|