@wix/evalforge-types 0.52.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +397 -391
- package/build/index.js.map +4 -4
- package/build/index.mjs +394 -391
- package/build/index.mjs.map +4 -4
- package/build/types/assertion/assertion.d.ts +106 -4
- package/build/types/scenario/index.d.ts +0 -1
- package/build/types/scenario/test-scenario.d.ts +69 -30
- package/build/types/target/skill.d.ts +46 -0
- package/package.json +2 -2
- package/build/types/scenario/assertions.d.ts +0 -98
package/build/index.mjs
CHANGED
|
@@ -1168,6 +1168,21 @@ var UpdateSkillInputSchema = SkillInputBaseSchema.partial().refine(
|
|
|
1168
1168
|
var SkillWithLatestVersionSchema = SkillSchema.extend({
|
|
1169
1169
|
latestVersion: SkillVersionSchema.optional()
|
|
1170
1170
|
});
|
|
1171
|
+
var BulkImportSkillsInputSchema = z7.object({
|
|
1172
|
+
source: GitHubSourceSchema
|
|
1173
|
+
});
|
|
1174
|
+
var BulkImportResultItemSchema = z7.object({
|
|
1175
|
+
name: z7.string(),
|
|
1176
|
+
status: z7.enum(["created", "skipped", "failed"]),
|
|
1177
|
+
skillId: z7.string().optional(),
|
|
1178
|
+
reason: z7.string().optional()
|
|
1179
|
+
});
|
|
1180
|
+
var BulkImportResultSchema = z7.object({
|
|
1181
|
+
created: z7.number(),
|
|
1182
|
+
skipped: z7.number(),
|
|
1183
|
+
failed: z7.number(),
|
|
1184
|
+
items: z7.array(BulkImportResultItemSchema)
|
|
1185
|
+
});
|
|
1171
1186
|
|
|
1172
1187
|
// src/target/sub-agent.ts
|
|
1173
1188
|
import { z as z8 } from "zod";
|
|
@@ -1400,85 +1415,34 @@ var TestSchema = z20.discriminatedUnion("type", [
|
|
|
1400
1415
|
PlaywrightNLTestSchema
|
|
1401
1416
|
]);
|
|
1402
1417
|
|
|
1403
|
-
// src/scenario/assertions.ts
|
|
1404
|
-
import { z as z21 } from "zod";
|
|
1405
|
-
var SkillWasCalledAssertionSchema = z21.object({
|
|
1406
|
-
type: z21.literal("skill_was_called"),
|
|
1407
|
-
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
1408
|
-
skillNames: z21.array(z21.string().min(1)).min(1)
|
|
1409
|
-
});
|
|
1410
|
-
var ToolCalledWithParamAssertionSchema = z21.object({
|
|
1411
|
-
type: z21.literal("tool_called_with_param"),
|
|
1412
|
-
/** Name of the tool that must have been called */
|
|
1413
|
-
toolName: z21.string().min(1),
|
|
1414
|
-
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1415
|
-
expectedParams: z21.string().min(1)
|
|
1416
|
-
});
|
|
1417
|
-
var BuildPassedAssertionSchema = z21.object({
|
|
1418
|
-
type: z21.literal("build_passed"),
|
|
1419
|
-
/** Command to run (default: "yarn build") */
|
|
1420
|
-
command: z21.string().optional(),
|
|
1421
|
-
/** Expected exit code (default: 0) */
|
|
1422
|
-
expectedExitCode: z21.number().int().optional()
|
|
1423
|
-
});
|
|
1424
|
-
var CostAssertionSchema = z21.object({
|
|
1425
|
-
type: z21.literal("cost"),
|
|
1426
|
-
/** Maximum allowed cost in USD */
|
|
1427
|
-
maxCostUsd: z21.number().positive()
|
|
1428
|
-
});
|
|
1429
|
-
var LlmJudgeAssertionSchema = z21.object({
|
|
1430
|
-
type: z21.literal("llm_judge"),
|
|
1431
|
-
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1432
|
-
prompt: z21.string(),
|
|
1433
|
-
/** Minimum score to pass (0-10, default 7) */
|
|
1434
|
-
minScore: z21.number().int().min(0).max(10).optional(),
|
|
1435
|
-
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
1436
|
-
model: z21.string().optional(),
|
|
1437
|
-
maxTokens: z21.number().int().optional(),
|
|
1438
|
-
temperature: z21.number().min(0).max(1).optional()
|
|
1439
|
-
});
|
|
1440
|
-
var TimeAssertionSchema = z21.object({
|
|
1441
|
-
type: z21.literal("time_limit"),
|
|
1442
|
-
/** Maximum allowed duration in milliseconds */
|
|
1443
|
-
maxDurationMs: z21.number().int().positive()
|
|
1444
|
-
});
|
|
1445
|
-
var AssertionSchema = z21.union([
|
|
1446
|
-
SkillWasCalledAssertionSchema,
|
|
1447
|
-
ToolCalledWithParamAssertionSchema,
|
|
1448
|
-
BuildPassedAssertionSchema,
|
|
1449
|
-
TimeAssertionSchema,
|
|
1450
|
-
CostAssertionSchema,
|
|
1451
|
-
LlmJudgeAssertionSchema
|
|
1452
|
-
]);
|
|
1453
|
-
|
|
1454
1418
|
// src/scenario/environment.ts
|
|
1455
|
-
import { z as
|
|
1456
|
-
var LocalProjectConfigSchema =
|
|
1419
|
+
import { z as z21 } from "zod";
|
|
1420
|
+
var LocalProjectConfigSchema = z21.object({
|
|
1457
1421
|
/** Template ID to use for the local project */
|
|
1458
|
-
templateId:
|
|
1422
|
+
templateId: z21.string().optional(),
|
|
1459
1423
|
/** Files to create in the project */
|
|
1460
|
-
files:
|
|
1461
|
-
|
|
1462
|
-
path:
|
|
1463
|
-
content:
|
|
1424
|
+
files: z21.array(
|
|
1425
|
+
z21.object({
|
|
1426
|
+
path: z21.string().min(1),
|
|
1427
|
+
content: z21.string().min(1)
|
|
1464
1428
|
})
|
|
1465
1429
|
).optional()
|
|
1466
1430
|
});
|
|
1467
|
-
var MetaSiteConfigSchema =
|
|
1468
|
-
configurations:
|
|
1469
|
-
|
|
1470
|
-
name:
|
|
1471
|
-
apiCalls:
|
|
1472
|
-
|
|
1473
|
-
url:
|
|
1474
|
-
method:
|
|
1475
|
-
body:
|
|
1431
|
+
var MetaSiteConfigSchema = z21.object({
|
|
1432
|
+
configurations: z21.array(
|
|
1433
|
+
z21.object({
|
|
1434
|
+
name: z21.string().min(1),
|
|
1435
|
+
apiCalls: z21.array(
|
|
1436
|
+
z21.object({
|
|
1437
|
+
url: z21.string().url(),
|
|
1438
|
+
method: z21.enum(["POST", "PUT"]),
|
|
1439
|
+
body: z21.string()
|
|
1476
1440
|
})
|
|
1477
1441
|
)
|
|
1478
1442
|
})
|
|
1479
1443
|
).optional()
|
|
1480
1444
|
});
|
|
1481
|
-
var EnvironmentSchema =
|
|
1445
|
+
var EnvironmentSchema = z21.object({
|
|
1482
1446
|
/** Local project configuration */
|
|
1483
1447
|
localProject: LocalProjectConfigSchema.optional(),
|
|
1484
1448
|
/** Meta site configuration */
|
|
@@ -1486,11 +1450,11 @@ var EnvironmentSchema = z22.object({
|
|
|
1486
1450
|
});
|
|
1487
1451
|
|
|
1488
1452
|
// src/scenario/test-scenario.ts
|
|
1489
|
-
import { z as
|
|
1453
|
+
import { z as z23 } from "zod";
|
|
1490
1454
|
|
|
1491
1455
|
// src/assertion/assertion.ts
|
|
1492
|
-
import { z as
|
|
1493
|
-
var AssertionTypeSchema =
|
|
1456
|
+
import { z as z22 } from "zod";
|
|
1457
|
+
var AssertionTypeSchema = z22.enum([
|
|
1494
1458
|
"skill_was_called",
|
|
1495
1459
|
"tool_called_with_param",
|
|
1496
1460
|
"build_passed",
|
|
@@ -1498,59 +1462,61 @@ var AssertionTypeSchema = z23.enum([
|
|
|
1498
1462
|
"cost",
|
|
1499
1463
|
"llm_judge"
|
|
1500
1464
|
]);
|
|
1501
|
-
var AssertionParameterTypeSchema =
|
|
1465
|
+
var AssertionParameterTypeSchema = z22.enum([
|
|
1502
1466
|
"string",
|
|
1503
1467
|
"number",
|
|
1504
1468
|
"boolean"
|
|
1505
1469
|
]);
|
|
1506
|
-
var AssertionParameterSchema =
|
|
1470
|
+
var AssertionParameterSchema = z22.object({
|
|
1507
1471
|
/** Parameter name (used as key in params object) */
|
|
1508
|
-
name:
|
|
1472
|
+
name: z22.string().min(1),
|
|
1509
1473
|
/** Display label for the parameter */
|
|
1510
|
-
label:
|
|
1474
|
+
label: z22.string().min(1),
|
|
1511
1475
|
/** Parameter type */
|
|
1512
1476
|
type: AssertionParameterTypeSchema,
|
|
1513
1477
|
/** Whether this parameter is required */
|
|
1514
|
-
required:
|
|
1478
|
+
required: z22.boolean(),
|
|
1515
1479
|
/** Default value (optional, used when not provided) */
|
|
1516
|
-
defaultValue:
|
|
1480
|
+
defaultValue: z22.union([z22.string(), z22.number(), z22.boolean()]).optional(),
|
|
1517
1481
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
1518
|
-
advanced:
|
|
1482
|
+
advanced: z22.boolean().optional()
|
|
1519
1483
|
});
|
|
1520
|
-
var ScenarioAssertionLinkSchema =
|
|
1484
|
+
var ScenarioAssertionLinkSchema = z22.object({
|
|
1521
1485
|
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
1522
|
-
assertionId:
|
|
1486
|
+
assertionId: z22.string(),
|
|
1523
1487
|
/** Parameter values for this assertion in this scenario */
|
|
1524
|
-
params:
|
|
1525
|
-
|
|
1526
|
-
|
|
1488
|
+
params: z22.record(
|
|
1489
|
+
z22.string(),
|
|
1490
|
+
z22.union([z22.string(), z22.number(), z22.boolean(), z22.null()])
|
|
1527
1491
|
).optional()
|
|
1528
1492
|
});
|
|
1529
|
-
var SkillWasCalledConfigSchema =
|
|
1493
|
+
var SkillWasCalledConfigSchema = z22.object({
|
|
1530
1494
|
/** Names of the skills that must have been called */
|
|
1531
|
-
skillNames:
|
|
1495
|
+
skillNames: z22.array(z22.string().min(1)).min(1)
|
|
1532
1496
|
});
|
|
1533
|
-
var CostConfigSchema =
|
|
1497
|
+
var CostConfigSchema = z22.strictObject({
|
|
1534
1498
|
/** Maximum allowed cost in USD */
|
|
1535
|
-
maxCostUsd:
|
|
1499
|
+
maxCostUsd: z22.number().positive()
|
|
1536
1500
|
});
|
|
1537
|
-
var ToolCalledWithParamConfigSchema =
|
|
1501
|
+
var ToolCalledWithParamConfigSchema = z22.strictObject({
|
|
1538
1502
|
/** Name of the tool that must have been called */
|
|
1539
|
-
toolName:
|
|
1503
|
+
toolName: z22.string().min(1),
|
|
1540
1504
|
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1541
|
-
expectedParams:
|
|
1505
|
+
expectedParams: z22.string().min(1),
|
|
1506
|
+
/** If true, the matching tool call must also have succeeded (step.success === true) */
|
|
1507
|
+
requireSuccess: z22.boolean().optional()
|
|
1542
1508
|
});
|
|
1543
|
-
var BuildPassedConfigSchema =
|
|
1509
|
+
var BuildPassedConfigSchema = z22.strictObject({
|
|
1544
1510
|
/** Command to run (default: "yarn build") */
|
|
1545
|
-
command:
|
|
1511
|
+
command: z22.string().optional(),
|
|
1546
1512
|
/** Expected exit code (default: 0) */
|
|
1547
|
-
expectedExitCode:
|
|
1513
|
+
expectedExitCode: z22.number().int().optional()
|
|
1548
1514
|
});
|
|
1549
|
-
var TimeConfigSchema =
|
|
1515
|
+
var TimeConfigSchema = z22.strictObject({
|
|
1550
1516
|
/** Maximum allowed duration in milliseconds */
|
|
1551
|
-
maxDurationMs:
|
|
1517
|
+
maxDurationMs: z22.number().int().positive()
|
|
1552
1518
|
});
|
|
1553
|
-
var LlmJudgeConfigSchema =
|
|
1519
|
+
var LlmJudgeConfigSchema = z22.object({
|
|
1554
1520
|
/**
|
|
1555
1521
|
* Prompt template with placeholders:
|
|
1556
1522
|
* - {{output}}: agent's final output
|
|
@@ -1561,19 +1527,45 @@ var LlmJudgeConfigSchema = z23.object({
|
|
|
1561
1527
|
* - {{trace}}: step-by-step trace of tool calls
|
|
1562
1528
|
* - Custom parameters defined in the parameters array
|
|
1563
1529
|
*/
|
|
1564
|
-
prompt:
|
|
1530
|
+
prompt: z22.string().min(1),
|
|
1565
1531
|
/** Minimum score to pass (0-10, default 7) */
|
|
1566
|
-
minScore:
|
|
1532
|
+
minScore: z22.number().int().min(0).max(10).optional(),
|
|
1567
1533
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
1568
|
-
model:
|
|
1534
|
+
model: z22.string().optional(),
|
|
1569
1535
|
/** Max output tokens */
|
|
1570
|
-
maxTokens:
|
|
1536
|
+
maxTokens: z22.number().int().optional(),
|
|
1571
1537
|
/** Temperature (0-1) */
|
|
1572
|
-
temperature:
|
|
1538
|
+
temperature: z22.number().min(0).max(1).optional(),
|
|
1573
1539
|
/** User-defined parameters for this assertion */
|
|
1574
|
-
parameters:
|
|
1540
|
+
parameters: z22.array(AssertionParameterSchema).optional()
|
|
1541
|
+
});
|
|
1542
|
+
var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
|
|
1543
|
+
type: z22.literal("skill_was_called")
|
|
1544
|
+
});
|
|
1545
|
+
var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
|
|
1546
|
+
type: z22.literal("tool_called_with_param")
|
|
1575
1547
|
});
|
|
1576
|
-
var
|
|
1548
|
+
var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
|
|
1549
|
+
type: z22.literal("build_passed")
|
|
1550
|
+
});
|
|
1551
|
+
var CostAssertionSchema = CostConfigSchema.extend({
|
|
1552
|
+
type: z22.literal("cost")
|
|
1553
|
+
});
|
|
1554
|
+
var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
|
|
1555
|
+
type: z22.literal("llm_judge")
|
|
1556
|
+
});
|
|
1557
|
+
var TimeAssertionSchema = TimeConfigSchema.extend({
|
|
1558
|
+
type: z22.literal("time_limit")
|
|
1559
|
+
});
|
|
1560
|
+
var AssertionSchema = z22.union([
|
|
1561
|
+
SkillWasCalledAssertionSchema,
|
|
1562
|
+
ToolCalledWithParamAssertionSchema,
|
|
1563
|
+
BuildPassedAssertionSchema,
|
|
1564
|
+
TimeAssertionSchema,
|
|
1565
|
+
CostAssertionSchema,
|
|
1566
|
+
LlmJudgeAssertionSchema
|
|
1567
|
+
]);
|
|
1568
|
+
var AssertionConfigSchema = z22.union([
|
|
1577
1569
|
LlmJudgeConfigSchema,
|
|
1578
1570
|
// requires prompt - check first
|
|
1579
1571
|
SkillWasCalledConfigSchema,
|
|
@@ -1586,7 +1578,7 @@ var AssertionConfigSchema = z23.union([
|
|
|
1586
1578
|
// requires maxCostUsd, uses strictObject
|
|
1587
1579
|
BuildPassedConfigSchema,
|
|
1588
1580
|
// all optional, uses strictObject to reject unknown keys
|
|
1589
|
-
|
|
1581
|
+
z22.object({})
|
|
1590
1582
|
// fallback empty config
|
|
1591
1583
|
]);
|
|
1592
1584
|
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
@@ -1637,25 +1629,25 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1637
1629
|
}
|
|
1638
1630
|
|
|
1639
1631
|
// src/scenario/test-scenario.ts
|
|
1640
|
-
var ExpectedFileSchema =
|
|
1632
|
+
var ExpectedFileSchema = z23.object({
|
|
1641
1633
|
/** Relative path where the file should be created */
|
|
1642
|
-
path:
|
|
1634
|
+
path: z23.string(),
|
|
1643
1635
|
/** Optional expected content */
|
|
1644
|
-
content:
|
|
1636
|
+
content: z23.string().optional()
|
|
1645
1637
|
});
|
|
1646
1638
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
1647
1639
|
/** The prompt sent to the agent to trigger the task */
|
|
1648
|
-
triggerPrompt:
|
|
1640
|
+
triggerPrompt: z23.string().min(10),
|
|
1649
1641
|
/** ID of the template to use for this scenario (null = no template) */
|
|
1650
|
-
templateId:
|
|
1642
|
+
templateId: z23.string().nullish(),
|
|
1651
1643
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
1652
|
-
assertions:
|
|
1644
|
+
assertions: z23.array(AssertionSchema).optional(),
|
|
1653
1645
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
1654
|
-
assertionIds:
|
|
1646
|
+
assertionIds: z23.array(z23.string()).optional(),
|
|
1655
1647
|
/** Linked assertions with per-scenario parameter values */
|
|
1656
|
-
assertionLinks:
|
|
1648
|
+
assertionLinks: z23.array(ScenarioAssertionLinkSchema).optional(),
|
|
1657
1649
|
/** Tags for categorisation and filtering */
|
|
1658
|
-
tags:
|
|
1650
|
+
tags: z23.array(z23.string()).optional()
|
|
1659
1651
|
});
|
|
1660
1652
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
1661
1653
|
id: true,
|
|
@@ -1666,10 +1658,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
1666
1658
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
1667
1659
|
|
|
1668
1660
|
// src/suite/test-suite.ts
|
|
1669
|
-
import { z as
|
|
1661
|
+
import { z as z24 } from "zod";
|
|
1670
1662
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
1671
1663
|
/** IDs of test scenarios in this suite */
|
|
1672
|
-
scenarioIds:
|
|
1664
|
+
scenarioIds: z24.array(z24.string())
|
|
1673
1665
|
});
|
|
1674
1666
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
1675
1667
|
id: true,
|
|
@@ -1680,21 +1672,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
1680
1672
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
1681
1673
|
|
|
1682
1674
|
// src/evaluation/metrics.ts
|
|
1683
|
-
import { z as
|
|
1684
|
-
var TokenUsageSchema =
|
|
1685
|
-
prompt:
|
|
1686
|
-
completion:
|
|
1687
|
-
total:
|
|
1688
|
-
});
|
|
1689
|
-
var EvalMetricsSchema =
|
|
1690
|
-
totalAssertions:
|
|
1691
|
-
passed:
|
|
1692
|
-
failed:
|
|
1693
|
-
skipped:
|
|
1694
|
-
errors:
|
|
1695
|
-
passRate:
|
|
1696
|
-
avgDuration:
|
|
1697
|
-
totalDuration:
|
|
1675
|
+
import { z as z25 } from "zod";
|
|
1676
|
+
var TokenUsageSchema = z25.object({
|
|
1677
|
+
prompt: z25.number(),
|
|
1678
|
+
completion: z25.number(),
|
|
1679
|
+
total: z25.number()
|
|
1680
|
+
});
|
|
1681
|
+
var EvalMetricsSchema = z25.object({
|
|
1682
|
+
totalAssertions: z25.number(),
|
|
1683
|
+
passed: z25.number(),
|
|
1684
|
+
failed: z25.number(),
|
|
1685
|
+
skipped: z25.number(),
|
|
1686
|
+
errors: z25.number(),
|
|
1687
|
+
passRate: z25.number(),
|
|
1688
|
+
avgDuration: z25.number(),
|
|
1689
|
+
totalDuration: z25.number()
|
|
1698
1690
|
});
|
|
1699
1691
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
1700
1692
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -1704,7 +1696,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
1704
1696
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
1705
1697
|
return EvalStatus2;
|
|
1706
1698
|
})(EvalStatus || {});
|
|
1707
|
-
var EvalStatusSchema =
|
|
1699
|
+
var EvalStatusSchema = z25.enum(EvalStatus);
|
|
1708
1700
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
1709
1701
|
LLMStepType2["COMPLETION"] = "completion";
|
|
1710
1702
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -1712,52 +1704,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
1712
1704
|
LLMStepType2["THINKING"] = "thinking";
|
|
1713
1705
|
return LLMStepType2;
|
|
1714
1706
|
})(LLMStepType || {});
|
|
1715
|
-
var LLMTraceStepSchema =
|
|
1716
|
-
id:
|
|
1717
|
-
stepNumber:
|
|
1718
|
-
type:
|
|
1719
|
-
model:
|
|
1720
|
-
provider:
|
|
1721
|
-
startedAt:
|
|
1722
|
-
durationMs:
|
|
1707
|
+
var LLMTraceStepSchema = z25.object({
|
|
1708
|
+
id: z25.string(),
|
|
1709
|
+
stepNumber: z25.number(),
|
|
1710
|
+
type: z25.enum(LLMStepType),
|
|
1711
|
+
model: z25.string(),
|
|
1712
|
+
provider: z25.string(),
|
|
1713
|
+
startedAt: z25.string(),
|
|
1714
|
+
durationMs: z25.number(),
|
|
1723
1715
|
tokenUsage: TokenUsageSchema,
|
|
1724
|
-
costUsd:
|
|
1725
|
-
toolName:
|
|
1726
|
-
toolArguments:
|
|
1727
|
-
inputPreview:
|
|
1728
|
-
outputPreview:
|
|
1729
|
-
success:
|
|
1730
|
-
error:
|
|
1731
|
-
});
|
|
1732
|
-
var LLMBreakdownStatsSchema =
|
|
1733
|
-
count:
|
|
1734
|
-
durationMs:
|
|
1735
|
-
tokens:
|
|
1736
|
-
costUsd:
|
|
1737
|
-
});
|
|
1738
|
-
var LLMTraceSummarySchema =
|
|
1739
|
-
totalSteps:
|
|
1740
|
-
totalDurationMs:
|
|
1716
|
+
costUsd: z25.number(),
|
|
1717
|
+
toolName: z25.string().optional(),
|
|
1718
|
+
toolArguments: z25.string().optional(),
|
|
1719
|
+
inputPreview: z25.string().optional(),
|
|
1720
|
+
outputPreview: z25.string().optional(),
|
|
1721
|
+
success: z25.boolean(),
|
|
1722
|
+
error: z25.string().optional()
|
|
1723
|
+
});
|
|
1724
|
+
var LLMBreakdownStatsSchema = z25.object({
|
|
1725
|
+
count: z25.number(),
|
|
1726
|
+
durationMs: z25.number(),
|
|
1727
|
+
tokens: z25.number(),
|
|
1728
|
+
costUsd: z25.number()
|
|
1729
|
+
});
|
|
1730
|
+
var LLMTraceSummarySchema = z25.object({
|
|
1731
|
+
totalSteps: z25.number(),
|
|
1732
|
+
totalDurationMs: z25.number(),
|
|
1741
1733
|
totalTokens: TokenUsageSchema,
|
|
1742
|
-
totalCostUsd:
|
|
1743
|
-
stepTypeBreakdown:
|
|
1744
|
-
modelBreakdown:
|
|
1745
|
-
modelsUsed:
|
|
1746
|
-
});
|
|
1747
|
-
var LLMTraceSchema =
|
|
1748
|
-
id:
|
|
1749
|
-
steps:
|
|
1734
|
+
totalCostUsd: z25.number(),
|
|
1735
|
+
stepTypeBreakdown: z25.record(z25.string(), LLMBreakdownStatsSchema).optional(),
|
|
1736
|
+
modelBreakdown: z25.record(z25.string(), LLMBreakdownStatsSchema),
|
|
1737
|
+
modelsUsed: z25.array(z25.string())
|
|
1738
|
+
});
|
|
1739
|
+
var LLMTraceSchema = z25.object({
|
|
1740
|
+
id: z25.string(),
|
|
1741
|
+
steps: z25.array(LLMTraceStepSchema),
|
|
1750
1742
|
summary: LLMTraceSummarySchema
|
|
1751
1743
|
});
|
|
1752
1744
|
|
|
1753
1745
|
// src/evaluation/eval-result.ts
|
|
1754
|
-
import { z as
|
|
1746
|
+
import { z as z29 } from "zod";
|
|
1755
1747
|
|
|
1756
1748
|
// src/evaluation/eval-run.ts
|
|
1757
|
-
import { z as
|
|
1749
|
+
import { z as z27 } from "zod";
|
|
1758
1750
|
|
|
1759
1751
|
// src/evaluation/live-trace.ts
|
|
1760
|
-
import { z as
|
|
1752
|
+
import { z as z26 } from "zod";
|
|
1761
1753
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
1762
1754
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
1763
1755
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -1771,37 +1763,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
1771
1763
|
LiveTraceEventType2["USER"] = "user";
|
|
1772
1764
|
return LiveTraceEventType2;
|
|
1773
1765
|
})(LiveTraceEventType || {});
|
|
1774
|
-
var LiveTraceEventSchema =
|
|
1766
|
+
var LiveTraceEventSchema = z26.object({
|
|
1775
1767
|
/** The evaluation run ID */
|
|
1776
|
-
evalRunId:
|
|
1768
|
+
evalRunId: z26.string(),
|
|
1777
1769
|
/** The scenario ID being executed */
|
|
1778
|
-
scenarioId:
|
|
1770
|
+
scenarioId: z26.string(),
|
|
1779
1771
|
/** The scenario name for display */
|
|
1780
|
-
scenarioName:
|
|
1772
|
+
scenarioName: z26.string(),
|
|
1781
1773
|
/** The target ID (skill, agent, etc.) */
|
|
1782
|
-
targetId:
|
|
1774
|
+
targetId: z26.string(),
|
|
1783
1775
|
/** The target name for display */
|
|
1784
|
-
targetName:
|
|
1776
|
+
targetName: z26.string(),
|
|
1785
1777
|
/** Step number in the current scenario execution */
|
|
1786
|
-
stepNumber:
|
|
1778
|
+
stepNumber: z26.number(),
|
|
1787
1779
|
/** Type of trace event */
|
|
1788
|
-
type:
|
|
1780
|
+
type: z26.enum(LiveTraceEventType),
|
|
1789
1781
|
/** Tool name if this is a tool_use event */
|
|
1790
|
-
toolName:
|
|
1782
|
+
toolName: z26.string().optional(),
|
|
1791
1783
|
/** Tool arguments preview (truncated JSON) */
|
|
1792
|
-
toolArgs:
|
|
1784
|
+
toolArgs: z26.string().optional(),
|
|
1793
1785
|
/** Output preview (truncated text) */
|
|
1794
|
-
outputPreview:
|
|
1786
|
+
outputPreview: z26.string().optional(),
|
|
1795
1787
|
/** File path for file operations */
|
|
1796
|
-
filePath:
|
|
1788
|
+
filePath: z26.string().optional(),
|
|
1797
1789
|
/** Elapsed time in milliseconds for progress events */
|
|
1798
|
-
elapsedMs:
|
|
1790
|
+
elapsedMs: z26.number().optional(),
|
|
1799
1791
|
/** Thinking/reasoning text from Claude */
|
|
1800
|
-
thinking:
|
|
1792
|
+
thinking: z26.string().optional(),
|
|
1801
1793
|
/** Timestamp when this event occurred */
|
|
1802
|
-
timestamp:
|
|
1794
|
+
timestamp: z26.string(),
|
|
1803
1795
|
/** Whether this is the final event for this scenario */
|
|
1804
|
-
isComplete:
|
|
1796
|
+
isComplete: z26.boolean()
|
|
1805
1797
|
});
|
|
1806
1798
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
1807
1799
|
function parseTraceEventLine(line) {
|
|
@@ -1829,14 +1821,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
1829
1821
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
1830
1822
|
return TriggerType2;
|
|
1831
1823
|
})(TriggerType || {});
|
|
1832
|
-
var TriggerMetadataSchema =
|
|
1833
|
-
version:
|
|
1834
|
-
resourceUpdated:
|
|
1824
|
+
var TriggerMetadataSchema = z27.object({
|
|
1825
|
+
version: z27.string().optional(),
|
|
1826
|
+
resourceUpdated: z27.array(z27.string()).optional()
|
|
1835
1827
|
});
|
|
1836
|
-
var TriggerSchema =
|
|
1837
|
-
id:
|
|
1828
|
+
var TriggerSchema = z27.object({
|
|
1829
|
+
id: z27.string(),
|
|
1838
1830
|
metadata: TriggerMetadataSchema.optional(),
|
|
1839
|
-
type:
|
|
1831
|
+
type: z27.enum(TriggerType)
|
|
1840
1832
|
});
|
|
1841
1833
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
1842
1834
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -1854,28 +1846,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
1854
1846
|
FailureSeverity2["LOW"] = "low";
|
|
1855
1847
|
return FailureSeverity2;
|
|
1856
1848
|
})(FailureSeverity || {});
|
|
1857
|
-
var DiffLineTypeSchema =
|
|
1858
|
-
var DiffLineSchema =
|
|
1849
|
+
var DiffLineTypeSchema = z27.enum(["added", "removed", "unchanged"]);
|
|
1850
|
+
var DiffLineSchema = z27.object({
|
|
1859
1851
|
type: DiffLineTypeSchema,
|
|
1860
|
-
content:
|
|
1861
|
-
lineNumber:
|
|
1862
|
-
});
|
|
1863
|
-
var DiffContentSchema =
|
|
1864
|
-
path:
|
|
1865
|
-
expected:
|
|
1866
|
-
actual:
|
|
1867
|
-
diffLines:
|
|
1868
|
-
renamedFrom:
|
|
1869
|
-
});
|
|
1870
|
-
var CommandExecutionSchema =
|
|
1871
|
-
command:
|
|
1872
|
-
exitCode:
|
|
1873
|
-
output:
|
|
1874
|
-
duration:
|
|
1875
|
-
});
|
|
1876
|
-
var FileModificationSchema =
|
|
1877
|
-
path:
|
|
1878
|
-
action:
|
|
1852
|
+
content: z27.string(),
|
|
1853
|
+
lineNumber: z27.number()
|
|
1854
|
+
});
|
|
1855
|
+
var DiffContentSchema = z27.object({
|
|
1856
|
+
path: z27.string(),
|
|
1857
|
+
expected: z27.string(),
|
|
1858
|
+
actual: z27.string(),
|
|
1859
|
+
diffLines: z27.array(DiffLineSchema),
|
|
1860
|
+
renamedFrom: z27.string().optional()
|
|
1861
|
+
});
|
|
1862
|
+
var CommandExecutionSchema = z27.object({
|
|
1863
|
+
command: z27.string(),
|
|
1864
|
+
exitCode: z27.number(),
|
|
1865
|
+
output: z27.string().optional(),
|
|
1866
|
+
duration: z27.number()
|
|
1867
|
+
});
|
|
1868
|
+
var FileModificationSchema = z27.object({
|
|
1869
|
+
path: z27.string(),
|
|
1870
|
+
action: z27.enum(["created", "modified", "deleted"])
|
|
1879
1871
|
});
|
|
1880
1872
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
1881
1873
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -1883,87 +1875,87 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
1883
1875
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
1884
1876
|
return TemplateFileStatus2;
|
|
1885
1877
|
})(TemplateFileStatus || {});
|
|
1886
|
-
var TemplateFileSchema =
|
|
1878
|
+
var TemplateFileSchema = z27.object({
|
|
1887
1879
|
/** Relative path within the template */
|
|
1888
|
-
path:
|
|
1880
|
+
path: z27.string(),
|
|
1889
1881
|
/** Full file content after execution */
|
|
1890
|
-
content:
|
|
1882
|
+
content: z27.string(),
|
|
1891
1883
|
/** File status (new, modified, unchanged) */
|
|
1892
|
-
status:
|
|
1893
|
-
});
|
|
1894
|
-
var ApiCallSchema =
|
|
1895
|
-
endpoint:
|
|
1896
|
-
tokensUsed:
|
|
1897
|
-
duration:
|
|
1898
|
-
});
|
|
1899
|
-
var ExecutionTraceSchema =
|
|
1900
|
-
commands:
|
|
1901
|
-
filesModified:
|
|
1902
|
-
apiCalls:
|
|
1903
|
-
totalDuration:
|
|
1904
|
-
});
|
|
1905
|
-
var FailureAnalysisSchema =
|
|
1906
|
-
category:
|
|
1907
|
-
severity:
|
|
1908
|
-
summary:
|
|
1909
|
-
details:
|
|
1910
|
-
rootCause:
|
|
1911
|
-
suggestedFix:
|
|
1912
|
-
relatedAssertions:
|
|
1913
|
-
codeSnippet:
|
|
1914
|
-
similarIssues:
|
|
1915
|
-
patternId:
|
|
1884
|
+
status: z27.enum(["new", "modified", "unchanged"])
|
|
1885
|
+
});
|
|
1886
|
+
var ApiCallSchema = z27.object({
|
|
1887
|
+
endpoint: z27.string(),
|
|
1888
|
+
tokensUsed: z27.number(),
|
|
1889
|
+
duration: z27.number()
|
|
1890
|
+
});
|
|
1891
|
+
var ExecutionTraceSchema = z27.object({
|
|
1892
|
+
commands: z27.array(CommandExecutionSchema),
|
|
1893
|
+
filesModified: z27.array(FileModificationSchema),
|
|
1894
|
+
apiCalls: z27.array(ApiCallSchema),
|
|
1895
|
+
totalDuration: z27.number()
|
|
1896
|
+
});
|
|
1897
|
+
var FailureAnalysisSchema = z27.object({
|
|
1898
|
+
category: z27.enum(FailureCategory),
|
|
1899
|
+
severity: z27.enum(FailureSeverity),
|
|
1900
|
+
summary: z27.string(),
|
|
1901
|
+
details: z27.string(),
|
|
1902
|
+
rootCause: z27.string(),
|
|
1903
|
+
suggestedFix: z27.string(),
|
|
1904
|
+
relatedAssertions: z27.array(z27.string()),
|
|
1905
|
+
codeSnippet: z27.string().optional(),
|
|
1906
|
+
similarIssues: z27.array(z27.string()).optional(),
|
|
1907
|
+
patternId: z27.string().optional(),
|
|
1916
1908
|
// Extended fields for detailed debugging
|
|
1917
1909
|
diff: DiffContentSchema.optional(),
|
|
1918
1910
|
executionTrace: ExecutionTraceSchema.optional()
|
|
1919
1911
|
});
|
|
1920
1912
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1921
1913
|
/** Agent ID for this run */
|
|
1922
|
-
agentId:
|
|
1914
|
+
agentId: z27.string().optional(),
|
|
1923
1915
|
/** Preset ID that originated this run (optional) */
|
|
1924
|
-
presetId:
|
|
1916
|
+
presetId: z27.string().optional(),
|
|
1925
1917
|
/** Skill IDs for this run */
|
|
1926
|
-
skillIds:
|
|
1918
|
+
skillIds: z27.array(z27.string()).optional(),
|
|
1927
1919
|
/** Map of skillId to skillVersionId for this run */
|
|
1928
|
-
skillVersions:
|
|
1920
|
+
skillVersions: z27.record(z27.string(), z27.string()).optional(),
|
|
1929
1921
|
/** Scenario IDs to run (always present — resolved server-side from tags when needed) */
|
|
1930
|
-
scenarioIds:
|
|
1922
|
+
scenarioIds: z27.array(z27.string()),
|
|
1931
1923
|
/** Current status */
|
|
1932
1924
|
status: EvalStatusSchema,
|
|
1933
1925
|
/** Progress percentage (0-100) */
|
|
1934
|
-
progress:
|
|
1926
|
+
progress: z27.number(),
|
|
1935
1927
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1936
|
-
results:
|
|
1928
|
+
results: z27.array(z27.lazy(() => EvalRunResultSchema)),
|
|
1937
1929
|
/** Aggregated metrics across all results */
|
|
1938
1930
|
aggregateMetrics: EvalMetricsSchema,
|
|
1939
1931
|
/** Failure analyses */
|
|
1940
|
-
failureAnalyses:
|
|
1932
|
+
failureAnalyses: z27.array(FailureAnalysisSchema).optional(),
|
|
1941
1933
|
/** Aggregated LLM trace summary */
|
|
1942
1934
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
1943
1935
|
/** What triggered this run */
|
|
1944
1936
|
trigger: TriggerSchema.optional(),
|
|
1945
1937
|
/** When the run started (set when evaluation is triggered) */
|
|
1946
|
-
startedAt:
|
|
1938
|
+
startedAt: z27.string().optional(),
|
|
1947
1939
|
/** When the run completed */
|
|
1948
|
-
completedAt:
|
|
1940
|
+
completedAt: z27.string().optional(),
|
|
1949
1941
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1950
|
-
liveTraceEvents:
|
|
1942
|
+
liveTraceEvents: z27.array(LiveTraceEventSchema).optional(),
|
|
1951
1943
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1952
|
-
jobId:
|
|
1944
|
+
jobId: z27.string().optional(),
|
|
1953
1945
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1954
|
-
jobStatus:
|
|
1946
|
+
jobStatus: z27.string().optional(),
|
|
1955
1947
|
/** Remote job error message if the job failed */
|
|
1956
|
-
jobError:
|
|
1948
|
+
jobError: z27.string().optional(),
|
|
1957
1949
|
/** Timestamp of the last job status check */
|
|
1958
|
-
jobStatusCheckedAt:
|
|
1950
|
+
jobStatusCheckedAt: z27.string().optional(),
|
|
1959
1951
|
/** MCP server IDs to enable for this run (optional) */
|
|
1960
|
-
mcpIds:
|
|
1952
|
+
mcpIds: z27.array(z27.string()).optional(),
|
|
1961
1953
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
1962
|
-
subAgentIds:
|
|
1954
|
+
subAgentIds: z27.array(z27.string()).optional(),
|
|
1963
1955
|
/** Rule IDs to enable for this run (optional) */
|
|
1964
|
-
ruleIds:
|
|
1956
|
+
ruleIds: z27.array(z27.string()).optional(),
|
|
1965
1957
|
/** Tags used to select scenarios for this run (for traceability) */
|
|
1966
|
-
tags:
|
|
1958
|
+
tags: z27.array(z27.string()).optional()
|
|
1967
1959
|
});
|
|
1968
1960
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
1969
1961
|
id: true,
|
|
@@ -1978,60 +1970,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
1978
1970
|
scenarioIds: true
|
|
1979
1971
|
}).extend({
|
|
1980
1972
|
/** Optional on input — backend resolves from tags when not provided */
|
|
1981
|
-
scenarioIds:
|
|
1973
|
+
scenarioIds: z27.array(z27.string()).optional()
|
|
1982
1974
|
}).refine(
|
|
1983
1975
|
(data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
|
|
1984
1976
|
{ message: "Either scenarioIds or tags must be provided" }
|
|
1985
1977
|
);
|
|
1986
|
-
var EvaluationProgressSchema =
|
|
1987
|
-
runId:
|
|
1988
|
-
targetId:
|
|
1989
|
-
totalScenarios:
|
|
1990
|
-
completedScenarios:
|
|
1991
|
-
scenarioProgress:
|
|
1992
|
-
|
|
1993
|
-
scenarioId:
|
|
1994
|
-
currentStep:
|
|
1995
|
-
error:
|
|
1978
|
+
var EvaluationProgressSchema = z27.object({
|
|
1979
|
+
runId: z27.string(),
|
|
1980
|
+
targetId: z27.string(),
|
|
1981
|
+
totalScenarios: z27.number(),
|
|
1982
|
+
completedScenarios: z27.number(),
|
|
1983
|
+
scenarioProgress: z27.array(
|
|
1984
|
+
z27.object({
|
|
1985
|
+
scenarioId: z27.string(),
|
|
1986
|
+
currentStep: z27.string(),
|
|
1987
|
+
error: z27.string().optional()
|
|
1996
1988
|
})
|
|
1997
1989
|
),
|
|
1998
|
-
createdAt:
|
|
1999
|
-
});
|
|
2000
|
-
var EvaluationLogSchema =
|
|
2001
|
-
runId:
|
|
2002
|
-
scenarioId:
|
|
2003
|
-
log:
|
|
2004
|
-
level:
|
|
2005
|
-
message:
|
|
2006
|
-
args:
|
|
2007
|
-
error:
|
|
1990
|
+
createdAt: z27.number()
|
|
1991
|
+
});
|
|
1992
|
+
var EvaluationLogSchema = z27.object({
|
|
1993
|
+
runId: z27.string(),
|
|
1994
|
+
scenarioId: z27.string(),
|
|
1995
|
+
log: z27.object({
|
|
1996
|
+
level: z27.enum(["info", "error", "debug"]),
|
|
1997
|
+
message: z27.string().optional(),
|
|
1998
|
+
args: z27.array(z27.any()).optional(),
|
|
1999
|
+
error: z27.string().optional()
|
|
2008
2000
|
})
|
|
2009
2001
|
});
|
|
2010
2002
|
var LLM_TIMEOUT = 12e4;
|
|
2011
2003
|
|
|
2012
2004
|
// src/evaluation/conversation.ts
|
|
2013
|
-
import { z as
|
|
2014
|
-
var TextBlockSchema =
|
|
2015
|
-
type:
|
|
2016
|
-
text:
|
|
2017
|
-
});
|
|
2018
|
-
var ThinkingBlockSchema =
|
|
2019
|
-
type:
|
|
2020
|
-
thinking:
|
|
2021
|
-
});
|
|
2022
|
-
var ToolUseBlockSchema =
|
|
2023
|
-
type:
|
|
2024
|
-
toolName:
|
|
2025
|
-
toolId:
|
|
2026
|
-
input:
|
|
2027
|
-
});
|
|
2028
|
-
var ToolResultBlockSchema =
|
|
2029
|
-
type:
|
|
2030
|
-
toolUseId:
|
|
2031
|
-
content:
|
|
2032
|
-
isError:
|
|
2033
|
-
});
|
|
2034
|
-
var ConversationBlockSchema =
|
|
2005
|
+
import { z as z28 } from "zod";
|
|
2006
|
+
var TextBlockSchema = z28.object({
|
|
2007
|
+
type: z28.literal("text"),
|
|
2008
|
+
text: z28.string()
|
|
2009
|
+
});
|
|
2010
|
+
var ThinkingBlockSchema = z28.object({
|
|
2011
|
+
type: z28.literal("thinking"),
|
|
2012
|
+
thinking: z28.string()
|
|
2013
|
+
});
|
|
2014
|
+
var ToolUseBlockSchema = z28.object({
|
|
2015
|
+
type: z28.literal("tool_use"),
|
|
2016
|
+
toolName: z28.string(),
|
|
2017
|
+
toolId: z28.string(),
|
|
2018
|
+
input: z28.unknown()
|
|
2019
|
+
});
|
|
2020
|
+
var ToolResultBlockSchema = z28.object({
|
|
2021
|
+
type: z28.literal("tool_result"),
|
|
2022
|
+
toolUseId: z28.string(),
|
|
2023
|
+
content: z28.string(),
|
|
2024
|
+
isError: z28.boolean().optional()
|
|
2025
|
+
});
|
|
2026
|
+
var ConversationBlockSchema = z28.discriminatedUnion("type", [
|
|
2035
2027
|
TextBlockSchema,
|
|
2036
2028
|
ThinkingBlockSchema,
|
|
2037
2029
|
ToolUseBlockSchema,
|
|
@@ -2042,18 +2034,18 @@ var ConversationMessageRoles = [
|
|
|
2042
2034
|
"user",
|
|
2043
2035
|
"system"
|
|
2044
2036
|
];
|
|
2045
|
-
var ConversationMessageSchema =
|
|
2046
|
-
role:
|
|
2047
|
-
content:
|
|
2048
|
-
timestamp:
|
|
2037
|
+
var ConversationMessageSchema = z28.object({
|
|
2038
|
+
role: z28.enum(ConversationMessageRoles),
|
|
2039
|
+
content: z28.array(ConversationBlockSchema),
|
|
2040
|
+
timestamp: z28.string()
|
|
2049
2041
|
});
|
|
2050
|
-
var ScenarioConversationSchema =
|
|
2051
|
-
id:
|
|
2052
|
-
projectId:
|
|
2053
|
-
evalRunId:
|
|
2054
|
-
resultId:
|
|
2055
|
-
messages:
|
|
2056
|
-
createdAt:
|
|
2042
|
+
var ScenarioConversationSchema = z28.object({
|
|
2043
|
+
id: z28.string(),
|
|
2044
|
+
projectId: z28.string(),
|
|
2045
|
+
evalRunId: z28.string(),
|
|
2046
|
+
resultId: z28.string(),
|
|
2047
|
+
messages: z28.array(ConversationMessageSchema),
|
|
2048
|
+
createdAt: z28.string()
|
|
2057
2049
|
});
|
|
2058
2050
|
|
|
2059
2051
|
// src/evaluation/eval-result.ts
|
|
@@ -2064,100 +2056,100 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
2064
2056
|
AssertionResultStatus2["ERROR"] = "error";
|
|
2065
2057
|
return AssertionResultStatus2;
|
|
2066
2058
|
})(AssertionResultStatus || {});
|
|
2067
|
-
var AssertionResultSchema =
|
|
2068
|
-
id:
|
|
2069
|
-
assertionId:
|
|
2070
|
-
assertionType:
|
|
2071
|
-
assertionName:
|
|
2072
|
-
status:
|
|
2073
|
-
message:
|
|
2074
|
-
expected:
|
|
2075
|
-
actual:
|
|
2076
|
-
duration:
|
|
2077
|
-
details:
|
|
2078
|
-
llmTraceSteps:
|
|
2079
|
-
});
|
|
2080
|
-
var EvalRunResultSchema =
|
|
2081
|
-
id:
|
|
2082
|
-
targetId:
|
|
2083
|
-
targetName:
|
|
2059
|
+
var AssertionResultSchema = z29.object({
|
|
2060
|
+
id: z29.string(),
|
|
2061
|
+
assertionId: z29.string(),
|
|
2062
|
+
assertionType: z29.string(),
|
|
2063
|
+
assertionName: z29.string(),
|
|
2064
|
+
status: z29.enum(AssertionResultStatus),
|
|
2065
|
+
message: z29.string().optional(),
|
|
2066
|
+
expected: z29.string().optional(),
|
|
2067
|
+
actual: z29.string().optional(),
|
|
2068
|
+
duration: z29.number().optional(),
|
|
2069
|
+
details: z29.record(z29.string(), z29.unknown()).optional(),
|
|
2070
|
+
llmTraceSteps: z29.array(LLMTraceStepSchema).optional()
|
|
2071
|
+
});
|
|
2072
|
+
var EvalRunResultSchema = z29.object({
|
|
2073
|
+
id: z29.string(),
|
|
2074
|
+
targetId: z29.string(),
|
|
2075
|
+
targetName: z29.string().optional(),
|
|
2084
2076
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
2085
|
-
skillVersionId:
|
|
2077
|
+
skillVersionId: z29.string().optional(),
|
|
2086
2078
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
2087
|
-
skillVersion:
|
|
2088
|
-
scenarioId:
|
|
2089
|
-
scenarioName:
|
|
2079
|
+
skillVersion: z29.string().optional(),
|
|
2080
|
+
scenarioId: z29.string(),
|
|
2081
|
+
scenarioName: z29.string(),
|
|
2090
2082
|
modelConfig: ModelConfigSchema.optional(),
|
|
2091
|
-
assertionResults:
|
|
2083
|
+
assertionResults: z29.array(AssertionResultSchema),
|
|
2092
2084
|
metrics: EvalMetricsSchema.optional(),
|
|
2093
|
-
passed:
|
|
2094
|
-
failed:
|
|
2095
|
-
passRate:
|
|
2096
|
-
duration:
|
|
2097
|
-
outputText:
|
|
2098
|
-
files:
|
|
2099
|
-
fileDiffs:
|
|
2085
|
+
passed: z29.number(),
|
|
2086
|
+
failed: z29.number(),
|
|
2087
|
+
passRate: z29.number(),
|
|
2088
|
+
duration: z29.number(),
|
|
2089
|
+
outputText: z29.string().optional(),
|
|
2090
|
+
files: z29.array(ExpectedFileSchema).optional(),
|
|
2091
|
+
fileDiffs: z29.array(DiffContentSchema).optional(),
|
|
2100
2092
|
/** Full template files after execution with status indicators */
|
|
2101
|
-
templateFiles:
|
|
2102
|
-
startedAt:
|
|
2103
|
-
completedAt:
|
|
2093
|
+
templateFiles: z29.array(TemplateFileSchema).optional(),
|
|
2094
|
+
startedAt: z29.string().optional(),
|
|
2095
|
+
completedAt: z29.string().optional(),
|
|
2104
2096
|
llmTrace: LLMTraceSchema.optional(),
|
|
2105
2097
|
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
2106
|
-
conversation:
|
|
2107
|
-
});
|
|
2108
|
-
var PromptResultSchema =
|
|
2109
|
-
text:
|
|
2110
|
-
files:
|
|
2111
|
-
finishReason:
|
|
2112
|
-
reasoning:
|
|
2113
|
-
reasoningDetails:
|
|
2114
|
-
toolCalls:
|
|
2115
|
-
toolResults:
|
|
2116
|
-
warnings:
|
|
2117
|
-
sources:
|
|
2118
|
-
steps:
|
|
2119
|
-
generationTimeMs:
|
|
2120
|
-
prompt:
|
|
2121
|
-
systemPrompt:
|
|
2122
|
-
usage:
|
|
2123
|
-
totalTokens:
|
|
2124
|
-
totalMicrocentsSpent:
|
|
2098
|
+
conversation: z29.array(ConversationMessageSchema).optional()
|
|
2099
|
+
});
|
|
2100
|
+
var PromptResultSchema = z29.object({
|
|
2101
|
+
text: z29.string(),
|
|
2102
|
+
files: z29.array(z29.unknown()).optional(),
|
|
2103
|
+
finishReason: z29.string().optional(),
|
|
2104
|
+
reasoning: z29.string().optional(),
|
|
2105
|
+
reasoningDetails: z29.unknown().optional(),
|
|
2106
|
+
toolCalls: z29.array(z29.unknown()).optional(),
|
|
2107
|
+
toolResults: z29.array(z29.unknown()).optional(),
|
|
2108
|
+
warnings: z29.array(z29.unknown()).optional(),
|
|
2109
|
+
sources: z29.array(z29.unknown()).optional(),
|
|
2110
|
+
steps: z29.array(z29.unknown()),
|
|
2111
|
+
generationTimeMs: z29.number(),
|
|
2112
|
+
prompt: z29.string(),
|
|
2113
|
+
systemPrompt: z29.string(),
|
|
2114
|
+
usage: z29.object({
|
|
2115
|
+
totalTokens: z29.number().optional(),
|
|
2116
|
+
totalMicrocentsSpent: z29.number().optional()
|
|
2125
2117
|
})
|
|
2126
2118
|
});
|
|
2127
|
-
var EvaluationResultSchema =
|
|
2128
|
-
id:
|
|
2129
|
-
runId:
|
|
2130
|
-
timestamp:
|
|
2119
|
+
var EvaluationResultSchema = z29.object({
|
|
2120
|
+
id: z29.string(),
|
|
2121
|
+
runId: z29.string(),
|
|
2122
|
+
timestamp: z29.number(),
|
|
2131
2123
|
promptResult: PromptResultSchema,
|
|
2132
|
-
testResults:
|
|
2133
|
-
tags:
|
|
2134
|
-
feedback:
|
|
2135
|
-
score:
|
|
2136
|
-
suiteId:
|
|
2137
|
-
});
|
|
2138
|
-
var LeanEvaluationResultSchema =
|
|
2139
|
-
id:
|
|
2140
|
-
runId:
|
|
2141
|
-
timestamp:
|
|
2142
|
-
tags:
|
|
2143
|
-
scenarioId:
|
|
2144
|
-
scenarioVersion:
|
|
2145
|
-
targetId:
|
|
2146
|
-
targetVersion:
|
|
2147
|
-
suiteId:
|
|
2148
|
-
score:
|
|
2149
|
-
time:
|
|
2150
|
-
microcentsSpent:
|
|
2124
|
+
testResults: z29.array(z29.unknown()),
|
|
2125
|
+
tags: z29.array(z29.string()).optional(),
|
|
2126
|
+
feedback: z29.string().optional(),
|
|
2127
|
+
score: z29.number(),
|
|
2128
|
+
suiteId: z29.string().optional()
|
|
2129
|
+
});
|
|
2130
|
+
var LeanEvaluationResultSchema = z29.object({
|
|
2131
|
+
id: z29.string(),
|
|
2132
|
+
runId: z29.string(),
|
|
2133
|
+
timestamp: z29.number(),
|
|
2134
|
+
tags: z29.array(z29.string()).optional(),
|
|
2135
|
+
scenarioId: z29.string(),
|
|
2136
|
+
scenarioVersion: z29.number().optional(),
|
|
2137
|
+
targetId: z29.string(),
|
|
2138
|
+
targetVersion: z29.number().optional(),
|
|
2139
|
+
suiteId: z29.string().optional(),
|
|
2140
|
+
score: z29.number(),
|
|
2141
|
+
time: z29.number().optional(),
|
|
2142
|
+
microcentsSpent: z29.number().optional()
|
|
2151
2143
|
});
|
|
2152
2144
|
|
|
2153
2145
|
// src/project/project.ts
|
|
2154
|
-
import { z as
|
|
2146
|
+
import { z as z30 } from "zod";
|
|
2155
2147
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
2156
|
-
appId:
|
|
2157
|
-
appSecret:
|
|
2158
|
-
useWixAuth:
|
|
2159
|
-
useBase44Auth:
|
|
2160
|
-
scenarioTags:
|
|
2148
|
+
appId: z30.string().optional().describe("The ID of the app in Dev Center"),
|
|
2149
|
+
appSecret: z30.string().optional().describe("The secret of the app in Dev Center"),
|
|
2150
|
+
useWixAuth: z30.boolean().optional().describe("Enable Wix CLI/MCP auth for evaluations"),
|
|
2151
|
+
useBase44Auth: z30.boolean().optional().describe("Enable Base44 auth for evaluations"),
|
|
2152
|
+
scenarioTags: z30.array(z30.string()).optional().describe("Project-level tag vocabulary for scenarios")
|
|
2161
2153
|
});
|
|
2162
2154
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
2163
2155
|
id: true,
|
|
@@ -2210,7 +2202,7 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2210
2202
|
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
2211
2203
|
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
2212
2204
|
name: "Tool Called With Param",
|
|
2213
|
-
description: "Check that a tool was called with expected parameters",
|
|
2205
|
+
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
2214
2206
|
type: "tool_called_with_param",
|
|
2215
2207
|
parameters: [
|
|
2216
2208
|
{
|
|
@@ -2224,6 +2216,14 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2224
2216
|
label: "Expected Parameters (JSON, substring match)",
|
|
2225
2217
|
type: "string",
|
|
2226
2218
|
required: true
|
|
2219
|
+
},
|
|
2220
|
+
{
|
|
2221
|
+
name: "requireSuccess",
|
|
2222
|
+
label: "Require Successful Call",
|
|
2223
|
+
type: "boolean",
|
|
2224
|
+
required: false,
|
|
2225
|
+
defaultValue: false,
|
|
2226
|
+
advanced: true
|
|
2227
2227
|
}
|
|
2228
2228
|
]
|
|
2229
2229
|
},
|
|
@@ -2355,6 +2355,9 @@ export {
|
|
|
2355
2355
|
BuildCheckTestSchema,
|
|
2356
2356
|
BuildPassedAssertionSchema,
|
|
2357
2357
|
BuildPassedConfigSchema,
|
|
2358
|
+
BulkImportResultItemSchema,
|
|
2359
|
+
BulkImportResultSchema,
|
|
2360
|
+
BulkImportSkillsInputSchema,
|
|
2358
2361
|
ClaudeModel2 as ClaudeModel,
|
|
2359
2362
|
ClaudeModelSchema,
|
|
2360
2363
|
CommandExecutionSchema,
|