@voltagent/scorers 2.0.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/index.cjs +350 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +30 -2
- package/dist/index.d.ts +30 -2
- package/dist/index.js +346 -0
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -3,5 +3,9 @@
|
|
|
3
3
|
Re-export of the prebuilt scorer utilities used by Viteval. These scorers originate from the Viteval project and are surfaced here so VoltAgent components can depend on them without pulling the full Viteval toolchain.
|
|
4
4
|
|
|
5
5
|
```ts
|
|
6
|
-
import {
|
|
6
|
+
import {
|
|
7
|
+
scorers,
|
|
8
|
+
createAnswerCorrectnessScorer,
|
|
9
|
+
createToolCallAccuracyScorerCode,
|
|
10
|
+
} from "@voltagent/scorers";
|
|
7
11
|
```
|
package/dist/index.cjs
CHANGED
|
@@ -26,7 +26,7 @@ __export(index_exports, {
|
|
|
26
26
|
ListContains: () => import_autoevals.ListContains,
|
|
27
27
|
NumericDiff: () => import_autoevals.NumericDiff,
|
|
28
28
|
adaptScorerForAgentEval: () => adaptScorerForAgentEval,
|
|
29
|
-
buildSamplingMetadata: () =>
|
|
29
|
+
buildSamplingMetadata: () => import_core10.buildSamplingMetadata,
|
|
30
30
|
createAnswerCorrectnessScorer: () => createAnswerCorrectnessScorer,
|
|
31
31
|
createAnswerRelevancyScorer: () => createAnswerRelevancyScorer,
|
|
32
32
|
createContextPrecisionScorer: () => createContextPrecisionScorer,
|
|
@@ -37,12 +37,13 @@ __export(index_exports, {
|
|
|
37
37
|
createModerationScorer: () => createModerationScorer,
|
|
38
38
|
createPossibleScorer: () => createPossibleScorer,
|
|
39
39
|
createSummaryScorer: () => createSummaryScorer,
|
|
40
|
+
createToolCallAccuracyScorerCode: () => createToolCallAccuracyScorerCode,
|
|
40
41
|
createTranslationScorer: () => createTranslationScorer,
|
|
41
|
-
normalizeScorerResult: () =>
|
|
42
|
+
normalizeScorerResult: () => import_core10.normalizeScorerResult,
|
|
42
43
|
rawAutoEvalScorers: () => rawAutoEvalScorers,
|
|
43
|
-
runLocalScorers: () =>
|
|
44
|
+
runLocalScorers: () => import_core10.runLocalScorers,
|
|
44
45
|
scorers: () => scorers,
|
|
45
|
-
shouldSample: () =>
|
|
46
|
+
shouldSample: () => import_core10.shouldSample
|
|
46
47
|
});
|
|
47
48
|
module.exports = __toCommonJS(index_exports);
|
|
48
49
|
var import_autoevals = require("autoevals");
|
|
@@ -285,7 +286,7 @@ function normalizeMetadata(value) {
|
|
|
285
286
|
}
|
|
286
287
|
|
|
287
288
|
// src/index.ts
|
|
288
|
-
var
|
|
289
|
+
var import_core10 = require("@voltagent/core");
|
|
289
290
|
|
|
290
291
|
// src/llm/moderation.ts
|
|
291
292
|
var import_core2 = require("@voltagent/core");
|
|
@@ -1561,6 +1562,349 @@ function mergeMetadata5(base, additional) {
|
|
|
1561
1562
|
return { ...base, ...additional };
|
|
1562
1563
|
}
|
|
1563
1564
|
|
|
1565
|
+
// src/tool-call-accuracy.ts
|
|
1566
|
+
var import_core9 = require("@voltagent/core");
|
|
1567
|
+
var import_utils9 = require("@voltagent/internal/utils");
|
|
1568
|
+
function createToolCallAccuracyScorerCode({
|
|
1569
|
+
id = "toolCallAccuracyCode",
|
|
1570
|
+
name = "Tool Call Accuracy (Code)",
|
|
1571
|
+
expectedTool,
|
|
1572
|
+
expectedToolOrder,
|
|
1573
|
+
strictMode = false,
|
|
1574
|
+
metadata,
|
|
1575
|
+
buildPayload
|
|
1576
|
+
}) {
|
|
1577
|
+
const normalizedExpectedTool = normalizeToolName(expectedTool);
|
|
1578
|
+
const normalizedExpectedToolOrder = normalizeExpectedToolOrder(expectedToolOrder);
|
|
1579
|
+
if (!normalizedExpectedTool && normalizedExpectedToolOrder.length === 0) {
|
|
1580
|
+
throw new Error(
|
|
1581
|
+
"createToolCallAccuracyScorerCode requires either expectedTool or expectedToolOrder"
|
|
1582
|
+
);
|
|
1583
|
+
}
|
|
1584
|
+
const mode = normalizedExpectedToolOrder.length > 0 ? "tool_order" : "single_tool";
|
|
1585
|
+
return (0, import_core9.buildScorer)({
|
|
1586
|
+
id,
|
|
1587
|
+
label: name,
|
|
1588
|
+
metadata: mergeMetadata6(metadata, {
|
|
1589
|
+
voltAgent: {
|
|
1590
|
+
scorer: id,
|
|
1591
|
+
category: "tool_call_accuracy",
|
|
1592
|
+
mode: "code"
|
|
1593
|
+
}
|
|
1594
|
+
})
|
|
1595
|
+
}).score((context) => {
|
|
1596
|
+
const payload = resolvePayload6(context, buildPayload);
|
|
1597
|
+
const actualTools = extractToolNames(payload);
|
|
1598
|
+
const hasToolCalls = actualTools.length > 0;
|
|
1599
|
+
let correctToolCalled = false;
|
|
1600
|
+
let correctOrderCalled = null;
|
|
1601
|
+
if (mode === "tool_order") {
|
|
1602
|
+
correctOrderCalled = checkToolOrder(actualTools, normalizedExpectedToolOrder, strictMode);
|
|
1603
|
+
} else if (normalizedExpectedTool) {
|
|
1604
|
+
correctToolCalled = strictMode ? actualTools.length === 1 && actualTools[0] === normalizedExpectedTool : actualTools.includes(normalizedExpectedTool);
|
|
1605
|
+
}
|
|
1606
|
+
const score = mode === "tool_order" ? correctOrderCalled ? 1 : 0 : correctToolCalled ? 1 : 0;
|
|
1607
|
+
const evaluation = {
|
|
1608
|
+
score,
|
|
1609
|
+
actualTools,
|
|
1610
|
+
expectedTool: normalizedExpectedTool,
|
|
1611
|
+
expectedToolOrder: normalizedExpectedToolOrder.length > 0 ? normalizedExpectedToolOrder : void 0,
|
|
1612
|
+
strictMode,
|
|
1613
|
+
hasToolCalls,
|
|
1614
|
+
correctToolCalled,
|
|
1615
|
+
correctOrderCalled,
|
|
1616
|
+
mode
|
|
1617
|
+
};
|
|
1618
|
+
context.results.raw.toolCallAccuracyEvaluation = evaluation;
|
|
1619
|
+
return score;
|
|
1620
|
+
}).reason(({ results }) => {
|
|
1621
|
+
const evaluation = results.raw.toolCallAccuracyEvaluation;
|
|
1622
|
+
if (!evaluation) {
|
|
1623
|
+
return { reason: "Tool call accuracy evaluation was not available." };
|
|
1624
|
+
}
|
|
1625
|
+
return {
|
|
1626
|
+
reason: buildReason(evaluation),
|
|
1627
|
+
metadata: {
|
|
1628
|
+
toolCallAccuracy: {
|
|
1629
|
+
mode: evaluation.mode,
|
|
1630
|
+
strictMode: evaluation.strictMode,
|
|
1631
|
+
hasToolCalls: evaluation.hasToolCalls,
|
|
1632
|
+
actualTools: evaluation.actualTools,
|
|
1633
|
+
expectedTool: evaluation.expectedTool,
|
|
1634
|
+
expectedToolOrder: evaluation.expectedToolOrder,
|
|
1635
|
+
correctToolCalled: evaluation.correctToolCalled,
|
|
1636
|
+
correctOrderCalled: evaluation.correctOrderCalled,
|
|
1637
|
+
score: evaluation.score
|
|
1638
|
+
}
|
|
1639
|
+
}
|
|
1640
|
+
};
|
|
1641
|
+
}).build();
|
|
1642
|
+
}
|
|
1643
|
+
function resolvePayload6(context, buildPayload) {
|
|
1644
|
+
if (buildPayload) {
|
|
1645
|
+
return buildPayload(context);
|
|
1646
|
+
}
|
|
1647
|
+
const payload = context.payload;
|
|
1648
|
+
return {
|
|
1649
|
+
toolCalls: payload.toolCalls,
|
|
1650
|
+
toolResults: payload.toolResults,
|
|
1651
|
+
messages: payload.messages,
|
|
1652
|
+
output: payload.output,
|
|
1653
|
+
rawOutput: payload.rawOutput
|
|
1654
|
+
};
|
|
1655
|
+
}
|
|
1656
|
+
function extractToolNames(payload) {
|
|
1657
|
+
const directToolCalls = extractToolNamesFromList(payload.toolCalls);
|
|
1658
|
+
if (directToolCalls.length > 0) {
|
|
1659
|
+
return directToolCalls;
|
|
1660
|
+
}
|
|
1661
|
+
const messageToolCalls = extractToolNamesFromMessages(payload.messages);
|
|
1662
|
+
if (messageToolCalls.length > 0) {
|
|
1663
|
+
return messageToolCalls;
|
|
1664
|
+
}
|
|
1665
|
+
const rawOutputToolCalls = extractToolNamesFromOutput(payload.rawOutput);
|
|
1666
|
+
if (rawOutputToolCalls.length > 0) {
|
|
1667
|
+
return rawOutputToolCalls;
|
|
1668
|
+
}
|
|
1669
|
+
return extractToolNamesFromOutput(payload.output);
|
|
1670
|
+
}
|
|
1671
|
+
function extractToolNamesFromOutput(value) {
|
|
1672
|
+
if (Array.isArray(value)) {
|
|
1673
|
+
return extractToolNamesFromMessages(value);
|
|
1674
|
+
}
|
|
1675
|
+
if (!isPlainRecord(value)) {
|
|
1676
|
+
return [];
|
|
1677
|
+
}
|
|
1678
|
+
const fromToolCalls = extractToolNamesFromList(value.toolCalls);
|
|
1679
|
+
if (fromToolCalls.length > 0) {
|
|
1680
|
+
return fromToolCalls;
|
|
1681
|
+
}
|
|
1682
|
+
const fromMessages = extractToolNamesFromMessages(value.messages);
|
|
1683
|
+
if (fromMessages.length > 0) {
|
|
1684
|
+
return fromMessages;
|
|
1685
|
+
}
|
|
1686
|
+
if (Array.isArray(value.steps)) {
|
|
1687
|
+
const fromSteps = extractToolNamesFromMessages(value.steps);
|
|
1688
|
+
if (fromSteps.length > 0) {
|
|
1689
|
+
return fromSteps;
|
|
1690
|
+
}
|
|
1691
|
+
}
|
|
1692
|
+
return [];
|
|
1693
|
+
}
|
|
1694
|
+
function extractToolNamesFromMessages(value) {
|
|
1695
|
+
if (!Array.isArray(value)) {
|
|
1696
|
+
return [];
|
|
1697
|
+
}
|
|
1698
|
+
const toolNames = [];
|
|
1699
|
+
for (const message of value) {
|
|
1700
|
+
if (!isPlainRecord(message)) {
|
|
1701
|
+
continue;
|
|
1702
|
+
}
|
|
1703
|
+
const messageType = normalizeMessageType(message.type);
|
|
1704
|
+
if (messageType === "tool_call") {
|
|
1705
|
+
const directToolName = extractToolName(message);
|
|
1706
|
+
if (directToolName) {
|
|
1707
|
+
toolNames.push(directToolName);
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
if (Array.isArray(message.toolInvocations)) {
|
|
1711
|
+
toolNames.push(...extractToolNamesFromList(message.toolInvocations));
|
|
1712
|
+
}
|
|
1713
|
+
if (Array.isArray(message.parts)) {
|
|
1714
|
+
toolNames.push(...extractToolNamesFromParts(message.parts));
|
|
1715
|
+
}
|
|
1716
|
+
if (isPlainRecord(message.content)) {
|
|
1717
|
+
if (Array.isArray(message.content.toolInvocations)) {
|
|
1718
|
+
toolNames.push(...extractToolNamesFromList(message.content.toolInvocations));
|
|
1719
|
+
}
|
|
1720
|
+
if (Array.isArray(message.content.parts)) {
|
|
1721
|
+
toolNames.push(...extractToolNamesFromParts(message.content.parts));
|
|
1722
|
+
}
|
|
1723
|
+
} else if (Array.isArray(message.content)) {
|
|
1724
|
+
toolNames.push(...extractToolNamesFromParts(message.content));
|
|
1725
|
+
}
|
|
1726
|
+
}
|
|
1727
|
+
return toolNames;
|
|
1728
|
+
}
|
|
1729
|
+
function extractToolNamesFromParts(parts) {
|
|
1730
|
+
const toolNames = [];
|
|
1731
|
+
for (const part of parts) {
|
|
1732
|
+
if (!isPlainRecord(part)) {
|
|
1733
|
+
continue;
|
|
1734
|
+
}
|
|
1735
|
+
const partType = normalizeMessageType(part.type);
|
|
1736
|
+
if (partType !== "tool_call") {
|
|
1737
|
+
continue;
|
|
1738
|
+
}
|
|
1739
|
+
const toolName = extractToolName(part);
|
|
1740
|
+
if (toolName) {
|
|
1741
|
+
toolNames.push(toolName);
|
|
1742
|
+
}
|
|
1743
|
+
}
|
|
1744
|
+
return toolNames;
|
|
1745
|
+
}
|
|
1746
|
+
function extractToolNamesFromList(value) {
|
|
1747
|
+
if (!Array.isArray(value)) {
|
|
1748
|
+
return [];
|
|
1749
|
+
}
|
|
1750
|
+
const toolNames = [];
|
|
1751
|
+
for (const entry of value) {
|
|
1752
|
+
const toolName = extractToolName(entry);
|
|
1753
|
+
if (toolName) {
|
|
1754
|
+
toolNames.push(toolName);
|
|
1755
|
+
}
|
|
1756
|
+
}
|
|
1757
|
+
return toolNames;
|
|
1758
|
+
}
|
|
1759
|
+
function extractToolName(value) {
|
|
1760
|
+
if (typeof value === "string") {
|
|
1761
|
+
return normalizeToolName(value);
|
|
1762
|
+
}
|
|
1763
|
+
if (!isPlainRecord(value)) {
|
|
1764
|
+
return void 0;
|
|
1765
|
+
}
|
|
1766
|
+
if (typeof value.toolName === "string") {
|
|
1767
|
+
return normalizeToolName(value.toolName);
|
|
1768
|
+
}
|
|
1769
|
+
if (typeof value.name === "string") {
|
|
1770
|
+
return normalizeToolName(value.name);
|
|
1771
|
+
}
|
|
1772
|
+
if (isPlainRecord(value.payload)) {
|
|
1773
|
+
if (typeof value.payload.toolName === "string") {
|
|
1774
|
+
return normalizeToolName(value.payload.toolName);
|
|
1775
|
+
}
|
|
1776
|
+
if (typeof value.payload.name === "string") {
|
|
1777
|
+
return normalizeToolName(value.payload.name);
|
|
1778
|
+
}
|
|
1779
|
+
}
|
|
1780
|
+
if (typeof value.type === "string") {
|
|
1781
|
+
const normalizedType = normalizeMessageType(value.type);
|
|
1782
|
+
if (shouldExtractToolNameFromType(normalizedType)) {
|
|
1783
|
+
return normalizeToolTypeName(value.type);
|
|
1784
|
+
}
|
|
1785
|
+
}
|
|
1786
|
+
return void 0;
|
|
1787
|
+
}
|
|
1788
|
+
function shouldExtractToolNameFromType(normalizedType) {
|
|
1789
|
+
if (!normalizedType) {
|
|
1790
|
+
return false;
|
|
1791
|
+
}
|
|
1792
|
+
if (!normalizedType.startsWith("tool")) {
|
|
1793
|
+
return false;
|
|
1794
|
+
}
|
|
1795
|
+
if (normalizedType === "tool" || normalizedType === "tool_call" || normalizedType === "tool_result") {
|
|
1796
|
+
return false;
|
|
1797
|
+
}
|
|
1798
|
+
const streamEventPrefixes = [
|
|
1799
|
+
"tool_input_",
|
|
1800
|
+
"tool_output_",
|
|
1801
|
+
"tool_call_",
|
|
1802
|
+
"tool_result_",
|
|
1803
|
+
"tool_invocation_",
|
|
1804
|
+
"tool_execution_"
|
|
1805
|
+
];
|
|
1806
|
+
if (streamEventPrefixes.some((prefix) => normalizedType.startsWith(prefix))) {
|
|
1807
|
+
return false;
|
|
1808
|
+
}
|
|
1809
|
+
const streamEventSuffixes = ["_start", "_end", "_delta", "_chunk", "_done"];
|
|
1810
|
+
if (streamEventSuffixes.some((suffix) => normalizedType.endsWith(suffix))) {
|
|
1811
|
+
return false;
|
|
1812
|
+
}
|
|
1813
|
+
return normalizedType.startsWith("tool_");
|
|
1814
|
+
}
|
|
1815
|
+
function normalizeToolName(value) {
|
|
1816
|
+
if (typeof value !== "string") {
|
|
1817
|
+
return void 0;
|
|
1818
|
+
}
|
|
1819
|
+
const trimmed = value.trim();
|
|
1820
|
+
if (!trimmed) {
|
|
1821
|
+
return void 0;
|
|
1822
|
+
}
|
|
1823
|
+
return trimmed;
|
|
1824
|
+
}
|
|
1825
|
+
function normalizeExpectedToolOrder(value) {
|
|
1826
|
+
if (!Array.isArray(value)) {
|
|
1827
|
+
return [];
|
|
1828
|
+
}
|
|
1829
|
+
const normalized = [];
|
|
1830
|
+
for (const item of value) {
|
|
1831
|
+
const toolName = normalizeToolName(item);
|
|
1832
|
+
if (toolName) {
|
|
1833
|
+
normalized.push(toolName);
|
|
1834
|
+
}
|
|
1835
|
+
}
|
|
1836
|
+
return normalized;
|
|
1837
|
+
}
|
|
1838
|
+
function normalizeMessageType(value) {
|
|
1839
|
+
if (typeof value !== "string") {
|
|
1840
|
+
return void 0;
|
|
1841
|
+
}
|
|
1842
|
+
return value.toLowerCase().replace(/-/g, "_");
|
|
1843
|
+
}
|
|
1844
|
+
function normalizeToolTypeName(value) {
|
|
1845
|
+
const normalized = value.toLowerCase();
|
|
1846
|
+
if (normalized.startsWith("tool-") && normalized !== "tool-call" && normalized !== "tool-result") {
|
|
1847
|
+
return normalizeToolName(value.slice(5));
|
|
1848
|
+
}
|
|
1849
|
+
if (normalized.startsWith("tool_") && normalized !== "tool_call" && normalized !== "tool_result") {
|
|
1850
|
+
return normalizeToolName(value.slice(5));
|
|
1851
|
+
}
|
|
1852
|
+
return void 0;
|
|
1853
|
+
}
|
|
1854
|
+
function checkToolOrder(actualTools, expectedOrder, strictMode) {
|
|
1855
|
+
if (strictMode) {
|
|
1856
|
+
if (actualTools.length !== expectedOrder.length) {
|
|
1857
|
+
return false;
|
|
1858
|
+
}
|
|
1859
|
+
for (let i = 0; i < expectedOrder.length; i++) {
|
|
1860
|
+
if (actualTools[i] !== expectedOrder[i]) {
|
|
1861
|
+
return false;
|
|
1862
|
+
}
|
|
1863
|
+
}
|
|
1864
|
+
return true;
|
|
1865
|
+
}
|
|
1866
|
+
let searchIndex = 0;
|
|
1867
|
+
for (const expectedTool of expectedOrder) {
|
|
1868
|
+
const foundIndex = actualTools.indexOf(expectedTool, searchIndex);
|
|
1869
|
+
if (foundIndex === -1) {
|
|
1870
|
+
return false;
|
|
1871
|
+
}
|
|
1872
|
+
searchIndex = foundIndex + 1;
|
|
1873
|
+
}
|
|
1874
|
+
return true;
|
|
1875
|
+
}
|
|
1876
|
+
function buildReason(evaluation) {
|
|
1877
|
+
const actualTools = (0, import_utils9.safeStringify)(evaluation.actualTools);
|
|
1878
|
+
if (evaluation.mode === "tool_order") {
|
|
1879
|
+
const expectedOrder = (0, import_utils9.safeStringify)(evaluation.expectedToolOrder ?? []);
|
|
1880
|
+
if (!evaluation.hasToolCalls) {
|
|
1881
|
+
return `No tool calls were made. Expected order: ${expectedOrder}.`;
|
|
1882
|
+
}
|
|
1883
|
+
if (evaluation.correctOrderCalled) {
|
|
1884
|
+
return `Tool calls matched expected order ${expectedOrder}. Actual tools: ${actualTools}.`;
|
|
1885
|
+
}
|
|
1886
|
+
return `Tool call order mismatch. Expected order ${expectedOrder}, actual tools ${actualTools}.`;
|
|
1887
|
+
}
|
|
1888
|
+
const expectedTool = evaluation.expectedTool ?? "";
|
|
1889
|
+
if (!evaluation.hasToolCalls) {
|
|
1890
|
+
return `No tool calls were made. Expected tool: "${expectedTool}".`;
|
|
1891
|
+
}
|
|
1892
|
+
if (evaluation.correctToolCalled) {
|
|
1893
|
+
return `Expected tool "${expectedTool}" was called. Actual tools: ${actualTools}.`;
|
|
1894
|
+
}
|
|
1895
|
+
return `Expected tool "${expectedTool}" was not called. Actual tools: ${actualTools}.`;
|
|
1896
|
+
}
|
|
1897
|
+
function isPlainRecord(value) {
|
|
1898
|
+
if (value === null || typeof value !== "object") {
|
|
1899
|
+
return false;
|
|
1900
|
+
}
|
|
1901
|
+
const proto = Object.getPrototypeOf(value);
|
|
1902
|
+
return proto === Object.prototype || proto === null;
|
|
1903
|
+
}
|
|
1904
|
+
function mergeMetadata6(base, additional) {
|
|
1905
|
+
return { ...base, ...additional };
|
|
1906
|
+
}
|
|
1907
|
+
|
|
1564
1908
|
// src/index.ts
|
|
1565
1909
|
var rawAutoEvalScorers = {
|
|
1566
1910
|
listContains: import_autoevals.ListContains,
|
|
@@ -1626,6 +1970,7 @@ function adaptScorerForAgentEval(definition, options) {
|
|
|
1626
1970
|
createModerationScorer,
|
|
1627
1971
|
createPossibleScorer,
|
|
1628
1972
|
createSummaryScorer,
|
|
1973
|
+
createToolCallAccuracyScorerCode,
|
|
1629
1974
|
createTranslationScorer,
|
|
1630
1975
|
normalizeScorerResult,
|
|
1631
1976
|
rawAutoEvalScorers,
|