mongodb-mcp-server 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/accuracy-tests.yml +55 -0
- package/.github/workflows/check.yml +1 -1
- package/.github/workflows/code_health.yaml +1 -1
- package/.github/workflows/code_health_fork.yaml +0 -14
- package/.github/workflows/dependabot_pr.yaml +26 -0
- package/.github/workflows/jira-issue.yml +72 -0
- package/.vscode/extensions.json +1 -1
- package/.vscode/launch.json +11 -1
- package/.vscode/settings.json +1 -11
- package/Dockerfile +1 -0
- package/README.md +118 -31
- package/dist/common/atlas/accessListUtils.js +36 -0
- package/dist/common/atlas/accessListUtils.js.map +1 -0
- package/dist/common/atlas/apiClient.js +25 -6
- package/dist/common/atlas/apiClient.js.map +1 -1
- package/dist/common/atlas/cluster.js +1 -1
- package/dist/common/atlas/cluster.js.map +1 -1
- package/dist/{config.js → common/config.js} +7 -1
- package/dist/common/config.js.map +1 -0
- package/dist/common/errors.js.map +1 -0
- package/dist/{logger.js → common/logger.js} +20 -18
- package/dist/common/logger.js.map +1 -0
- package/dist/common/managedTimeout.js +20 -0
- package/dist/common/managedTimeout.js.map +1 -0
- package/dist/common/packageInfo.js.map +1 -0
- package/dist/{session.js → common/session.js} +20 -21
- package/dist/common/session.js.map +1 -0
- package/dist/common/sessionStore.js +73 -0
- package/dist/common/sessionStore.js.map +1 -0
- package/dist/helpers/container.js +28 -0
- package/dist/helpers/container.js.map +1 -0
- package/dist/helpers/generatePassword.js.map +1 -0
- package/dist/helpers/indexCheck.js +1 -1
- package/dist/helpers/indexCheck.js.map +1 -1
- package/dist/index.js +30 -37
- package/dist/index.js.map +1 -1
- package/dist/server.js +43 -6
- package/dist/server.js.map +1 -1
- package/dist/telemetry/constants.js +1 -1
- package/dist/telemetry/constants.js.map +1 -1
- package/dist/telemetry/telemetry.js +28 -21
- package/dist/telemetry/telemetry.js.map +1 -1
- package/dist/tools/atlas/atlasTool.js +3 -3
- package/dist/tools/atlas/atlasTool.js.map +1 -1
- package/dist/tools/atlas/connect/connectCluster.js +198 -0
- package/dist/tools/atlas/connect/connectCluster.js.map +1 -0
- package/dist/tools/atlas/create/createAccessList.js +9 -10
- package/dist/tools/atlas/create/createAccessList.js.map +1 -1
- package/dist/tools/atlas/create/createDBUser.js +3 -1
- package/dist/tools/atlas/create/createDBUser.js.map +1 -1
- package/dist/tools/atlas/create/createFreeCluster.js +2 -0
- package/dist/tools/atlas/create/createFreeCluster.js.map +1 -1
- package/dist/tools/atlas/create/createProject.js.map +1 -1
- package/dist/tools/atlas/read/inspectAccessList.js.map +1 -1
- package/dist/tools/atlas/read/inspectCluster.js.map +1 -1
- package/dist/tools/atlas/read/listAlerts.js.map +1 -1
- package/dist/tools/atlas/read/listClusters.js.map +1 -1
- package/dist/tools/atlas/read/listDBUsers.js.map +1 -1
- package/dist/tools/atlas/read/listOrgs.js.map +1 -1
- package/dist/tools/atlas/read/listProjects.js.map +1 -1
- package/dist/tools/atlas/tools.js +1 -1
- package/dist/tools/atlas/tools.js.map +1 -1
- package/dist/tools/mongodb/{metadata → connect}/connect.js +7 -4
- package/dist/tools/mongodb/connect/connect.js.map +1 -0
- package/dist/tools/mongodb/create/createCollection.js.map +1 -1
- package/dist/tools/mongodb/create/createIndex.js +1 -1
- package/dist/tools/mongodb/create/createIndex.js.map +1 -1
- package/dist/tools/mongodb/create/insertMany.js +1 -1
- package/dist/tools/mongodb/create/insertMany.js.map +1 -1
- package/dist/tools/mongodb/delete/deleteMany.js +2 -1
- package/dist/tools/mongodb/delete/deleteMany.js.map +1 -1
- package/dist/tools/mongodb/delete/dropCollection.js.map +1 -1
- package/dist/tools/mongodb/delete/dropDatabase.js.map +1 -1
- package/dist/tools/mongodb/metadata/collectionSchema.js.map +1 -1
- package/dist/tools/mongodb/metadata/collectionStorageSize.js.map +1 -1
- package/dist/tools/mongodb/metadata/dbStats.js.map +1 -1
- package/dist/tools/mongodb/metadata/explain.js +1 -1
- package/dist/tools/mongodb/metadata/explain.js.map +1 -1
- package/dist/tools/mongodb/metadata/listCollections.js.map +1 -1
- package/dist/tools/mongodb/metadata/listDatabases.js.map +1 -1
- package/dist/tools/mongodb/metadata/logs.js.map +1 -1
- package/dist/tools/mongodb/mongodbTool.js +37 -10
- package/dist/tools/mongodb/mongodbTool.js.map +1 -1
- package/dist/tools/mongodb/read/aggregate.js +1 -1
- package/dist/tools/mongodb/read/aggregate.js.map +1 -1
- package/dist/tools/mongodb/read/collectionIndexes.js.map +1 -1
- package/dist/tools/mongodb/read/count.js +2 -1
- package/dist/tools/mongodb/read/count.js.map +1 -1
- package/dist/tools/mongodb/read/find.js +7 -4
- package/dist/tools/mongodb/read/find.js.map +1 -1
- package/dist/tools/mongodb/tools.js +1 -1
- package/dist/tools/mongodb/tools.js.map +1 -1
- package/dist/tools/mongodb/update/renameCollection.js.map +1 -1
- package/dist/tools/mongodb/update/updateMany.js +4 -2
- package/dist/tools/mongodb/update/updateMany.js.map +1 -1
- package/dist/tools/tool.js +8 -5
- package/dist/tools/tool.js.map +1 -1
- package/dist/transports/base.js +26 -0
- package/dist/transports/base.js.map +1 -0
- package/dist/{helpers/EJsonTransport.js → transports/stdio.js} +24 -2
- package/dist/transports/stdio.js.map +1 -0
- package/dist/transports/streamableHttp.js +140 -0
- package/dist/transports/streamableHttp.js.map +1 -0
- package/eslint.config.js +13 -4
- package/package.json +43 -33
- package/resources/test-summary-template.html +415 -0
- package/scripts/accuracy/generateTestSummary.ts +335 -0
- package/scripts/accuracy/runAccuracyTests.sh +45 -0
- package/scripts/accuracy/updateAccuracyRunStatus.ts +21 -0
- package/src/common/atlas/accessListUtils.ts +54 -0
- package/src/common/atlas/apiClient.ts +25 -6
- package/src/common/atlas/cluster.ts +1 -1
- package/src/{config.ts → common/config.ts} +14 -2
- package/src/{logger.ts → common/logger.ts} +21 -23
- package/src/common/managedTimeout.ts +27 -0
- package/src/{session.ts → common/session.ts} +24 -26
- package/src/common/sessionStore.ts +111 -0
- package/src/helpers/container.ts +35 -0
- package/src/helpers/indexCheck.ts +1 -1
- package/src/index.ts +30 -40
- package/src/server.ts +54 -10
- package/src/telemetry/constants.ts +1 -1
- package/src/telemetry/telemetry.ts +34 -26
- package/src/telemetry/types.ts +2 -0
- package/src/tools/atlas/atlasTool.ts +4 -4
- package/src/tools/atlas/connect/connectCluster.ts +259 -0
- package/src/tools/atlas/create/createAccessList.ts +15 -13
- package/src/tools/atlas/create/createDBUser.ts +5 -3
- package/src/tools/atlas/create/createFreeCluster.ts +4 -2
- package/src/tools/atlas/create/createProject.ts +2 -2
- package/src/tools/atlas/read/inspectAccessList.ts +2 -2
- package/src/tools/atlas/read/inspectCluster.ts +2 -2
- package/src/tools/atlas/read/listAlerts.ts +2 -2
- package/src/tools/atlas/read/listClusters.ts +2 -2
- package/src/tools/atlas/read/listDBUsers.ts +2 -2
- package/src/tools/atlas/read/listOrgs.ts +2 -2
- package/src/tools/atlas/read/listProjects.ts +2 -2
- package/src/tools/atlas/tools.ts +1 -1
- package/src/tools/mongodb/{metadata → connect}/connect.ts +12 -9
- package/src/tools/mongodb/create/createCollection.ts +2 -2
- package/src/tools/mongodb/create/createIndex.ts +3 -3
- package/src/tools/mongodb/create/insertMany.ts +3 -3
- package/src/tools/mongodb/delete/deleteMany.ts +4 -3
- package/src/tools/mongodb/delete/dropCollection.ts +2 -2
- package/src/tools/mongodb/delete/dropDatabase.ts +2 -2
- package/src/tools/mongodb/metadata/collectionSchema.ts +2 -2
- package/src/tools/mongodb/metadata/collectionStorageSize.ts +2 -2
- package/src/tools/mongodb/metadata/dbStats.ts +2 -2
- package/src/tools/mongodb/metadata/explain.ts +3 -3
- package/src/tools/mongodb/metadata/listCollections.ts +2 -2
- package/src/tools/mongodb/metadata/listDatabases.ts +2 -2
- package/src/tools/mongodb/metadata/logs.ts +2 -2
- package/src/tools/mongodb/mongodbTool.ts +50 -14
- package/src/tools/mongodb/read/aggregate.ts +3 -3
- package/src/tools/mongodb/read/collectionIndexes.ts +2 -2
- package/src/tools/mongodb/read/count.ts +4 -3
- package/src/tools/mongodb/read/find.ts +11 -6
- package/src/tools/mongodb/tools.ts +1 -1
- package/src/tools/mongodb/update/renameCollection.ts +2 -2
- package/src/tools/mongodb/update/updateMany.ts +6 -4
- package/src/tools/tool.ts +18 -13
- package/src/transports/base.ts +34 -0
- package/src/{helpers/EJsonTransport.ts → transports/stdio.ts} +30 -1
- package/src/transports/streamableHttp.ts +178 -0
- package/tests/accuracy/aggregate.test.ts +27 -0
- package/tests/accuracy/collectionIndexes.test.ts +40 -0
- package/tests/accuracy/collectionSchema.test.ts +28 -0
- package/tests/accuracy/collectionStorageSize.test.ts +41 -0
- package/tests/accuracy/count.test.ts +44 -0
- package/tests/accuracy/createCollection.test.ts +46 -0
- package/tests/accuracy/createIndex.test.ts +37 -0
- package/tests/accuracy/dbStats.test.ts +15 -0
- package/tests/accuracy/deleteMany.test.ts +44 -0
- package/tests/accuracy/dropCollection.test.ts +74 -0
- package/tests/accuracy/dropDatabase.test.ts +41 -0
- package/tests/accuracy/explain.test.ts +73 -0
- package/tests/accuracy/find.test.ts +114 -0
- package/tests/accuracy/insertMany.test.ts +48 -0
- package/tests/accuracy/listCollections.test.ts +60 -0
- package/tests/accuracy/listDatabases.test.ts +31 -0
- package/tests/accuracy/logs.test.ts +28 -0
- package/tests/accuracy/renameCollection.test.ts +31 -0
- package/tests/accuracy/sdk/accuracyResultStorage/diskStorage.ts +189 -0
- package/tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.ts +11 -0
- package/tests/accuracy/sdk/accuracyResultStorage/mongodbStorage.ts +151 -0
- package/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts +117 -0
- package/tests/accuracy/sdk/accuracyScorer.ts +93 -0
- package/tests/accuracy/sdk/accuracyTestingClient.ts +94 -0
- package/tests/accuracy/sdk/agent.ts +56 -0
- package/tests/accuracy/sdk/constants.ts +26 -0
- package/tests/accuracy/sdk/describeAccuracyTests.ts +126 -0
- package/tests/accuracy/sdk/gitInfo.ts +7 -0
- package/tests/accuracy/sdk/matcher.ts +193 -0
- package/tests/accuracy/sdk/models.ts +95 -0
- package/tests/accuracy/test-data-dumps/comics.books.json +417 -0
- package/tests/accuracy/test-data-dumps/comics.characters.json +402 -0
- package/tests/accuracy/test-data-dumps/mflix.movies.json +496 -0
- package/tests/accuracy/test-data-dumps/mflix.shows.json +572 -0
- package/tests/accuracy/updateMany.test.ts +42 -0
- package/tests/integration/helpers.ts +9 -9
- package/tests/integration/indexCheck.test.ts +1 -0
- package/tests/integration/server.test.ts +1 -0
- package/tests/integration/telemetry.test.ts +4 -3
- package/tests/integration/tools/atlas/accessLists.test.ts +22 -2
- package/tests/integration/tools/atlas/alerts.test.ts +3 -2
- package/tests/integration/tools/atlas/atlasHelpers.ts +3 -0
- package/tests/integration/tools/atlas/clusters.test.ts +68 -16
- package/tests/integration/tools/atlas/dbUsers.test.ts +14 -1
- package/tests/integration/tools/atlas/orgs.test.ts +2 -1
- package/tests/integration/tools/atlas/projects.test.ts +4 -3
- package/tests/integration/tools/mongodb/{metadata → connect}/connect.test.ts +34 -3
- package/tests/integration/tools/mongodb/create/createCollection.test.ts +1 -0
- package/tests/integration/tools/mongodb/create/createIndex.test.ts +1 -0
- package/tests/integration/tools/mongodb/create/insertMany.test.ts +1 -0
- package/tests/integration/tools/mongodb/delete/deleteMany.test.ts +1 -0
- package/tests/integration/tools/mongodb/delete/dropCollection.test.ts +1 -1
- package/tests/integration/tools/mongodb/delete/dropDatabase.test.ts +1 -0
- package/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts +1 -0
- package/tests/integration/tools/mongodb/metadata/collectionStorageSize.test.ts +1 -0
- package/tests/integration/tools/mongodb/metadata/dbStats.test.ts +1 -0
- package/tests/integration/tools/mongodb/metadata/explain.test.ts +1 -0
- package/tests/integration/tools/mongodb/metadata/listCollections.test.ts +1 -0
- package/tests/integration/tools/mongodb/metadata/listDatabases.test.ts +3 -2
- package/tests/integration/tools/mongodb/metadata/logs.test.ts +1 -0
- package/tests/integration/tools/mongodb/mongodbHelpers.ts +66 -2
- package/tests/integration/tools/mongodb/read/aggregate.test.ts +2 -1
- package/tests/integration/tools/mongodb/read/collectionIndexes.test.ts +1 -0
- package/tests/integration/tools/mongodb/read/count.test.ts +1 -0
- package/tests/integration/tools/mongodb/read/find.test.ts +2 -1
- package/tests/integration/tools/mongodb/update/renameCollection.test.ts +1 -0
- package/tests/integration/tools/mongodb/update/updateMany.test.ts +1 -0
- package/tests/integration/transports/stdio.test.ts +40 -0
- package/tests/integration/transports/streamableHttp.test.ts +56 -0
- package/tests/matchers/toIncludeSameMembers.test.ts +59 -0
- package/tests/matchers/toIncludeSameMembers.ts +12 -0
- package/tests/setup.ts +7 -0
- package/tests/unit/accessListUtils.test.ts +39 -0
- package/tests/unit/accuracyScorer.test.ts +390 -0
- package/tests/unit/{apiClient.test.ts → common/apiClient.test.ts} +15 -15
- package/tests/unit/common/managedTimeout.test.ts +67 -0
- package/tests/unit/{session.test.ts → common/session.test.ts} +7 -12
- package/tests/unit/{indexCheck.test.ts → helpers/indexCheck.test.ts} +2 -1
- package/tests/unit/telemetry.test.ts +52 -42
- package/tests/unit/{EJsonTransport.test.ts → transports/stdio.test.ts} +4 -4
- package/tests/vitest.d.ts +11 -0
- package/tsconfig.json +0 -1
- package/{tsconfig.jest.json → tsconfig.test.json} +1 -2
- package/vitest.config.ts +41 -0
- package/dist/common/atlas/generatePassword.js.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/errors.js.map +0 -1
- package/dist/helpers/EJsonTransport.js.map +0 -1
- package/dist/helpers/packageInfo.js.map +0 -1
- package/dist/logger.js.map +0 -1
- package/dist/session.js.map +0 -1
- package/dist/tools/atlas/metadata/connectCluster.js +0 -100
- package/dist/tools/atlas/metadata/connectCluster.js.map +0 -1
- package/dist/tools/mongodb/metadata/connect.js.map +0 -1
- package/global.d.ts +0 -1
- package/jest.config.cjs +0 -22
- package/src/tools/atlas/metadata/connectCluster.ts +0 -121
- /package/dist/{errors.js → common/errors.js} +0 -0
- /package/dist/{helpers → common}/packageInfo.js +0 -0
- /package/dist/{common/atlas → helpers}/generatePassword.js +0 -0
- /package/src/{errors.ts → common/errors.ts} +0 -0
- /package/src/{helpers → common}/packageInfo.ts +0 -0
- /package/src/{common/atlas → helpers}/generatePassword.ts +0 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { Collection, MongoClient } from "mongodb";
|
|
2
|
+
import {
|
|
3
|
+
AccuracyResult,
|
|
4
|
+
AccuracyResultStorage,
|
|
5
|
+
AccuracyRunStatus,
|
|
6
|
+
AccuracyRunStatuses,
|
|
7
|
+
ExpectedToolCall,
|
|
8
|
+
ModelResponse,
|
|
9
|
+
} from "./resultStorage.js";
|
|
10
|
+
|
|
11
|
+
// Fields of a model response that are stripped before the response is stored.
// The list is empty because responses have been small enough so far to store
// in full. If stored results ever need to shrink, `"messages"` and `"text"`
// are the candidates to add here.
const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = [];
|
|
15
|
+
|
|
16
|
+
/**
 * AccuracyResultStorage implementation backed by a MongoDB collection. Results
 * are looked up and written by the `{ commitSHA, runId }` pair.
 */
export class MongoDBBasedResultStorage implements AccuracyResultStorage {
    private client: MongoClient;
    private resultCollection: Collection<AccuracyResult>;

    /**
     * @param connectionString Connection string handed to the MongoClient.
     * @param database Name of the database holding the results.
     * @param collection Name of the collection holding the results.
     */
    constructor(connectionString: string, database: string, collection: string) {
        this.client = new MongoClient(connectionString);
        this.resultCollection = this.client.db(database).collection<AccuracyResult>(collection);
    }

    /**
     * Fetches the most recently created result for the commit.
     *
     * With a `runId` the result is returned regardless of its run status.
     * Without one, only runs marked `Done` are considered, because asking
     * for a commit alone means asking for its last successful run.
     *
     * @returns The matching result, or `null` when there is none.
     */
    async getAccuracyResult(commitSHA: string, runId?: string): Promise<AccuracyResult | null> {
        const filters: Partial<AccuracyResult> = runId
            ? { commitSHA, runId }
            : { commitSHA, runStatus: AccuracyRunStatus.Done };

        return await this.resultCollection.findOne(filters, {
            sort: {
                createdOn: -1,
            },
        });
    }

    /**
     * Sets the run status on the result stored for `{ commitSHA, runId }`.
     * No upsert is requested, so nothing is written when no such result exists.
     */
    async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise<void> {
        await this.resultCollection.updateOne(
            { commitSHA, runId },
            {
                $set: {
                    runStatus: status,
                },
            }
        );
    }

    /**
     * Records a model response for a prompt in a single upserting pipeline
     * update on the `{ commitSHA, runId }` document.
     *
     * - Missing top-level fields are initialised (status `InProgress`,
     *   `createdOn` set to now, empty `promptResults`).
     * - If the prompt is not yet in `promptResults`, a new entry is appended.
     * - Otherwise the response is appended to that entry's `modelResponses`
     *   and its `expectedToolCalls` are overwritten.
     */
    async saveModelResponseForPrompt({
        commitSHA,
        runId,
        prompt,
        expectedToolCalls,
        modelResponse,
    }: {
        commitSHA: string;
        runId: string;
        prompt: string;
        expectedToolCalls: ExpectedToolCall[];
        modelResponse: ModelResponse;
    }): Promise<void> {
        const modelResponseToSave: ModelResponse = {
            ...modelResponse,
        };

        for (const field of OMITTED_MODEL_RESPONSE_FIELDS) {
            delete modelResponseToSave[field];
        }

        // Inside an aggregation pipeline a bare string starting with `$` is
        // parsed as a field path or variable. Caller-supplied strings are
        // therefore always passed through `$literal` so they are compared and
        // stored verbatim.
        const promptLiteral = { $literal: prompt };

        // Entry appended when this prompt has no result yet.
        const newPromptResult = {
            $literal: {
                prompt,
                expectedToolCalls,
                modelResponses: [modelResponseToSave],
            },
        };

        // Replacement for the existing entry of this prompt.
        const updatedPromptResult = {
            prompt: "$$promptResult.prompt",
            expectedToolCalls: {
                $literal: expectedToolCalls,
            },
            modelResponses: {
                $concatArrays: ["$$promptResult.modelResponses", [{ $literal: modelResponseToSave }]],
            },
        };

        await this.resultCollection.updateOne(
            { commitSHA, runId },
            [
                {
                    $set: {
                        runStatus: { $ifNull: ["$runStatus", AccuracyRunStatus.InProgress] },
                        createdOn: { $ifNull: ["$createdOn", Date.now()] },
                        commitSHA: { $ifNull: ["$commitSHA", { $literal: commitSHA }] },
                        runId: { $ifNull: ["$runId", { $literal: runId }] },
                        promptResults: {
                            $ifNull: ["$promptResults", []],
                        },
                    },
                },
                {
                    $set: {
                        promptResults: {
                            $let: {
                                vars: {
                                    existingPromptIndex: {
                                        $indexOfArray: ["$promptResults.prompt", promptLiteral],
                                    },
                                },
                                in: {
                                    $cond: [
                                        { $eq: ["$$existingPromptIndex", -1] },
                                        {
                                            $concatArrays: ["$promptResults", [newPromptResult]],
                                        },
                                        {
                                            $map: {
                                                input: "$promptResults",
                                                as: "promptResult",
                                                in: {
                                                    $cond: [
                                                        { $eq: ["$$promptResult.prompt", promptLiteral] },
                                                        updatedPromptResult,
                                                        "$$promptResult",
                                                    ],
                                                },
                                            },
                                        },
                                    ],
                                },
                            },
                        },
                    },
                },
            ],
            { upsert: true }
        );
    }

    /** Closes the underlying MongoClient. */
    async close(): Promise<void> {
        await this.client.close();
    }
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/** A single tool invocation made by the LLM. */
export interface LLMToolCall {
    /** Identifier of this particular call. */
    toolCallId: string;
    /** Name of the tool that was called. */
    toolName: string;
    /** Arguments the tool was called with. */
    parameters: Record<string, unknown>;
}
|
|
6
|
+
|
|
7
|
+
/** A tool call a test expects: an LLMToolCall without its `toolCallId`. */
export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;
|
|
8
|
+
|
|
9
|
+
/** The statuses an accuracy run can be in, as stored on an AccuracyResult. */
export const AccuracyRunStatus = {
    Done: "done",
    Failed: "failed",
    InProgress: "in-progress",
} as const;
|
|
14
|
+
|
|
15
|
+
/** Union of the string values of `AccuracyRunStatus`. */
export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus];
|
|
16
|
+
|
|
17
|
+
/** The stored outcome of one accuracy run for one commit. */
export interface AccuracyResult {
    /**
     * Unique id of the accuracy run. It is either generated by the script
     * that triggers the run or supplied through environment variables.
     */
    runId: string;
    /**
     * Status of the accuracy run. While a run is going, every completed test
     * is supposed to submit its entry with the InProgress status. Once the
     * run has finished, the status is updated to Done or Failed depending on
     * whether the run hit any errors.
     */
    runStatus: AccuracyRunStatuses;
    /** Timestamp of when this result entry was generated. */
    createdOn: number;
    /** Commit SHA the accuracy run was triggered for. */
    commitSHA: string;
    /** Results for each of the prompts tested in the accuracy run. */
    promptResults: PromptResult[];
}
|
|
40
|
+
|
|
41
|
+
/** Everything recorded for a single test prompt within an accuracy run. */
export interface PromptResult {
    /** The prompt that was given to the LLM as the test. */
    prompt: string;
    /**
     * The tools, with their parameters, that the LLM is expected to call for
     * this prompt.
     */
    expectedToolCalls: ExpectedToolCall[];
    /** Responses of the tested LLMs to this prompt. */
    modelResponses: ModelResponse[];
}
|
|
53
|
+
|
|
54
|
+
/** What one LLM returned for one test prompt, together with its score. */
export interface ModelResponse {
    /** The provider of the LLM APIs. */
    provider: string;
    /** The LLM that was asked to respond to the test prompt. */
    requestedModel: string;
    /** ID of the model that actually responded to the prompt request. */
    respondingModel: string;
    /** Total time the LLM took to respond to the prompt. */
    llmResponseTime: number;
    /**
     * A number between 0 and 1 describing how accurately the LLM called the
     * expected tools while responding to the prompt. For how this number is
     * derived, see `calculateToolCallingAccuracy` in accuracyScorer.ts.
     */
    toolCallingAccuracy: number;
    /**
     * The tools, with their parameters, that the LLM actually called in the
     * test.
     */
    llmToolCalls: LLMToolCall[];
    /** Token usage data returned as part of the LLM prompt response. */
    tokensUsed?: TokensUsed;
    /** The final response text the LLM generated for the prompt request. */
    text?: string;
    /**
     * Messages exchanged between the LLM and the testing agent while
     * handling the prompt request. Mostly useful for debugging.
     */
    messages?: Record<string, unknown>[];
}
|
|
89
|
+
|
|
90
|
+
/** Token usage counters for an LLM response; every counter is optional. */
interface TokensUsed {
    promptTokens?: number;
    completionTokens?: number;
    totalTokens?: number;
}
|
|
95
|
+
|
|
96
|
+
/** Contract for the stores that persist accuracy run results. */
export interface AccuracyResultStorage {
    /**
     * Retrieves the accuracy result for the given commit SHA and, optionally,
     * run id. Without a run id the implementation fetches the result of the
     * last successful accuracy run. With a run id it fetches the result
     * whatever the run status is.
     */
    getAccuracyResult(commitSHA: string, runId?: string): Promise<AccuracyResult | null>;
    /** Updates the status of the run. */
    updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise<void>;
    /**
     * Attempts to atomically insert the model response for the prompt into
     * the stored accuracy result.
     */
    saveModelResponseForPrompt(data: {
        commitSHA: string;
        runId: string;
        prompt: string;
        expectedToolCalls: ExpectedToolCall[];
        modelResponse: ModelResponse;
    }): Promise<void>;
    /** Closes the storage. */
    close(): Promise<void>;
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { ExpectedToolCall, LLMToolCall } from "./accuracyResultStorage/resultStorage.js";
|
|
2
|
+
import { Matcher } from "./matcher.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Tool calling accuracy is a single number calculated based on two dimensions.
 * 1. Did LLM call the right tool?
 * 2. Did LLM call the tool with correct and required parameters?
 *
 * The number can be one of:
 * - 0: When LLM:
 *    - did not call the right tool
 *    - did not call the tool with correct parameters
 * - 0.75: When LLM:
 *    - called the right tool but hallucinated and called some extra tools as
 *      well or called the same tool but with different parameters
 *    - called the right tool but hallucinated and called it with some
 *      non-required parameters
 * - 1: When LLM:
 *    - called exactly the tools that were expected
 *    - called the expected tools exactly with the expected parameters
 *
 * To calculate this number we must have:
 * 1. a list of expected tool calls with their expected parameters
 * 2. a list of LLM tool calls with their parameters
 *
 * For each expected tool call we find the best matching LLM tool call among
 * those not already paired. The best matching LLM tool call has:
 * 1. the same name as the expected tool call
 * 2. the highest parameter similarity score, which must be at least 0.75 to
 *    count as an actual match. When scores tie, the call that appears first
 *    in the LLM tool calls wins.
 *
 * Using the above logic we establish pairs between expected and actual tool
 * calls.
 *
 * 1. If the LLM made more tool calls than were expected, some of them cannot
 *    be paired, which means the LLM hallucinated the extra tool calls. For
 *    that reason the maximum achievable accuracy is capped at 0.75.
 *
 * 2. If an expected tool call cannot be paired with an LLM tool call, the LLM
 *    did not call one of the expected tools required to solve the problem.
 *    For that reason we mark the accuracy as 0 and exit early.
 *
 * 3. For each established pair, the parameter similarity score tells how
 *    correctly the parameters were provided. It follows the same number
 *    pattern described above:
 *    - 0    : for missing parameters, incorrect parameter values
 *    - 0.75 : for additional parameters
 *    - 1    : for a perfect match
 *
 * The final accuracy score is then calculated as the least of:
 * - Maximum achievable accuracy from #1
 * - The least of parameter similarity score from the established pairs in #3
 *
 * When no tool calls are expected, the score is 1 if the LLM made none and
 * 0.75 otherwise.
 *
 * For examples: see the test cases in - tests/unit/accuracyScorer.test.ts
 *
 * @param expectedToolCalls The tool calls the test expects.
 * @param actualToolCalls The tool calls the LLM made.
 * @returns The tool calling accuracy.
 */
export function calculateToolCallingAccuracy(
    expectedToolCalls: ExpectedToolCall[],
    actualToolCalls: LLMToolCall[]
): number {
    if (expectedToolCalls.length === 0) {
        return actualToolCalls.length === 0 ? 1 : 0.75;
    }

    // More actual calls than expected ones means at least one stays unpaired.
    let currentScore = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1;
    const pairedActualToolCallIndexes = new Set<number>();

    for (const expectedCall of expectedToolCalls) {
        let bestIndex = -1;
        let bestScore = 0;

        for (const [index, call] of actualToolCalls.entries()) {
            if (pairedActualToolCallIndexes.has(index) || call.toolName !== expectedCall.toolName) {
                continue;
            }

            const score = Matcher.value(expectedCall.parameters).match(call.parameters);
            // The strict `>` keeps the earliest call when scores tie.
            if (score >= 0.75 && score > bestScore) {
                bestIndex = index;
                bestScore = score;
            }
        }

        if (bestIndex === -1) {
            return 0; // No matching tool call found, return 0
        }

        pairedActualToolCallIndexes.add(bestIndex);
        currentScore = Math.min(currentScore, bestScore);
    }

    return currentScore;
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { v4 as uuid } from "uuid";
|
|
2
|
+
import { experimental_createMCPClient as createMCPClient, tool as createVercelTool } from "ai";
|
|
3
|
+
import { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
|
|
4
|
+
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
|
|
5
|
+
|
|
6
|
+
import { MCP_SERVER_CLI_SCRIPT } from "./constants.js";
|
|
7
|
+
import { LLMToolCall } from "./accuracyResultStorage/resultStorage.js";
|
|
8
|
+
import { VercelMCPClient, VercelMCPClientTools } from "./agent.js";
|
|
9
|
+
|
|
10
|
+
/** Produces the result of a mocked tool call, synchronously or as a promise. */
type ToolResultGeneratorFn = (...args: unknown[]) => CallToolResult | Promise<CallToolResult>;

/** Result generators for mocked tools, keyed by a string. */
export type MockedTools = Record<string, ToolResultGeneratorFn>;
|
|
12
|
+
|
|
13
|
+
/**
 * AccuracyTestingClient is a bridge between the actual MCP client connected to
 * our MCP server and our tool calling agent. It serves the following purposes:
 * 1. Captures actual tools provided by our MCP server
 * 2. Translates captured MCP tools to tool definitions that can be consumed by
 *    the tool calling agent (Ref: `vercelTools`)
 * 3. Allows dynamic mocking and resetting of mocks of individual tool calls.
 * 4. Records and provides tool calls made by LLMs with their parameters.
 *
 * Instances are created through `AccuracyTestingClient.initializeClient`.
 */
export class AccuracyTestingClient {
    private mockedTools: MockedTools = {};
    private llmToolCalls: LLMToolCall[] = [];

    private constructor(private readonly vercelMCPClient: VercelMCPClient) {}

    /** Closes the underlying MCP client. */
    async close(): Promise<void> {
        await this.vercelMCPClient?.close();
    }

    /**
     * Returns the MCP client's tools, each re-wrapped so that every call is
     * recorded and, when a mock is registered under the tool's name, answered
     * by the mock instead of the real tool.
     */
    async vercelTools(): Promise<VercelMCPClientTools> {
        const vercelTools = (await this.vercelMCPClient?.tools()) ?? {};
        const rewrappedVercelTools: VercelMCPClientTools = {};
        for (const [toolName, tool] of Object.entries(vercelTools)) {
            rewrappedVercelTools[toolName] = createVercelTool({
                ...tool,
                execute: async (args, options) => {
                    // Record the call before executing so failed calls are captured too.
                    this.llmToolCalls.push({
                        toolCallId: uuid(),
                        toolName,
                        parameters: args as Record<string, unknown>,
                    });
                    try {
                        const toolResultGeneratorFn = this.mockedTools[toolName];
                        if (toolResultGeneratorFn) {
                            return await toolResultGeneratorFn(args);
                        }

                        return await tool.execute(args, options);
                    } catch (error) {
                        // There are cases when LLM calls the tools incorrectly
                        // and the schema definition check fails. In production,
                        // the tool calling agents are deployed with this fail
                        // safe to allow LLM to course correct themselves. That
                        // is exactly what we do here as well.
                        return {
                            isError: true,
                            content: AccuracyTestingClient.serializeToolError(error),
                        };
                    }
                },
            });
        }

        return rewrappedVercelTools;
    }

    /** Returns the tool calls recorded since the last reset. */
    getLLMToolCalls(): LLMToolCall[] {
        return this.llmToolCalls;
    }

    /** Replaces the currently registered mocks with `mockedTools`. */
    mockTools(mockedTools: MockedTools): void {
        this.mockedTools = mockedTools;
    }

    /** Drops all mocks and all recorded tool calls. */
    resetForTests(): void {
        this.mockTools({});
        this.llmToolCalls = [];
    }

    /**
     * Spawns the MCP server CLI script over stdio, passing it the connection
     * string, and returns a testing client connected to it.
     */
    static async initializeClient(mdbConnectionString: string): Promise<AccuracyTestingClient> {
        const clientTransport = new StdioClientTransport({
            command: process.execPath,
            args: [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString],
        });

        const client = await createMCPClient({
            transport: clientTransport,
        });

        return new AccuracyTestingClient(client);
    }

    /**
     * Turns a thrown value into a string the LLM can act on.
     *
     * `JSON.stringify` skips an Error's non-enumerable `message`, so a plain
     * Error would serialise to `{}`. The name and message are therefore added
     * explicitly next to the error's own enumerable fields. Values that cannot
     * be serialised (circular references, `undefined`) fall back to `String()`.
     */
    private static serializeToolError(error: unknown): string {
        try {
            const serializable =
                error instanceof Error ? { ...error, name: error.name, message: error.message } : error;
            return JSON.stringify(serializable) ?? String(error);
        } catch {
            return String(error);
        }
    }
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { generateText, LanguageModelV1, experimental_createMCPClient } from "ai";
|
|
2
|
+
import { Model } from "./models.js";
|
|
3
|
+
|
|
4
|
+
// Baseline instructions given to the model on every run, one instruction per
// entry. The entries are joined with newlines when the system prompt is built.
const systemPrompt = [
    `The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119`,
    `You are an expert AI assistant with access to a set of tools for MongoDB database operations.`,
    `You MUST use the most relevant tool to answer the user's request`,
    `When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments`,
    `If a task requires multiple tool calls, you MUST call all the necessary tools in sequence, following the requirements mentioned above for each tool called.`,
    `If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don't know"`,
];
|
|
12
|
+
|
|
13
|
+
// These types are not exported by Vercel SDK so we derive them here to be
|
|
14
|
+
// re-used again.
|
|
15
|
+
/** The awaited return type of `experimental_createMCPClient`. */
export type VercelMCPClient = Awaited<ReturnType<typeof experimental_createMCPClient>>;
|
|
16
|
+
/** The awaited return type of the `tools` method of `VercelMCPClient`. */
export type VercelMCPClientTools = Awaited<ReturnType<VercelMCPClient["tools"]>>;
|
|
17
|
+
/** The agent type returned by `getVercelToolCallingAgent`. */
export type VercelAgent = ReturnType<typeof getVercelToolCallingAgent>;
|
|
18
|
+
|
|
19
|
+
/** Result of a single prompt run by the Vercel tool calling agent. */
export interface VercelAgentPromptResult {
    /** Text of the generation result. */
    text: string;

    /** Messages taken from the generation response. */
    messages: Record<string, unknown>[];

    /** Identifier of the model reported by the generation response. */
    respondingModel: string;

    /** Token usage reported for the generation, when available. */
    tokensUsage?: {
        promptTokens?: number;
        completionTokens?: number;
        totalTokens?: number;
    };
}
|
|
29
|
+
|
|
30
|
+
// SDK-agnostic agent contract, kept generic in case we need to switch to some
// other agent development SDK.
export interface Agent<TModel = unknown, TTools = unknown, TResult = unknown> {
    prompt(prompt: string, model: TModel, tools: TTools): Promise<TResult>;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Creates a tool calling agent that runs prompts through `generateText`.
 *
 * @param requestedSystemPrompt Optional extra instructions, appended after the
 * built-in system prompt entries.
 * @returns An agent whose `prompt` resolves with the generated text, the
 * response messages, the responding model id and the token usage.
 */
export function getVercelToolCallingAgent(
    requestedSystemPrompt?: string
): Agent<Model<LanguageModelV1>, VercelMCPClientTools, VercelAgentPromptResult> {
    return {
        async prompt(prompt: string, model: Model<LanguageModelV1>, tools: VercelMCPClientTools) {
            // Empty or missing additions are dropped before joining.
            const systemInstructions = [...systemPrompt, requestedSystemPrompt].filter(Boolean).join("\n");

            const { text, response, usage } = await generateText({
                model: model.getModel(),
                system: systemInstructions,
                prompt,
                tools,
                maxSteps: 100,
            });

            return {
                text,
                messages: response.messages,
                respondingModel: response.modelId,
                tokensUsage: usage,
            };
        },
    };
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import { fileURLToPath } from "url";
|
|
3
|
+
|
|
4
|
+
// `fileURLToPath(import.meta.url)` is the path of this module *file*; the
// directory containing it has to be derived with `path.dirname`.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Root directory: three levels above the directory that holds this module.
export const ROOT_DIR = path.join(__dirname, "..", "..", "..");

export const DIST_DIR = path.join(ROOT_DIR, "dist");

export const RESOURCES_DIR = path.join(ROOT_DIR, "resources");

export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js");

// NOTE(review): this used to resolve underneath the module file itself
// (".../<this file>/test-data-dumps"), which can never be a directory. It now
// resolves next to this module; confirm that is where the dumps are kept.
export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "test-data-dumps");

// Everything generated by accuracy runs is placed under this directory.
export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy");

export const ACCURACY_RESULTS_DIR = path.join(GENERATED_ASSETS_DIR, "results");

export const LATEST_ACCURACY_RUN_NAME = "latest-run";

export const HTML_TEST_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "test-summary.html");

export const MARKDOWN_TEST_BRIEF_FILE = path.join(GENERATED_ASSETS_DIR, "test-brief.md");

export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html");
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import { describe, it, beforeAll, beforeEach, afterAll } from "vitest";
|
|
2
|
+
import { getAvailableModels } from "./models.js";
|
|
3
|
+
import { calculateToolCallingAccuracy } from "./accuracyScorer.js";
|
|
4
|
+
import { getVercelToolCallingAgent, VercelAgent } from "./agent.js";
|
|
5
|
+
import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js";
|
|
6
|
+
import { AccuracyTestingClient, MockedTools } from "./accuracyTestingClient.js";
|
|
7
|
+
import { AccuracyResultStorage, ExpectedToolCall } from "./accuracyResultStorage/resultStorage.js";
|
|
8
|
+
import { getAccuracyResultStorage } from "./accuracyResultStorage/getAccuracyResultStorage.js";
|
|
9
|
+
import { getCommitSHA } from "./gitInfo.js";
|
|
10
|
+
|
|
11
|
+
/** Configuration of a single accuracy test case. */
export interface AccuracyTestConfig {
    /** Prompt handed to the LLM under evaluation. */
    prompt: string;

    /**
     * Tools, together with their parameters, that the LLM is expected to call
     * for this prompt.
     *
     * Ideally this lists only the bare minimum of critical tool calls needed
     * to solve the problem stated in the prompt. However, for even a slightly
     * vague prompt the LLM might decide to do additional confirmation by
     * calling other tools, so it's fine to include those calls as well in
     * order to reach a perfect tool calling accuracy score of 1.
     */
    expectedToolCalls: ExpectedToolCall[];

    /** Additional system prompt, appended to the already injected system prompt. */
    systemPrompt?: string;

    /**
     * Whether a small hint is appended to the prompt in the test, telling the
     * LLM to assume that the MCP server is already connected so that it does
     * not call the connect tool.
     *
     * Assumed to be true by default.
     */
    injectConnectedAssumption?: boolean;

    /**
     * Mocked implementations keyed by tool name. When a mocked implementation
     * is available, the testing client prefers it over the actual MCP tool
     * call.
     */
    mockedTools?: MockedTools;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Registers the accuracy tests: one `describe` block per available model and,
 * inside it, one test per entry of `accuracyTestConfigs`.
 *
 * Each test prompts the model through the tool calling agent, scores the
 * recorded tool calls against the expected ones and saves the outcome to the
 * accuracy result storage.
 *
 * @param accuracyTestConfigs The test cases to run against every model.
 * @throws Error when the `MDB_ACCURACY_RUN_ID` env variable is not set or when
 * no models are available.
 */
export function describeAccuracyTests(accuracyTestConfigs: AccuracyTestConfig[]) {
    if (!process.env.MDB_ACCURACY_RUN_ID) {
        throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!");
    }

    const models = getAvailableModels();
    if (!models.length) {
        throw new Error("No models available to test. Ensure that the API keys are properly setup!");
    }

    const eachModel = describe.each(models);

    eachModel(`$displayName`, function (model) {
        const accuracyRunId = `${process.env.MDB_ACCURACY_RUN_ID}`;
        const mdbIntegration = setupMongoDBIntegrationTest();
        const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration);

        let commitSHA: string;
        let accuracyResultStorage: AccuracyResultStorage;
        let testMCPClient: AccuracyTestingClient;
        let agent: VercelAgent;

        beforeAll(async () => {
            const retrievedCommitSHA = await getCommitSHA();
            if (!retrievedCommitSHA) {
                throw new Error("Could not derive commitSHA, exiting accuracy tests!");
            }
            commitSHA = retrievedCommitSHA;

            accuracyResultStorage = getAccuracyResultStorage();
            testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString());
            agent = getVercelToolCallingAgent();
        });

        beforeEach(async () => {
            await cleanupTestDatabases(mdbIntegration);
            await populateTestData();
            testMCPClient.resetForTests();
        });

        afterAll(async () => {
            // Close the MCP client even when closing the result storage fails,
            // so that the client is never left open behind a storage error.
            try {
                await accuracyResultStorage?.close();
            } finally {
                await testMCPClient?.close();
            }
        });

        const eachTest = it.each(accuracyTestConfigs);

        eachTest("$prompt", async function (testConfig) {
            testMCPClient.mockTools(testConfig.mockedTools ?? {});
            const toolsForModel = await testMCPClient.vercelTools();
            const promptForModel =
                testConfig.injectConnectedAssumption === false
                    ? testConfig.prompt
                    : [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ");

            // Honor the per-test system prompt; without one the shared agent
            // created in `beforeAll` is used.
            const agentForTest = testConfig.systemPrompt ? getVercelToolCallingAgent(testConfig.systemPrompt) : agent;

            const timeBeforePrompt = Date.now();
            const result = await agentForTest.prompt(promptForModel, model, toolsForModel);
            const timeAfterPrompt = Date.now();

            const llmToolCalls = testMCPClient.getLLMToolCalls();
            const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls);

            const responseTime = timeAfterPrompt - timeBeforePrompt;
            await accuracyResultStorage.saveModelResponseForPrompt({
                commitSHA,
                runId: accuracyRunId,
                prompt: testConfig.prompt,
                expectedToolCalls: testConfig.expectedToolCalls,
                modelResponse: {
                    provider: model.provider,
                    requestedModel: model.modelName,
                    respondingModel: result.respondingModel,
                    llmResponseTime: responseTime,
                    toolCallingAccuracy: toolCallingAccuracy,
                    llmToolCalls: llmToolCalls,
                    tokensUsed: result.tokensUsage,
                    text: result.text,
                    messages: result.messages,
                },
            });
        });
    });
}
|