scorecard-ai-mcp 1.0.0-alpha.9 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/README.md +19 -14
  2. package/compat.d.mts +2 -0
  3. package/compat.d.mts.map +1 -1
  4. package/compat.d.ts +2 -0
  5. package/compat.d.ts.map +1 -1
  6. package/compat.js +38 -1
  7. package/compat.js.map +1 -1
  8. package/compat.mjs +37 -0
  9. package/compat.mjs.map +1 -1
  10. package/index.js.map +1 -1
  11. package/index.mjs.map +1 -1
  12. package/options.d.mts +9 -6
  13. package/options.d.mts.map +1 -1
  14. package/options.d.ts +9 -6
  15. package/options.d.ts.map +1 -1
  16. package/options.js +5 -46
  17. package/options.js.map +1 -1
  18. package/options.mjs +5 -46
  19. package/options.mjs.map +1 -1
  20. package/package.json +4 -4
  21. package/server.d.mts +16 -2
  22. package/server.d.mts.map +1 -1
  23. package/server.d.ts +16 -2
  24. package/server.d.ts.map +1 -1
  25. package/server.js +28 -10
  26. package/server.js.map +1 -1
  27. package/server.mjs +28 -11
  28. package/server.mjs.map +1 -1
  29. package/src/compat.ts +40 -0
  30. package/src/index.ts +2 -2
  31. package/src/options.ts +12 -54
  32. package/src/server.ts +40 -15
  33. package/src/tools/index.ts +13 -8
  34. package/src/tools/metrics/create-metrics.ts +252 -0
  35. package/src/tools/metrics/update-metrics.ts +252 -0
  36. package/src/tools/runs/create-runs.ts +2 -2
  37. package/src/tools/systems/delete-systems.ts +1 -1
  38. package/src/tools/systems/versions/create-systems-versions.ts +45 -0
  39. package/src/tools/{system-configs/get-system-configs.ts → systems/versions/get-systems-versions.ts} +9 -9
  40. package/src/tools/{system-configs/list-system-configs.ts → systems/versions/list-systems-versions.ts} +6 -6
  41. package/src/tools/testsets/get-testsets.ts +1 -1
  42. package/src/tools/types.ts +0 -1
  43. package/tools/index.d.mts.map +1 -1
  44. package/tools/index.d.ts.map +1 -1
  45. package/tools/index.js +13 -8
  46. package/tools/index.js.map +1 -1
  47. package/tools/index.mjs +13 -8
  48. package/tools/index.mjs.map +1 -1
  49. package/tools/{system-configs/list-system-configs.d.mts → metrics/create-metrics.d.mts} +10 -1
  50. package/tools/metrics/create-metrics.d.mts.map +1 -0
  51. package/tools/{system-configs/create-system-configs.d.ts → metrics/create-metrics.d.ts} +10 -1
  52. package/tools/metrics/create-metrics.d.ts.map +1 -0
  53. package/tools/metrics/create-metrics.js +245 -0
  54. package/tools/metrics/create-metrics.js.map +1 -0
  55. package/tools/metrics/create-metrics.mjs +241 -0
  56. package/tools/metrics/create-metrics.mjs.map +1 -0
  57. package/tools/{system-configs/get-system-configs.d.mts → metrics/update-metrics.d.mts} +10 -1
  58. package/tools/metrics/update-metrics.d.mts.map +1 -0
  59. package/tools/{system-configs/get-system-configs.d.ts → metrics/update-metrics.d.ts} +10 -1
  60. package/tools/metrics/update-metrics.d.ts.map +1 -0
  61. package/tools/metrics/update-metrics.js +245 -0
  62. package/tools/metrics/update-metrics.js.map +1 -0
  63. package/tools/metrics/update-metrics.mjs +241 -0
  64. package/tools/metrics/update-metrics.mjs.map +1 -0
  65. package/tools/projects/create-projects.d.mts +9 -0
  66. package/tools/projects/create-projects.d.mts.map +1 -1
  67. package/tools/projects/create-projects.d.ts +9 -0
  68. package/tools/projects/create-projects.d.ts.map +1 -1
  69. package/tools/projects/list-projects.d.mts +9 -0
  70. package/tools/projects/list-projects.d.mts.map +1 -1
  71. package/tools/projects/list-projects.d.ts +9 -0
  72. package/tools/projects/list-projects.d.ts.map +1 -1
  73. package/tools/records/create-records.d.mts +9 -0
  74. package/tools/records/create-records.d.mts.map +1 -1
  75. package/tools/records/create-records.d.ts +9 -0
  76. package/tools/records/create-records.d.ts.map +1 -1
  77. package/tools/runs/create-runs.d.mts +9 -0
  78. package/tools/runs/create-runs.d.mts.map +1 -1
  79. package/tools/runs/create-runs.d.ts +9 -0
  80. package/tools/runs/create-runs.d.ts.map +1 -1
  81. package/tools/runs/create-runs.js +2 -2
  82. package/tools/runs/create-runs.js.map +1 -1
  83. package/tools/runs/create-runs.mjs +2 -2
  84. package/tools/runs/create-runs.mjs.map +1 -1
  85. package/tools/scores/upsert-scores.d.mts +9 -0
  86. package/tools/scores/upsert-scores.d.mts.map +1 -1
  87. package/tools/scores/upsert-scores.d.ts +9 -0
  88. package/tools/scores/upsert-scores.d.ts.map +1 -1
  89. package/tools/systems/create-systems.d.mts +9 -0
  90. package/tools/systems/create-systems.d.mts.map +1 -1
  91. package/tools/systems/create-systems.d.ts +9 -0
  92. package/tools/systems/create-systems.d.ts.map +1 -1
  93. package/tools/systems/delete-systems.d.mts +9 -0
  94. package/tools/systems/delete-systems.d.mts.map +1 -1
  95. package/tools/systems/delete-systems.d.ts +9 -0
  96. package/tools/systems/delete-systems.d.ts.map +1 -1
  97. package/tools/systems/delete-systems.js +1 -1
  98. package/tools/systems/delete-systems.js.map +1 -1
  99. package/tools/systems/delete-systems.mjs +1 -1
  100. package/tools/systems/delete-systems.mjs.map +1 -1
  101. package/tools/systems/get-systems.d.mts +9 -0
  102. package/tools/systems/get-systems.d.mts.map +1 -1
  103. package/tools/systems/get-systems.d.ts +9 -0
  104. package/tools/systems/get-systems.d.ts.map +1 -1
  105. package/tools/systems/list-systems.d.mts +9 -0
  106. package/tools/systems/list-systems.d.mts.map +1 -1
  107. package/tools/systems/list-systems.d.ts +9 -0
  108. package/tools/systems/list-systems.d.ts.map +1 -1
  109. package/tools/systems/update-systems.d.mts +9 -0
  110. package/tools/systems/update-systems.d.mts.map +1 -1
  111. package/tools/systems/update-systems.d.ts +9 -0
  112. package/tools/systems/update-systems.d.ts.map +1 -1
  113. package/tools/systems/versions/create-systems-versions.d.mts +41 -0
  114. package/tools/systems/versions/create-systems-versions.d.mts.map +1 -0
  115. package/tools/systems/versions/create-systems-versions.d.ts +41 -0
  116. package/tools/systems/versions/create-systems-versions.d.ts.map +1 -0
  117. package/tools/systems/versions/create-systems-versions.js +40 -0
  118. package/tools/systems/versions/create-systems-versions.js.map +1 -0
  119. package/tools/systems/versions/create-systems-versions.mjs +36 -0
  120. package/tools/systems/versions/create-systems-versions.mjs.map +1 -0
  121. package/tools/systems/versions/get-systems-versions.d.mts +41 -0
  122. package/tools/systems/versions/get-systems-versions.d.mts.map +1 -0
  123. package/tools/{system-configs/create-system-configs.d.mts → systems/versions/get-systems-versions.d.ts} +11 -2
  124. package/tools/systems/versions/get-systems-versions.d.ts.map +1 -0
  125. package/tools/{system-configs/get-system-configs.js → systems/versions/get-systems-versions.js} +9 -9
  126. package/tools/systems/versions/get-systems-versions.js.map +1 -0
  127. package/tools/{system-configs/get-system-configs.mjs → systems/versions/get-systems-versions.mjs} +9 -9
  128. package/tools/systems/versions/get-systems-versions.mjs.map +1 -0
  129. package/tools/systems/versions/list-systems-versions.d.mts +41 -0
  130. package/tools/systems/versions/list-systems-versions.d.mts.map +1 -0
  131. package/tools/systems/versions/list-systems-versions.d.ts +41 -0
  132. package/tools/systems/versions/list-systems-versions.d.ts.map +1 -0
  133. package/tools/{system-configs/list-system-configs.js → systems/versions/list-systems-versions.js} +6 -6
  134. package/tools/systems/versions/list-systems-versions.js.map +1 -0
  135. package/tools/{system-configs/list-system-configs.mjs → systems/versions/list-systems-versions.mjs} +6 -6
  136. package/tools/systems/versions/list-systems-versions.mjs.map +1 -0
  137. package/tools/testcases/create-testcases.d.mts +9 -0
  138. package/tools/testcases/create-testcases.d.mts.map +1 -1
  139. package/tools/testcases/create-testcases.d.ts +9 -0
  140. package/tools/testcases/create-testcases.d.ts.map +1 -1
  141. package/tools/testcases/delete-testcases.d.mts +9 -0
  142. package/tools/testcases/delete-testcases.d.mts.map +1 -1
  143. package/tools/testcases/delete-testcases.d.ts +9 -0
  144. package/tools/testcases/delete-testcases.d.ts.map +1 -1
  145. package/tools/testcases/get-testcases.d.mts +9 -0
  146. package/tools/testcases/get-testcases.d.mts.map +1 -1
  147. package/tools/testcases/get-testcases.d.ts +9 -0
  148. package/tools/testcases/get-testcases.d.ts.map +1 -1
  149. package/tools/testcases/list-testcases.d.mts +9 -0
  150. package/tools/testcases/list-testcases.d.mts.map +1 -1
  151. package/tools/testcases/list-testcases.d.ts +9 -0
  152. package/tools/testcases/list-testcases.d.ts.map +1 -1
  153. package/tools/testcases/update-testcases.d.mts +9 -0
  154. package/tools/testcases/update-testcases.d.mts.map +1 -1
  155. package/tools/testcases/update-testcases.d.ts +9 -0
  156. package/tools/testcases/update-testcases.d.ts.map +1 -1
  157. package/tools/testsets/create-testsets.d.mts +9 -0
  158. package/tools/testsets/create-testsets.d.mts.map +1 -1
  159. package/tools/testsets/create-testsets.d.ts +9 -0
  160. package/tools/testsets/create-testsets.d.ts.map +1 -1
  161. package/tools/testsets/delete-testsets.d.mts +9 -0
  162. package/tools/testsets/delete-testsets.d.mts.map +1 -1
  163. package/tools/testsets/delete-testsets.d.ts +9 -0
  164. package/tools/testsets/delete-testsets.d.ts.map +1 -1
  165. package/tools/testsets/get-testsets.d.mts +9 -0
  166. package/tools/testsets/get-testsets.d.mts.map +1 -1
  167. package/tools/testsets/get-testsets.d.ts +9 -0
  168. package/tools/testsets/get-testsets.d.ts.map +1 -1
  169. package/tools/testsets/get-testsets.js +1 -1
  170. package/tools/testsets/get-testsets.js.map +1 -1
  171. package/tools/testsets/get-testsets.mjs +1 -1
  172. package/tools/testsets/get-testsets.mjs.map +1 -1
  173. package/tools/testsets/list-testsets.d.mts +9 -0
  174. package/tools/testsets/list-testsets.d.mts.map +1 -1
  175. package/tools/testsets/list-testsets.d.ts +9 -0
  176. package/tools/testsets/list-testsets.d.ts.map +1 -1
  177. package/tools/testsets/update-testsets.d.mts +9 -0
  178. package/tools/testsets/update-testsets.d.mts.map +1 -1
  179. package/tools/testsets/update-testsets.d.ts +9 -0
  180. package/tools/testsets/update-testsets.d.ts.map +1 -1
  181. package/tools/types.d.mts.map +1 -1
  182. package/tools/types.d.ts.map +1 -1
  183. package/src/tools/system-configs/create-system-configs.ts +0 -64
  184. package/tools/system-configs/create-system-configs.d.mts.map +0 -1
  185. package/tools/system-configs/create-system-configs.d.ts.map +0 -1
  186. package/tools/system-configs/create-system-configs.js +0 -58
  187. package/tools/system-configs/create-system-configs.js.map +0 -1
  188. package/tools/system-configs/create-system-configs.mjs +0 -54
  189. package/tools/system-configs/create-system-configs.mjs.map +0 -1
  190. package/tools/system-configs/get-system-configs.d.mts.map +0 -1
  191. package/tools/system-configs/get-system-configs.d.ts.map +0 -1
  192. package/tools/system-configs/get-system-configs.js.map +0 -1
  193. package/tools/system-configs/get-system-configs.mjs.map +0 -1
  194. package/tools/system-configs/list-system-configs.d.mts.map +0 -1
  195. package/tools/system-configs/list-system-configs.d.ts +0 -32
  196. package/tools/system-configs/list-system-configs.d.ts.map +0 -1
  197. package/tools/system-configs/list-system-configs.js.map +0 -1
  198. package/tools/system-configs/list-system-configs.mjs.map +0 -1
@@ -17,6 +17,8 @@ import list_testcases from './testcases/list-testcases';
17
17
  import delete_testcases from './testcases/delete-testcases';
18
18
  import get_testcases from './testcases/get-testcases';
19
19
  import create_runs from './runs/create-runs';
20
+ import create_metrics from './metrics/create-metrics';
21
+ import update_metrics from './metrics/update-metrics';
20
22
  import create_records from './records/create-records';
21
23
  import upsert_scores from './scores/upsert-scores';
22
24
  import create_systems from './systems/create-systems';
@@ -24,9 +26,9 @@ import update_systems from './systems/update-systems';
24
26
  import list_systems from './systems/list-systems';
25
27
  import delete_systems from './systems/delete-systems';
26
28
  import get_systems from './systems/get-systems';
27
- import create_system_configs from './system-configs/create-system-configs';
28
- import list_system_configs from './system-configs/list-system-configs';
29
- import get_system_configs from './system-configs/get-system-configs';
29
+ import create_systems_versions from './systems/versions/create-systems-versions';
30
+ import list_systems_versions from './systems/versions/list-systems-versions';
31
+ import get_systems_versions from './systems/versions/get-systems-versions';
30
32
 
31
33
  export const endpoints: Endpoint[] = [];
32
34
 
@@ -47,6 +49,8 @@ addEndpoint(list_testcases);
47
49
  addEndpoint(delete_testcases);
48
50
  addEndpoint(get_testcases);
49
51
  addEndpoint(create_runs);
52
+ addEndpoint(create_metrics);
53
+ addEndpoint(update_metrics);
50
54
  addEndpoint(create_records);
51
55
  addEndpoint(upsert_scores);
52
56
  addEndpoint(create_systems);
@@ -54,9 +58,9 @@ addEndpoint(update_systems);
54
58
  addEndpoint(list_systems);
55
59
  addEndpoint(delete_systems);
56
60
  addEndpoint(get_systems);
57
- addEndpoint(create_system_configs);
58
- addEndpoint(list_system_configs);
59
- addEndpoint(get_system_configs);
61
+ addEndpoint(create_systems_versions);
62
+ addEndpoint(list_systems_versions);
63
+ addEndpoint(get_systems_versions);
60
64
 
61
65
  export type Filter = {
62
66
  type: 'resource' | 'operation' | 'tag' | 'tool';
@@ -82,9 +86,10 @@ export function query(filters: Filter[], endpoints: Endpoint[]): Endpoint[] {
82
86
  });
83
87
 
84
88
  // Check if any filters didn't match
85
- if (unmatchedFilters.size > 0) {
89
+ const unmatched = Array.from(unmatchedFilters).filter((f) => f.type === 'tool' || f.type === 'resource');
90
+ if (unmatched.length > 0) {
86
91
  throw new Error(
87
- `The following filters did not match any endpoints: ${[...unmatchedFilters]
92
+ `The following filters did not match any endpoints: ${unmatched
88
93
  .map((f) => `${f.type}=${f.value}`)
89
94
  .join(', ')}`,
90
95
  );
@@ -0,0 +1,252 @@
1
+ // File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ import { asTextContentResult } from 'scorecard-ai-mcp/tools/types';
4
+
5
+ import { Tool } from '@modelcontextprotocol/sdk/types.js';
6
+ import type { Metadata } from '../';
7
+ import Scorecard from 'scorecard-ai';
8
+
9
+ export const metadata: Metadata = {
10
+ resource: 'metrics',
11
+ operation: 'write',
12
+ tags: [],
13
+ httpMethod: 'post',
14
+ httpPath: '/projects/{projectId}/metrics',
15
+ operationId: 'createMetric',
16
+ };
17
+
18
+ export const tool: Tool = {
19
+ name: 'create_metrics',
20
+ description:
21
+ 'Create a new Metric for evaluating system outputs. The structure of a metric depends on the evalType and outputType of the metric.',
22
+ inputSchema: {
23
+ type: 'object',
24
+ anyOf: [
25
+ {
26
+ type: 'object',
27
+ properties: {
28
+ projectId: {
29
+ type: 'string',
30
+ },
31
+ evalType: {
32
+ type: 'string',
33
+ description: 'AI-based evaluation type.',
34
+ enum: ['ai'],
35
+ },
36
+ name: {
37
+ type: 'string',
38
+ description: 'The name of the Metric.',
39
+ },
40
+ outputType: {
41
+ type: 'string',
42
+ description: 'Integer output type.',
43
+ enum: ['int'],
44
+ },
45
+ promptTemplate: {
46
+ type: 'string',
47
+ description:
48
+ 'The complete prompt template for AI evaluation. Should include placeholders for dynamic content.',
49
+ },
50
+ description: {
51
+ type: 'string',
52
+ description: 'The description of the Metric.',
53
+ },
54
+ evalModelName: {
55
+ type: 'string',
56
+ description: 'The AI model to use for evaluation.',
57
+ },
58
+ guidelines: {
59
+ type: 'string',
60
+ description: 'Guidelines for AI evaluation on how to score the metric.',
61
+ },
62
+ passingThreshold: {
63
+ type: 'integer',
64
+ description: 'The threshold for determining pass/fail from integer scores (1-5).',
65
+ },
66
+ temperature: {
67
+ type: 'number',
68
+ description: 'The temperature for AI evaluation (0-2).',
69
+ },
70
+ },
71
+ },
72
+ {
73
+ type: 'object',
74
+ properties: {
75
+ projectId: {
76
+ type: 'string',
77
+ },
78
+ evalType: {
79
+ type: 'string',
80
+ description: 'Human-based evaluation type.',
81
+ enum: ['human'],
82
+ },
83
+ name: {
84
+ type: 'string',
85
+ description: 'The name of the Metric.',
86
+ },
87
+ outputType: {
88
+ type: 'string',
89
+ description: 'Integer output type.',
90
+ enum: ['int'],
91
+ },
92
+ description: {
93
+ type: 'string',
94
+ description: 'The description of the Metric.',
95
+ },
96
+ guidelines: {
97
+ type: 'string',
98
+ description: 'Guidelines for human evaluators.',
99
+ },
100
+ passingThreshold: {
101
+ type: 'integer',
102
+ description: 'The threshold for determining pass/fail from integer scores (1-5).',
103
+ },
104
+ },
105
+ },
106
+ {
107
+ type: 'object',
108
+ properties: {
109
+ projectId: {
110
+ type: 'string',
111
+ },
112
+ evalType: {
113
+ type: 'string',
114
+ description: 'Heuristic-based evaluation type.',
115
+ enum: ['heuristic'],
116
+ },
117
+ name: {
118
+ type: 'string',
119
+ description: 'The name of the Metric.',
120
+ },
121
+ outputType: {
122
+ type: 'string',
123
+ description: 'Integer output type.',
124
+ enum: ['int'],
125
+ },
126
+ description: {
127
+ type: 'string',
128
+ description: 'The description of the Metric.',
129
+ },
130
+ guidelines: {
131
+ type: 'string',
132
+ description: 'Optional guidelines for heuristic evaluation logic.',
133
+ },
134
+ passingThreshold: {
135
+ type: 'integer',
136
+ description: 'The threshold for determining pass/fail from integer scores (1-5).',
137
+ },
138
+ },
139
+ },
140
+ {
141
+ type: 'object',
142
+ properties: {
143
+ projectId: {
144
+ type: 'string',
145
+ },
146
+ evalType: {
147
+ type: 'string',
148
+ description: 'AI-based evaluation type.',
149
+ enum: ['ai'],
150
+ },
151
+ name: {
152
+ type: 'string',
153
+ description: 'The name of the Metric.',
154
+ },
155
+ outputType: {
156
+ type: 'string',
157
+ description: 'Boolean output type.',
158
+ enum: ['boolean'],
159
+ },
160
+ promptTemplate: {
161
+ type: 'string',
162
+ description:
163
+ 'The complete prompt template for AI evaluation. Should include placeholders for dynamic content.',
164
+ },
165
+ description: {
166
+ type: 'string',
167
+ description: 'The description of the Metric.',
168
+ },
169
+ evalModelName: {
170
+ type: 'string',
171
+ description: 'The AI model to use for evaluation.',
172
+ },
173
+ guidelines: {
174
+ type: 'string',
175
+ description: 'Guidelines for AI evaluation on how to score the metric.',
176
+ },
177
+ temperature: {
178
+ type: 'number',
179
+ description: 'The temperature for AI evaluation (0-2).',
180
+ },
181
+ },
182
+ },
183
+ {
184
+ type: 'object',
185
+ properties: {
186
+ projectId: {
187
+ type: 'string',
188
+ },
189
+ evalType: {
190
+ type: 'string',
191
+ description: 'Human-based evaluation type.',
192
+ enum: ['human'],
193
+ },
194
+ name: {
195
+ type: 'string',
196
+ description: 'The name of the Metric.',
197
+ },
198
+ outputType: {
199
+ type: 'string',
200
+ description: 'Boolean output type.',
201
+ enum: ['boolean'],
202
+ },
203
+ description: {
204
+ type: 'string',
205
+ description: 'The description of the Metric.',
206
+ },
207
+ guidelines: {
208
+ type: 'string',
209
+ description: 'Guidelines for human evaluators.',
210
+ },
211
+ },
212
+ },
213
+ {
214
+ type: 'object',
215
+ properties: {
216
+ projectId: {
217
+ type: 'string',
218
+ },
219
+ evalType: {
220
+ type: 'string',
221
+ description: 'Heuristic-based evaluation type.',
222
+ enum: ['heuristic'],
223
+ },
224
+ name: {
225
+ type: 'string',
226
+ description: 'The name of the Metric.',
227
+ },
228
+ outputType: {
229
+ type: 'string',
230
+ description: 'Boolean output type.',
231
+ enum: ['boolean'],
232
+ },
233
+ description: {
234
+ type: 'string',
235
+ description: 'The description of the Metric.',
236
+ },
237
+ guidelines: {
238
+ type: 'string',
239
+ description: 'Optional guidelines for heuristic evaluation logic.',
240
+ },
241
+ },
242
+ },
243
+ ],
244
+ },
245
+ };
246
+
247
+ export const handler = async (client: Scorecard, args: Record<string, unknown> | undefined) => {
248
+ const { projectId, ...body } = args as any;
249
+ return asTextContentResult(await client.metrics.create(projectId, body));
250
+ };
251
+
252
+ export default { metadata, tool, handler };
@@ -0,0 +1,252 @@
1
+ // File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ import { asTextContentResult } from 'scorecard-ai-mcp/tools/types';
4
+
5
+ import { Tool } from '@modelcontextprotocol/sdk/types.js';
6
+ import type { Metadata } from '../';
7
+ import Scorecard from 'scorecard-ai';
8
+
9
+ export const metadata: Metadata = {
10
+ resource: 'metrics',
11
+ operation: 'write',
12
+ tags: [],
13
+ httpMethod: 'patch',
14
+ httpPath: '/metrics/{metricId}',
15
+ operationId: 'updateMetric',
16
+ };
17
+
18
+ export const tool: Tool = {
19
+ name: 'update_metrics',
20
+ description:
21
+ 'Update an existing Metric. You must specify the evalType and outputType of the metric. The structure of a metric depends on the evalType and outputType of the metric.',
22
+ inputSchema: {
23
+ type: 'object',
24
+ anyOf: [
25
+ {
26
+ type: 'object',
27
+ properties: {
28
+ metricId: {
29
+ type: 'string',
30
+ },
31
+ evalType: {
32
+ type: 'string',
33
+ description: 'AI-based evaluation type.',
34
+ enum: ['ai'],
35
+ },
36
+ outputType: {
37
+ type: 'string',
38
+ description: 'Integer output type.',
39
+ enum: ['int'],
40
+ },
41
+ description: {
42
+ type: 'string',
43
+ description: 'The description of the Metric.',
44
+ },
45
+ evalModelName: {
46
+ type: 'string',
47
+ description: 'The AI model to use for evaluation.',
48
+ },
49
+ guidelines: {
50
+ type: 'string',
51
+ description: 'Guidelines for AI evaluation on how to score the metric.',
52
+ },
53
+ name: {
54
+ type: 'string',
55
+ description: 'The name of the Metric.',
56
+ },
57
+ passingThreshold: {
58
+ type: 'integer',
59
+ description: 'The threshold for determining pass/fail from integer scores (1-5).',
60
+ },
61
+ promptTemplate: {
62
+ type: 'string',
63
+ description:
64
+ 'The complete prompt template for AI evaluation. Should include placeholders for dynamic content.',
65
+ },
66
+ temperature: {
67
+ type: 'number',
68
+ description: 'The temperature for AI evaluation (0-2).',
69
+ },
70
+ },
71
+ },
72
+ {
73
+ type: 'object',
74
+ properties: {
75
+ metricId: {
76
+ type: 'string',
77
+ },
78
+ evalType: {
79
+ type: 'string',
80
+ description: 'Human-based evaluation type.',
81
+ enum: ['human'],
82
+ },
83
+ outputType: {
84
+ type: 'string',
85
+ description: 'Integer output type.',
86
+ enum: ['int'],
87
+ },
88
+ description: {
89
+ type: 'string',
90
+ description: 'The description of the Metric.',
91
+ },
92
+ guidelines: {
93
+ type: 'string',
94
+ description: 'Guidelines for human evaluators.',
95
+ },
96
+ name: {
97
+ type: 'string',
98
+ description: 'The name of the Metric.',
99
+ },
100
+ passingThreshold: {
101
+ type: 'integer',
102
+ description: 'The threshold for determining pass/fail from integer scores (1-5).',
103
+ },
104
+ },
105
+ },
106
+ {
107
+ type: 'object',
108
+ properties: {
109
+ metricId: {
110
+ type: 'string',
111
+ },
112
+ evalType: {
113
+ type: 'string',
114
+ description: 'Heuristic-based evaluation type.',
115
+ enum: ['heuristic'],
116
+ },
117
+ outputType: {
118
+ type: 'string',
119
+ description: 'Integer output type.',
120
+ enum: ['int'],
121
+ },
122
+ description: {
123
+ type: 'string',
124
+ description: 'The description of the Metric.',
125
+ },
126
+ guidelines: {
127
+ type: 'string',
128
+ description: 'Optional guidelines for heuristic evaluation logic.',
129
+ },
130
+ name: {
131
+ type: 'string',
132
+ description: 'The name of the Metric.',
133
+ },
134
+ passingThreshold: {
135
+ type: 'integer',
136
+ description: 'The threshold for determining pass/fail from integer scores (1-5).',
137
+ },
138
+ },
139
+ },
140
+ {
141
+ type: 'object',
142
+ properties: {
143
+ metricId: {
144
+ type: 'string',
145
+ },
146
+ evalType: {
147
+ type: 'string',
148
+ description: 'AI-based evaluation type.',
149
+ enum: ['ai'],
150
+ },
151
+ outputType: {
152
+ type: 'string',
153
+ description: 'Boolean output type.',
154
+ enum: ['boolean'],
155
+ },
156
+ description: {
157
+ type: 'string',
158
+ description: 'The description of the Metric.',
159
+ },
160
+ evalModelName: {
161
+ type: 'string',
162
+ description: 'The AI model to use for evaluation.',
163
+ },
164
+ guidelines: {
165
+ type: 'string',
166
+ description: 'Guidelines for AI evaluation on how to score the metric.',
167
+ },
168
+ name: {
169
+ type: 'string',
170
+ description: 'The name of the Metric.',
171
+ },
172
+ promptTemplate: {
173
+ type: 'string',
174
+ description:
175
+ 'The complete prompt template for AI evaluation. Should include placeholders for dynamic content.',
176
+ },
177
+ temperature: {
178
+ type: 'number',
179
+ description: 'The temperature for AI evaluation (0-2).',
180
+ },
181
+ },
182
+ },
183
+ {
184
+ type: 'object',
185
+ properties: {
186
+ metricId: {
187
+ type: 'string',
188
+ },
189
+ evalType: {
190
+ type: 'string',
191
+ description: 'Human-based evaluation type.',
192
+ enum: ['human'],
193
+ },
194
+ outputType: {
195
+ type: 'string',
196
+ description: 'Boolean output type.',
197
+ enum: ['boolean'],
198
+ },
199
+ description: {
200
+ type: 'string',
201
+ description: 'The description of the Metric.',
202
+ },
203
+ guidelines: {
204
+ type: 'string',
205
+ description: 'Guidelines for human evaluators.',
206
+ },
207
+ name: {
208
+ type: 'string',
209
+ description: 'The name of the Metric.',
210
+ },
211
+ },
212
+ },
213
+ {
214
+ type: 'object',
215
+ properties: {
216
+ metricId: {
217
+ type: 'string',
218
+ },
219
+ evalType: {
220
+ type: 'string',
221
+ description: 'Heuristic-based evaluation type.',
222
+ enum: ['heuristic'],
223
+ },
224
+ outputType: {
225
+ type: 'string',
226
+ description: 'Boolean output type.',
227
+ enum: ['boolean'],
228
+ },
229
+ description: {
230
+ type: 'string',
231
+ description: 'The description of the Metric.',
232
+ },
233
+ guidelines: {
234
+ type: 'string',
235
+ description: 'Optional guidelines for heuristic evaluation logic.',
236
+ },
237
+ name: {
238
+ type: 'string',
239
+ description: 'The name of the Metric.',
240
+ },
241
+ },
242
+ },
243
+ ],
244
+ },
245
+ };
246
+
247
+ export const handler = async (client: Scorecard, args: Record<string, unknown> | undefined) => {
248
+ const { metricId, ...body } = args as any;
249
+ return asTextContentResult(await client.metrics.update(metricId, body));
250
+ };
251
+
252
+ export default { metadata, tool, handler };
@@ -31,9 +31,9 @@ export const tool: Tool = {
31
31
  type: 'string',
32
32
  },
33
33
  },
34
- systemConfigId: {
34
+ systemVersionId: {
35
35
  type: 'string',
36
- description: 'The ID of the system configuration this Run is using.',
36
+ description: 'The ID of the system version this Run is using.',
37
37
  },
38
38
  testsetId: {
39
39
  type: 'string',
@@ -17,7 +17,7 @@ export const metadata: Metadata = {
17
17
 
18
18
  export const tool: Tool = {
19
19
  name: 'delete_systems',
20
- description: 'Delete a system definition by ID. This will not delete associated system configurations.',
20
+ description: 'Delete a system definition by ID. This will not delete associated system versions.',
21
21
  inputSchema: {
22
22
  type: 'object',
23
23
  properties: {
@@ -0,0 +1,45 @@
1
+ // File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ import { asTextContentResult } from 'scorecard-ai-mcp/tools/types';
4
+
5
+ import { Tool } from '@modelcontextprotocol/sdk/types.js';
6
+ import type { Metadata } from '../../';
7
+ import Scorecard from 'scorecard-ai';
8
+
9
+ export const metadata: Metadata = {
10
+ resource: 'systems.versions',
11
+ operation: 'write',
12
+ tags: [],
13
+ httpMethod: 'post',
14
+ httpPath: '/systems/{systemId}/configs',
15
+ operationId: 'createSystemVersion',
16
+ };
17
+
18
+ export const tool: Tool = {
19
+ name: 'create_systems_versions',
20
+ description:
21
+ "Create a new version for a system.\n\nEach version contains specific parameter values that match the system's `configSchema` - things like model parameters, thresholds, or processing options.\nOnce created, versions cannot be modified, ensuring stable reference points for evaluations.\n\nWhen creating a system version:\n- The `config` object is validated against the parent system's `configSchema`.\n- System versions with validation errors are still stored, with errors included in the response.\n- Validation errors indicate fields that don't match the schema but don't prevent creation.\n- Having validation errors may affect how some evaluation metrics are calculated.",
22
+ inputSchema: {
23
+ type: 'object',
24
+ properties: {
25
+ systemId: {
26
+ type: 'string',
27
+ },
28
+ config: {
29
+ type: 'object',
30
+ description: 'The configuration of the system version.',
31
+ },
32
+ name: {
33
+ type: 'string',
34
+ description: 'The name of the system version.',
35
+ },
36
+ },
37
+ },
38
+ };
39
+
40
+ export const handler = async (client: Scorecard, args: Record<string, unknown> | undefined) => {
41
+ const { systemId, ...body } = args as any;
42
+ return asTextContentResult(await client.systems.versions.create(systemId, body));
43
+ };
44
+
45
+ export default { metadata, tool, handler };
@@ -3,25 +3,25 @@
3
3
  import { asTextContentResult } from 'scorecard-ai-mcp/tools/types';
4
4
 
5
5
  import { Tool } from '@modelcontextprotocol/sdk/types.js';
6
- import type { Metadata } from '../';
6
+ import type { Metadata } from '../../';
7
7
  import Scorecard from 'scorecard-ai';
8
8
 
9
9
  export const metadata: Metadata = {
10
- resource: 'system_configs',
10
+ resource: 'systems.versions',
11
11
  operation: 'read',
12
12
  tags: [],
13
13
  httpMethod: 'get',
14
- httpPath: '/systems/configs/{systemConfigId}',
15
- operationId: 'getSystemConfig',
14
+ httpPath: '/systems/configs/{systemVersionId}',
15
+ operationId: 'getSystemVersion',
16
16
  };
17
17
 
18
18
  export const tool: Tool = {
19
- name: 'get_system_configs',
20
- description: 'Retrieve a specific system configuration by ID.',
19
+ name: 'get_systems_versions',
20
+ description: 'Retrieve a specific system version by ID.',
21
21
  inputSchema: {
22
22
  type: 'object',
23
23
  properties: {
24
- systemConfigId: {
24
+ systemVersionId: {
25
25
  type: 'string',
26
26
  },
27
27
  },
@@ -29,8 +29,8 @@ export const tool: Tool = {
29
29
  };
30
30
 
31
31
  export const handler = async (client: Scorecard, args: Record<string, unknown> | undefined) => {
32
- const { systemConfigId, ...body } = args as any;
33
- return asTextContentResult(await client.systemConfigs.get(systemConfigId));
32
+ const { systemVersionId, ...body } = args as any;
33
+ return asTextContentResult(await client.systems.versions.get(systemVersionId));
34
34
  };
35
35
 
36
36
  export default { metadata, tool, handler };