@datalayer/agent-runtimes 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. package/README.md +182 -1
  2. package/lib/AgentNode.d.ts +3 -0
  3. package/lib/AgentNode.js +676 -0
  4. package/lib/App.js +1 -1
  5. package/lib/agent-node/themeStore.d.ts +3 -0
  6. package/lib/agent-node/themeStore.js +156 -0
  7. package/lib/agent-node-main.d.ts +1 -0
  8. package/lib/agent-node-main.js +14 -0
  9. package/lib/agents/AgentDetails.d.ts +22 -1
  10. package/lib/agents/AgentDetails.js +34 -47
  11. package/lib/api/index.d.ts +0 -1
  12. package/lib/api/index.js +4 -2
  13. package/lib/chat/Chat.d.ts +5 -106
  14. package/lib/chat/Chat.js +20 -14
  15. package/lib/chat/ChatFloating.d.ts +7 -140
  16. package/lib/chat/ChatFloating.js +3 -3
  17. package/lib/chat/ChatPopupStandalone.d.ts +8 -47
  18. package/lib/chat/ChatPopupStandalone.js +3 -3
  19. package/lib/chat/ChatSidebar.d.ts +4 -69
  20. package/lib/chat/ChatSidebar.js +83 -51
  21. package/lib/chat/ChatStandalone.d.ts +4 -54
  22. package/lib/chat/ChatStandalone.js +3 -3
  23. package/lib/chat/base/ChatBase.js +1414 -174
  24. package/lib/chat/display/FloatingBrandButton.js +8 -1
  25. package/lib/chat/header/ChatHeader.d.ts +3 -1
  26. package/lib/chat/header/ChatHeader.js +15 -12
  27. package/lib/chat/header/ChatHeaderBase.d.ts +30 -5
  28. package/lib/chat/header/ChatHeaderBase.js +41 -16
  29. package/lib/chat/indicators/McpStatusIndicator.d.ts +7 -4
  30. package/lib/chat/indicators/McpStatusIndicator.js +7 -32
  31. package/lib/chat/indicators/SandboxStatusIndicator.d.ts +4 -1
  32. package/lib/chat/indicators/SandboxStatusIndicator.js +91 -56
  33. package/lib/chat/indicators/SkillsStatusIndicator.d.ts +7 -0
  34. package/lib/chat/indicators/SkillsStatusIndicator.js +88 -0
  35. package/lib/chat/indicators/index.d.ts +1 -0
  36. package/lib/chat/indicators/index.js +1 -0
  37. package/lib/chat/messages/ChatMessageList.d.ts +1 -1
  38. package/lib/chat/messages/ChatMessageList.js +154 -114
  39. package/lib/chat/messages/ChatMessages.js +6 -2
  40. package/lib/chat/prompt/InputFooter.d.ts +21 -6
  41. package/lib/chat/prompt/InputFooter.js +76 -20
  42. package/lib/chat/prompt/InputPrompt.d.ts +5 -1
  43. package/lib/chat/prompt/InputPrompt.js +4 -4
  44. package/lib/chat/prompt/InputPromptFooter.d.ts +3 -1
  45. package/lib/chat/prompt/InputPromptFooter.js +3 -3
  46. package/lib/chat/prompt/InputPromptLexical.d.ts +3 -1
  47. package/lib/chat/prompt/InputPromptLexical.js +12 -5
  48. package/lib/chat/prompt/InputPromptText.d.ts +3 -1
  49. package/lib/chat/prompt/InputPromptText.js +2 -2
  50. package/lib/chat/tools/ToolApprovalBanner.js +1 -1
  51. package/lib/chat/tools/ToolCallDisplay.d.ts +3 -1
  52. package/lib/chat/tools/ToolCallDisplay.js +2 -2
  53. package/lib/chat/usage/TokenUsageBar.js +20 -2
  54. package/lib/client/AgentRuntimesClientContext.d.ts +53 -0
  55. package/lib/client/AgentRuntimesClientContext.js +55 -0
  56. package/lib/client/AgentsMixin.d.ts +0 -18
  57. package/lib/client/AgentsMixin.js +20 -30
  58. package/lib/client/IAgentRuntimesClient.d.ts +215 -0
  59. package/lib/client/IAgentRuntimesClient.js +5 -0
  60. package/lib/client/SdkAgentRuntimesClient.d.ts +151 -0
  61. package/lib/client/SdkAgentRuntimesClient.js +134 -0
  62. package/lib/client/index.d.ts +4 -1
  63. package/lib/client/index.js +3 -1
  64. package/lib/components/NotificationEventCard.js +5 -1
  65. package/lib/config/AgentConfiguration.d.ts +22 -0
  66. package/lib/config/AgentConfiguration.js +319 -64
  67. package/lib/context/ContextDistribution.d.ts +3 -1
  68. package/lib/context/ContextDistribution.js +8 -27
  69. package/lib/context/ContextInspector.d.ts +3 -1
  70. package/lib/context/ContextInspector.js +19 -67
  71. package/lib/context/ContextPanel.d.ts +3 -1
  72. package/lib/context/ContextPanel.js +104 -64
  73. package/lib/context/ContextUsage.d.ts +3 -1
  74. package/lib/context/ContextUsage.js +3 -3
  75. package/lib/context/CostTracker.d.ts +9 -3
  76. package/lib/context/CostTracker.js +26 -47
  77. package/lib/context/CostUsageChart.d.ts +12 -0
  78. package/lib/context/CostUsageChart.js +378 -0
  79. package/lib/context/GraphFlowChart.d.ts +16 -0
  80. package/lib/context/GraphFlowChart.js +182 -0
  81. package/lib/context/TokenUsageChart.d.ts +8 -1
  82. package/lib/context/TokenUsageChart.js +349 -211
  83. package/lib/context/TurnGraphChart.d.ts +39 -0
  84. package/lib/context/TurnGraphChart.js +538 -0
  85. package/lib/context/otelWsPool.d.ts +20 -0
  86. package/lib/context/otelWsPool.js +69 -0
  87. package/lib/examples/A2UiComponentGalleryExample.d.ts +0 -17
  88. package/lib/examples/A2UiComponentGalleryExample.js +315 -522
  89. package/lib/examples/A2UiContactCardExample.d.ts +0 -18
  90. package/lib/examples/A2UiContactCardExample.js +154 -411
  91. package/lib/examples/A2UiRestaurantExample.d.ts +0 -30
  92. package/lib/examples/A2UiRestaurantExample.js +114 -212
  93. package/lib/examples/A2UiViewerExample.d.ts +0 -18
  94. package/lib/examples/A2UiViewerExample.js +283 -532
  95. package/lib/examples/AgUiBackendToolRenderingExample.js +1 -1
  96. package/lib/examples/AgUiHaikuGenUiExample.d.ts +1 -1
  97. package/lib/examples/AgUiHaikuGenUiExample.js +1 -1
  98. package/lib/examples/AgUiSharedStateExample.js +2 -1
  99. package/lib/examples/AgentCheckpointsExample.js +14 -28
  100. package/lib/examples/AgentCodemodeExample.d.ts +4 -6
  101. package/lib/examples/AgentCodemodeExample.js +603 -169
  102. package/lib/examples/AgentEvalsExample.js +339 -53
  103. package/lib/examples/AgentGuardrailsExample.js +383 -66
  104. package/lib/examples/AgentHooksExample.d.ts +3 -0
  105. package/lib/examples/AgentHooksExample.js +122 -0
  106. package/lib/examples/AgentInferenceProviderExample.d.ts +3 -0
  107. package/lib/examples/AgentInferenceProviderExample.js +329 -0
  108. package/lib/examples/AgentMCPExample.d.ts +3 -0
  109. package/lib/examples/AgentMCPExample.js +481 -0
  110. package/lib/examples/AgentMemoryExample.d.ts +1 -2
  111. package/lib/examples/AgentMemoryExample.js +78 -33
  112. package/lib/examples/AgentMonitoringExample.js +261 -200
  113. package/lib/examples/AgentNotificationsExample.d.ts +1 -2
  114. package/lib/examples/AgentNotificationsExample.js +114 -33
  115. package/lib/examples/AgentOtelExample.js +32 -42
  116. package/lib/examples/AgentOutputsExample.d.ts +11 -6
  117. package/lib/examples/AgentOutputsExample.js +433 -81
  118. package/lib/examples/AgentParametersExample.d.ts +3 -0
  119. package/lib/examples/AgentParametersExample.js +248 -0
  120. package/lib/examples/AgentSandboxExample.d.ts +3 -3
  121. package/lib/examples/AgentSandboxExample.js +74 -45
  122. package/lib/examples/AgentSkillsExample.js +95 -103
  123. package/lib/examples/AgentSubagentsExample.d.ts +14 -0
  124. package/lib/examples/AgentSubagentsExample.js +228 -0
  125. package/lib/examples/AgentToolApprovalsExample.js +49 -561
  126. package/lib/examples/AgentTriggersExample.js +823 -569
  127. package/lib/examples/{AgentspecExample.d.ts → AgentspecsExample.d.ts} +2 -2
  128. package/lib/examples/AgentspecsExample.js +1096 -0
  129. package/lib/examples/ChatCustomExample.js +16 -28
  130. package/lib/examples/ChatExample.js +13 -29
  131. package/lib/examples/CopilotKitLexicalExample.js +2 -1
  132. package/lib/examples/CopilotKitNotebookExample.js +2 -1
  133. package/lib/examples/HomeExample.d.ts +15 -0
  134. package/lib/examples/HomeExample.js +77 -0
  135. package/lib/examples/Lexical2Example.js +4 -2
  136. package/lib/examples/{LexicalExample.d.ts → LexicalAgentExample.d.ts} +4 -4
  137. package/lib/examples/{LexicalExample.js → LexicalAgentExample.js} +66 -17
  138. package/lib/examples/{LexicalSidebarExample.d.ts → LexicalAgentSidebarExample.d.ts} +5 -5
  139. package/lib/examples/LexicalAgentSidebarExample.js +261 -0
  140. package/lib/examples/NotebookAgentExample.d.ts +9 -0
  141. package/lib/examples/NotebookAgentExample.js +192 -0
  142. package/lib/examples/{NotebookSidebarExample.d.ts → NotebookAgentSidebarExample.d.ts} +2 -2
  143. package/lib/examples/NotebookAgentSidebarExample.js +221 -0
  144. package/lib/examples/{DatalayerNotebookExample.d.ts → NotebookCollaborationExample.d.ts} +4 -4
  145. package/lib/examples/{DatalayerNotebookExample.js → NotebookCollaborationExample.js} +3 -3
  146. package/lib/examples/NotebookExample.d.ts +4 -7
  147. package/lib/examples/NotebookExample.js +14 -146
  148. package/lib/examples/components/AuthRequiredView.d.ts +6 -0
  149. package/lib/examples/components/AuthRequiredView.js +33 -0
  150. package/lib/examples/components/ExampleWrapper.d.ts +9 -3
  151. package/lib/examples/components/ExampleWrapper.js +45 -9
  152. package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.js +1 -1
  153. package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.js +1 -1
  154. package/lib/examples/{ag-ui → components}/haiku/index.d.ts +1 -1
  155. package/lib/examples/{ag-ui → components}/haiku/index.js +1 -1
  156. package/lib/examples/components/index.d.ts +3 -0
  157. package/lib/examples/components/index.js +4 -0
  158. package/lib/examples/{ag-ui → components}/weather/index.d.ts +1 -1
  159. package/lib/examples/{ag-ui → components}/weather/index.js +1 -1
  160. package/lib/examples/example-selector.d.ts +17 -4
  161. package/lib/examples/example-selector.js +108 -41
  162. package/lib/examples/index.d.ts +10 -6
  163. package/lib/examples/index.js +10 -6
  164. package/lib/examples/lexical/initial-content.json +6 -6
  165. package/lib/examples/main.js +257 -27
  166. package/lib/examples/utils/a2ui.d.ts +18 -0
  167. package/lib/examples/utils/a2ui.js +69 -0
  168. package/lib/examples/utils/a2uiMarkdownProvider.d.ts +7 -0
  169. package/lib/examples/utils/a2uiMarkdownProvider.js +9 -0
  170. package/lib/examples/utils/agentId.d.ts +18 -0
  171. package/lib/examples/utils/agentId.js +54 -0
  172. package/lib/examples/utils/agents/earthquake-detector.json +11 -11
  173. package/lib/examples/utils/agents/sales-forecaster.json +11 -11
  174. package/lib/examples/utils/agents/social-post-generator.json +11 -11
  175. package/lib/examples/utils/agents/stock-market.json +11 -11
  176. package/lib/examples/utils/examplesStore.js +82 -27
  177. package/lib/examples/utils/useExampleAgentRuntimesUrl.d.ts +5 -0
  178. package/lib/examples/utils/useExampleAgentRuntimesUrl.js +19 -0
  179. package/lib/hooks/index.d.ts +8 -8
  180. package/lib/hooks/index.js +7 -7
  181. package/lib/hooks/useA2A.d.ts +2 -3
  182. package/lib/hooks/useAIAgentsWebSocket.d.ts +43 -4
  183. package/lib/hooks/useAIAgentsWebSocket.js +153 -12
  184. package/lib/hooks/useAcp.d.ts +1 -2
  185. package/lib/hooks/useAgUi.d.ts +1 -1
  186. package/lib/hooks/{useAgents.d.ts → useAgentRuntimes.d.ts} +70 -4
  187. package/lib/hooks/{useAgents.js → useAgentRuntimes.js} +237 -32
  188. package/lib/hooks/useAgentsCatalog.js +1 -1
  189. package/lib/hooks/useAgentsService.d.ts +2 -2
  190. package/lib/hooks/useAgentsService.js +7 -7
  191. package/lib/hooks/useCheckpoints.js +1 -1
  192. package/lib/hooks/useConfig.d.ts +4 -1
  193. package/lib/hooks/useConfig.js +10 -3
  194. package/lib/hooks/useContextSnapshot.d.ts +9 -4
  195. package/lib/hooks/useContextSnapshot.js +9 -37
  196. package/lib/hooks/useMonitoring.js +3 -0
  197. package/lib/hooks/useSandbox.d.ts +20 -8
  198. package/lib/hooks/useSandbox.js +105 -40
  199. package/lib/hooks/useSkills.d.ts +23 -5
  200. package/lib/hooks/useSkills.js +94 -39
  201. package/lib/hooks/useToolApprovals.d.ts +60 -36
  202. package/lib/hooks/useToolApprovals.js +318 -69
  203. package/lib/hooks/useVercelAI.d.ts +1 -1
  204. package/lib/index.d.ts +2 -1
  205. package/lib/index.js +1 -0
  206. package/lib/inference/index.d.ts +0 -1
  207. package/lib/middleware/index.d.ts +0 -1
  208. package/lib/protocols/AGUIAdapter.js +6 -0
  209. package/lib/protocols/VercelAIAdapter.d.ts +7 -0
  210. package/lib/protocols/VercelAIAdapter.js +59 -7
  211. package/lib/specs/agents/agents.d.ts +21 -4
  212. package/lib/specs/agents/agents.js +2879 -316
  213. package/lib/specs/agents/index.js +3 -1
  214. package/lib/specs/benchmarks.d.ts +20 -0
  215. package/lib/specs/benchmarks.js +205 -0
  216. package/lib/specs/envvars.js +27 -20
  217. package/lib/specs/evals.d.ts +10 -9
  218. package/lib/specs/evals.js +128 -88
  219. package/lib/specs/events.d.ts +3 -10
  220. package/lib/specs/events.js +127 -84
  221. package/lib/specs/frontendTools.js +2 -2
  222. package/lib/specs/guardrails.d.ts +0 -7
  223. package/lib/specs/guardrails.js +240 -159
  224. package/lib/specs/mcpServers.js +35 -6
  225. package/lib/specs/memory.d.ts +0 -2
  226. package/lib/specs/memory.js +4 -17
  227. package/lib/specs/models.d.ts +0 -2
  228. package/lib/specs/models.js +20 -15
  229. package/lib/specs/notifications.js +102 -18
  230. package/lib/specs/outputs.js +15 -9
  231. package/lib/specs/personas.d.ts +41 -0
  232. package/lib/specs/personas.js +168 -0
  233. package/lib/specs/skills.d.ts +1 -1
  234. package/lib/specs/skills.js +23 -23
  235. package/lib/specs/teams/index.js +3 -1
  236. package/lib/specs/teams/teams.js +468 -348
  237. package/lib/specs/tools.js +4 -4
  238. package/lib/specs/triggers.js +61 -11
  239. package/lib/stores/agentRuntimeStore.d.ts +208 -0
  240. package/lib/stores/agentRuntimeStore.js +650 -0
  241. package/lib/stores/conversationStore.js +2 -2
  242. package/lib/stores/index.d.ts +1 -1
  243. package/lib/stores/index.js +1 -1
  244. package/lib/tools/adapters/copilotkit/lexicalHooks.d.ts +1 -2
  245. package/lib/tools/adapters/copilotkit/lexicalHooks.js +1 -3
  246. package/lib/tools/adapters/copilotkit/notebookHooks.d.ts +1 -2
  247. package/lib/tools/adapters/copilotkit/notebookHooks.js +1 -3
  248. package/lib/tools/index.d.ts +0 -2
  249. package/lib/tools/index.js +0 -1
  250. package/lib/types/agents-lifecycle.d.ts +18 -0
  251. package/lib/types/agents.d.ts +6 -0
  252. package/lib/types/agentspecs.d.ts +54 -1
  253. package/lib/types/benchmarks.d.ts +43 -0
  254. package/lib/types/benchmarks.js +5 -0
  255. package/lib/types/chat.d.ts +325 -8
  256. package/lib/types/context.d.ts +27 -0
  257. package/lib/types/cost.d.ts +2 -2
  258. package/lib/types/evals.d.ts +26 -17
  259. package/lib/types/index.d.ts +3 -0
  260. package/lib/types/index.js +3 -0
  261. package/lib/types/mcp.d.ts +8 -0
  262. package/lib/types/models.d.ts +2 -2
  263. package/lib/types/personas.d.ts +25 -0
  264. package/lib/types/personas.js +5 -0
  265. package/lib/types/skills.d.ts +43 -1
  266. package/lib/types/stream.d.ts +110 -0
  267. package/lib/types/stream.js +36 -0
  268. package/lib/utils/utils.d.ts +9 -5
  269. package/lib/utils/utils.js +9 -5
  270. package/package.json +19 -11
  271. package/scripts/codegen/__pycache__/generate_agents.cpython-313.pyc +0 -0
  272. package/scripts/codegen/__pycache__/generate_benchmarks.cpython-313.pyc +0 -0
  273. package/scripts/codegen/__pycache__/generate_evals.cpython-313.pyc +0 -0
  274. package/scripts/codegen/__pycache__/generate_events.cpython-313.pyc +0 -0
  275. package/scripts/codegen/__pycache__/versioning.cpython-313.pyc +0 -0
  276. package/scripts/codegen/generate_agents.py +187 -45
  277. package/scripts/codegen/generate_benchmarks.py +441 -0
  278. package/scripts/codegen/generate_evals.py +94 -16
  279. package/scripts/codegen/generate_events.py +35 -14
  280. package/scripts/codegen/generate_personas.py +319 -0
  281. package/scripts/codegen/generate_skills.py +9 -9
  282. package/scripts/sync-jupyter.sh +26 -7
  283. package/lib/api/tool-approvals.d.ts +0 -62
  284. package/lib/api/tool-approvals.js +0 -145
  285. package/lib/examples/AgentspecExample.js +0 -705
  286. package/lib/examples/LexicalSidebarExample.js +0 -163
  287. package/lib/examples/NotebookSidebarExample.js +0 -119
  288. package/lib/examples/NotebookSimpleExample.d.ts +0 -6
  289. package/lib/examples/NotebookSimpleExample.js +0 -22
  290. package/lib/examples/ag-ui/index.d.ts +0 -10
  291. package/lib/examples/ag-ui/index.js +0 -16
  292. package/lib/hooks/useAgentsRegistry.d.ts +0 -10
  293. package/lib/hooks/useAgentsRegistry.js +0 -20
  294. package/lib/stores/agentsStore.d.ts +0 -123
  295. package/lib/stores/agentsStore.js +0 -270
  296. package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.d.ts +0 -0
  297. package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.d.ts +0 -0
  298. package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.d.ts +0 -0
  299. package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.js +0 -0
@@ -31,7 +31,9 @@ export function getAgentSpecs(agentId) {
31
31
  */
32
32
  export function listAgentSpecs(prefix) {
33
33
  const specs = Object.values(AGENT_SPECS);
34
- return prefix !== undefined ? specs.filter(s => s.id.startsWith(prefix)) : specs;
34
+ return prefix !== undefined
35
+ ? specs.filter(s => s.id.startsWith(prefix))
36
+ : specs;
35
37
  }
36
38
  /**
37
39
  * Collect all required environment variables for an agent spec.
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Benchmark Catalog
3
+ *
4
+ * Predefined evaluation benchmark configurations.
5
+ *
6
+ * This file is AUTO-GENERATED from YAML specifications.
7
+ * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.
8
+ */
9
+ import type { BenchmarkSpec } from '../types';
10
+ export declare const AGENTBENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
11
+ export declare const GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
12
+ export declare const HUMANEVAL_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
13
+ export declare const MMLU_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
14
+ export declare const SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
15
+ export declare const SWE_BENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
16
+ export declare const TOOLBENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
17
+ export declare const TRUTHFULQA_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
18
+ export declare const BENCHMARK_CATALOG: Record<string, BenchmarkSpec>;
19
+ export declare function getBenchmarkSpecs(): BenchmarkSpec[];
20
+ export declare function getBenchmarkSpec(benchmarkId: string): BenchmarkSpec | undefined;
@@ -0,0 +1,205 @@
1
+ /*
2
+ * Copyright (c) 2025-2026 Datalayer, Inc.
3
+ * Distributed under the terms of the Modified BSD License.
4
+ */
5
+ // ============================================================================
6
+ // Benchmark Definitions
7
+ // ============================================================================
8
+ export const AGENTBENCH_BENCHMARK_SPEC_0_0_1 = {
9
+ id: 'agentbench',
10
+ version: '0.0.1',
11
+ name: 'AgentBench',
12
+ description: 'Multi-dimensional LLM-as-agent evaluation across 8 diverse environments including web browsing, operating system interaction, database queries, digital card games, lateral thinking, and household tasks.',
13
+ category: 'Agentic',
14
+ task_count: 4080,
15
+ metric: 'success_rate',
16
+ source: 'https://github.com/THUDM/AgentBench',
17
+ difficulty: 'hard',
18
+ languages: ['python', 'sql', 'bash'],
19
+ dataset_source: 'hosted',
20
+ supports_live_monitoring: true,
21
+ supports_experiment_comparison: true,
22
+ evaluator_shapes: ['pass_rate', 'numeric'],
23
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
24
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
25
+ trace_integration: true,
26
+ dataset_editability: 'read-only',
27
+ sdk_support: 'experimental',
28
+ };
29
+ export const GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1 = {
30
+ id: 'gpqa-diamond',
31
+ version: '0.0.1',
32
+ name: 'GPQA Diamond',
33
+ description: 'Graduate-level science questions crafted by domain experts. Tests advanced reasoning in physics, chemistry, and biology with questions that require PhD-level understanding to answer correctly.',
34
+ category: 'Knowledge',
35
+ task_count: 448,
36
+ metric: 'accuracy',
37
+ source: 'https://github.com/idavidrein/gpqa',
38
+ difficulty: 'expert',
39
+ languages: ['english'],
40
+ dataset_source: 'hosted',
41
+ supports_live_monitoring: false,
42
+ supports_experiment_comparison: true,
43
+ evaluator_shapes: ['numeric'],
44
+ evaluators: ['precision-recall-evaluator:0.0.1'],
45
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
46
+ trace_integration: true,
47
+ dataset_editability: 'read-only',
48
+ sdk_support: 'experimental',
49
+ };
50
+ export const HUMANEVAL_BENCHMARK_SPEC_0_0_1 = {
51
+ id: 'humaneval',
52
+ version: '0.0.1',
53
+ name: 'HumanEval',
54
+ description: 'Python function implementation from docstrings. Measures functional correctness of code generation by testing against hand-written test cases. Widely used as a baseline for code generation benchmarks.',
55
+ category: 'Coding',
56
+ task_count: 164,
57
+ metric: 'pass@k',
58
+ source: 'https://github.com/openai/human-eval',
59
+ difficulty: 'medium',
60
+ languages: ['python'],
61
+ dataset_source: 'hosted',
62
+ supports_live_monitoring: false,
63
+ supports_experiment_comparison: true,
64
+ evaluator_shapes: ['pass_rate'],
65
+ evaluators: ['precision-recall-evaluator:0.0.1'],
66
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
67
+ trace_integration: true,
68
+ dataset_editability: 'read-only',
69
+ sdk_support: 'experimental',
70
+ };
71
+ export const MMLU_BENCHMARK_SPEC_0_0_1 = {
72
+ id: 'mmlu',
73
+ version: '0.0.1',
74
+ name: 'MMLU',
75
+ description: 'Massive Multitask Language Understanding: 57-subject knowledge benchmark spanning STEM, humanities, social sciences, and more. Tests broad knowledge and reasoning across diverse academic domains.',
76
+ category: 'Knowledge',
77
+ task_count: 15908,
78
+ metric: 'accuracy',
79
+ source: 'https://github.com/hendrycks/test',
80
+ difficulty: 'medium',
81
+ languages: ['english'],
82
+ dataset_source: 'hosted',
83
+ supports_live_monitoring: false,
84
+ supports_experiment_comparison: true,
85
+ evaluator_shapes: ['numeric'],
86
+ evaluators: ['precision-recall-evaluator:0.0.1'],
87
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
88
+ trace_integration: true,
89
+ dataset_editability: 'read-only',
90
+ sdk_support: 'experimental',
91
+ };
92
+ export const SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1 = {
93
+ id: 'swe-bench-verified',
94
+ version: '0.0.1',
95
+ name: 'SWE-bench Verified',
96
+ description: 'Human-validated subset of SWE-bench with verified ground-truth patches. Provides higher confidence evaluation of software engineering capabilities by eliminating ambiguous or flawed test cases from the full benchmark.',
97
+ category: 'Coding',
98
+ task_count: 500,
99
+ metric: 'pass@1',
100
+ source: 'https://www.swebench.com/',
101
+ difficulty: 'hard',
102
+ languages: ['python'],
103
+ dataset_source: 'hosted',
104
+ supports_live_monitoring: true,
105
+ supports_experiment_comparison: true,
106
+ evaluator_shapes: ['pass_rate'],
107
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
108
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
109
+ trace_integration: true,
110
+ dataset_editability: 'read-only',
111
+ sdk_support: 'experimental',
112
+ };
113
+ export const SWE_BENCH_BENCHMARK_SPEC_0_0_1 = {
114
+ id: 'swe-bench',
115
+ version: '0.0.1',
116
+ name: 'SWE-bench',
117
+ description: "Real-world software engineering tasks from GitHub issues. Tests an agent's ability to understand bug reports and feature requests, then produce working code patches that pass existing test suites.",
118
+ category: 'Coding',
119
+ task_count: 2294,
120
+ metric: 'pass@1',
121
+ source: 'https://www.swebench.com/',
122
+ difficulty: 'hard',
123
+ languages: ['python'],
124
+ dataset_source: 'hosted',
125
+ supports_live_monitoring: true,
126
+ supports_experiment_comparison: true,
127
+ evaluator_shapes: ['pass_rate'],
128
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
129
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
130
+ trace_integration: true,
131
+ dataset_editability: 'read-only',
132
+ sdk_support: 'experimental',
133
+ };
134
+ export const TOOLBENCH_BENCHMARK_SPEC_0_0_1 = {
135
+ id: 'toolbench',
136
+ version: '0.0.1',
137
+ name: 'ToolBench',
138
+ description: 'Large-scale benchmark for tool-augmented LLMs covering 16000+ real-world APIs across 49 categories. Evaluates multi-step tool usage, API selection, argument generation, and response parsing in complex, chained workflows.',
139
+ category: 'Agentic',
140
+ task_count: 12657,
141
+ metric: 'pass_rate',
142
+ source: 'https://github.com/OpenBMB/ToolBench',
143
+ difficulty: 'hard',
144
+ languages: ['python', 'json'],
145
+ dataset_source: 'hosted',
146
+ supports_live_monitoring: true,
147
+ supports_experiment_comparison: true,
148
+ evaluator_shapes: ['pass_rate', 'numeric'],
149
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
150
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
151
+ trace_integration: true,
152
+ dataset_editability: 'read-only',
153
+ sdk_support: 'experimental',
154
+ };
155
+ export const TRUTHFULQA_BENCHMARK_SPEC_0_0_1 = {
156
+ id: 'truthfulqa',
157
+ version: '0.0.1',
158
+ name: 'TruthfulQA',
159
+ description: 'Benchmark measuring whether a language model generates truthful answers to questions spanning 38 categories including health, law, finance, and politics. Designed to test resilience against common human misconceptions and falsehoods that models may have learned from training data.',
160
+ category: 'Safety',
161
+ task_count: 817,
162
+ metric: 'truthful_informative',
163
+ source: 'https://github.com/sylinrl/TruthfulQA',
164
+ difficulty: 'medium',
165
+ languages: ['english'],
166
+ dataset_source: 'hosted',
167
+ supports_live_monitoring: false,
168
+ supports_experiment_comparison: true,
169
+ evaluator_shapes: ['categorical', 'numeric'],
170
+ evaluators: ['llm-judge:0.0.1'],
171
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
172
+ trace_integration: true,
173
+ dataset_editability: 'read-only',
174
+ sdk_support: 'experimental',
175
+ };
176
+ // ============================================================================
177
+ // Benchmark Catalog
178
+ // ============================================================================
179
+ export const BENCHMARK_CATALOG = {
180
+ agentbench: AGENTBENCH_BENCHMARK_SPEC_0_0_1,
181
+ 'gpqa-diamond': GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1,
182
+ humaneval: HUMANEVAL_BENCHMARK_SPEC_0_0_1,
183
+ mmlu: MMLU_BENCHMARK_SPEC_0_0_1,
184
+ 'swe-bench-verified': SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1,
185
+ 'swe-bench': SWE_BENCH_BENCHMARK_SPEC_0_0_1,
186
+ toolbench: TOOLBENCH_BENCHMARK_SPEC_0_0_1,
187
+ truthfulqa: TRUTHFULQA_BENCHMARK_SPEC_0_0_1,
188
+ };
189
+ export function getBenchmarkSpecs() {
190
+ return Object.values(BENCHMARK_CATALOG);
191
+ }
192
+ function resolveBenchmarkId(benchmarkId) {
193
+ if (benchmarkId in BENCHMARK_CATALOG)
194
+ return benchmarkId;
195
+ const idx = benchmarkId.lastIndexOf(':');
196
+ if (idx > 0) {
197
+ const base = benchmarkId.slice(0, idx);
198
+ if (base in BENCHMARK_CATALOG)
199
+ return base;
200
+ }
201
+ return benchmarkId;
202
+ }
203
+ export function getBenchmarkSpec(benchmarkId) {
204
+ return BENCHMARK_CATALOG[resolveBenchmarkId(benchmarkId)];
205
+ }
@@ -11,7 +11,7 @@ export const ALPHAVANTAGE_API_KEY_SPEC_0_0_1 = {
11
11
  name: 'Alpha Vantage API Key',
12
12
  description: 'API key for accessing Alpha Vantage financial market data and stock information. Provides real-time and historical stock prices, forex data, and cryptocurrency information.',
13
13
  registrationUrl: 'https://www.alphavantage.co/support/#api-key',
14
- tags: ["authentication", "api-key", "finance", "stocks", "market-data"],
14
+ tags: ['authentication', 'api-key', 'finance', 'stocks', 'market-data'],
15
15
  icon: 'key',
16
16
  emoji: '🔑',
17
17
  };
@@ -21,7 +21,7 @@ export const GITHUB_TOKEN_SPEC_0_0_1 = {
21
21
  name: 'GitHub Token',
22
22
  description: 'GitHub API token for repository management and code operations. Required for GitHub MCP server and GitHub skill to interact with GitHub repositories programmatically.',
23
23
  registrationUrl: 'https://github.com/settings/tokens',
24
- tags: ["authentication", "token", "github", "git", "mcp-server", "skill"],
24
+ tags: ['authentication', 'token', 'github', 'git', 'mcp-server', 'skill'],
25
25
  icon: 'key',
26
26
  emoji: '🔑',
27
27
  };
@@ -31,7 +31,7 @@ export const GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1 = {
31
31
  name: 'Google OAuth Client ID',
32
32
  description: 'OAuth 2.0 client ID for Google Workspace authentication. Required for Google Drive, Gmail, Calendar, and Docs integration through the Google Workspace MCP server.',
33
33
  registrationUrl: 'https://console.cloud.google.com/apis/credentials',
34
- tags: ["authentication", "oauth", "google", "workspace", "client-id"],
34
+ tags: ['authentication', 'oauth', 'google', 'workspace', 'client-id'],
35
35
  icon: 'key',
36
36
  emoji: '🔑',
37
37
  };
@@ -41,7 +41,14 @@ export const GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1 = {
41
41
  name: 'Google OAuth Client Secret',
42
42
  description: 'OAuth 2.0 client secret for Google Workspace authentication. Used in conjunction with client ID for secure API access to Google services.',
43
43
  registrationUrl: 'https://console.cloud.google.com/apis/credentials',
44
- tags: ["authentication", "oauth", "google", "workspace", "client-secret", "security"],
44
+ tags: [
45
+ 'authentication',
46
+ 'oauth',
47
+ 'google',
48
+ 'workspace',
49
+ 'client-secret',
50
+ 'security',
51
+ ],
45
52
  icon: 'lock',
46
53
  emoji: '🔒',
47
54
  };
@@ -51,7 +58,7 @@ export const HF_TOKEN_SPEC_0_0_1 = {
51
58
  name: 'Hugging Face Token',
52
59
  description: 'Access token for Hugging Face API. Required for Hugging Face MCP server authentication. Create a READ token from your settings.',
53
60
  registrationUrl: 'https://huggingface.co/settings/tokens',
54
- tags: ["authentication", "api-key", "huggingface", "machine-learning"],
61
+ tags: ['authentication', 'api-key', 'huggingface', 'machine-learning'],
55
62
  icon: 'key',
56
63
  emoji: '🔑',
57
64
  };
@@ -61,7 +68,7 @@ export const KAGGLE_TOKEN_SPEC_0_0_1 = {
61
68
  name: 'Kaggle API Token',
62
69
  description: 'API token for accessing Kaggle datasets, competitions, notebooks, and models. Required for Kaggle MCP server authentication.',
63
70
  registrationUrl: 'https://www.kaggle.com/settings/account',
64
- tags: ["authentication", "api-key", "kaggle", "data"],
71
+ tags: ['authentication', 'api-key', 'kaggle', 'data'],
65
72
  icon: 'key',
66
73
  emoji: '🔑',
67
74
  };
@@ -71,7 +78,7 @@ export const SLACK_BOT_TOKEN_SPEC_0_0_1 = {
71
78
  name: 'Slack Bot Token',
72
79
  description: 'OAuth token for Slack bot authentication. Required for Slack MCP server to send messages, manage channels, and interact with workspace members.',
73
80
  registrationUrl: 'https://api.slack.com/apps',
74
- tags: ["authentication", "oauth", "token", "slack", "messaging", "bot"],
81
+ tags: ['authentication', 'oauth', 'token', 'slack', 'messaging', 'bot'],
75
82
  icon: 'key',
76
83
  emoji: '🔑',
77
84
  };
@@ -80,7 +87,7 @@ export const SLACK_CHANNEL_IDS_SPEC_0_0_1 = {
80
87
  version: '0.0.1',
81
88
  name: 'Slack Channel IDs',
82
89
  description: 'Comma-separated list of Slack channel IDs that the bot is allowed to access. Restricts bot operations to specific channels for security and organization.',
83
- tags: ["configuration", "slack", "channels", "identifier"],
90
+ tags: ['configuration', 'slack', 'channels', 'identifier'],
84
91
  icon: 'hash',
85
92
  emoji: undefined,
86
93
  };
@@ -90,7 +97,7 @@ export const SLACK_TEAM_ID_SPEC_0_0_1 = {
90
97
  name: 'Slack Team ID',
91
98
  description: 'Unique identifier for the Slack workspace (team). Required to specify which workspace the bot should connect to.',
92
99
  registrationUrl: 'https://api.slack.com/apps',
93
- tags: ["configuration", "slack", "workspace", "identifier"],
100
+ tags: ['configuration', 'slack', 'workspace', 'identifier'],
94
101
  icon: 'organization',
95
102
  emoji: '🏢',
96
103
  };
@@ -100,7 +107,7 @@ export const TAVILY_API_KEY_SPEC_0_0_1 = {
100
107
  name: 'Tavily API Key',
101
108
  description: 'API key for Tavily web search and research capabilities. Required for web crawling, content extraction, and search operations.',
102
109
  registrationUrl: 'https://tavily.com/api-keys',
103
- tags: ["authentication", "api-key", "search", "web", "research"],
110
+ tags: ['authentication', 'api-key', 'search', 'web', 'research'],
104
111
  icon: 'key',
105
112
  emoji: '🔑',
106
113
  };
@@ -108,16 +115,16 @@ export const TAVILY_API_KEY_SPEC_0_0_1 = {
108
115
  // Environment Variable Catalog
109
116
  // ============================================================================
110
117
  export const ENVVAR_CATALOG = {
111
- 'ALPHAVANTAGE_API_KEY': ALPHAVANTAGE_API_KEY_SPEC_0_0_1,
112
- 'GITHUB_TOKEN': GITHUB_TOKEN_SPEC_0_0_1,
113
- 'GOOGLE_OAUTH_CLIENT_ID': GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1,
114
- 'GOOGLE_OAUTH_CLIENT_SECRET': GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1,
115
- 'HF_TOKEN': HF_TOKEN_SPEC_0_0_1,
116
- 'KAGGLE_TOKEN': KAGGLE_TOKEN_SPEC_0_0_1,
117
- 'SLACK_BOT_TOKEN': SLACK_BOT_TOKEN_SPEC_0_0_1,
118
- 'SLACK_CHANNEL_IDS': SLACK_CHANNEL_IDS_SPEC_0_0_1,
119
- 'SLACK_TEAM_ID': SLACK_TEAM_ID_SPEC_0_0_1,
120
- 'TAVILY_API_KEY': TAVILY_API_KEY_SPEC_0_0_1,
118
+ ALPHAVANTAGE_API_KEY: ALPHAVANTAGE_API_KEY_SPEC_0_0_1,
119
+ GITHUB_TOKEN: GITHUB_TOKEN_SPEC_0_0_1,
120
+ GOOGLE_OAUTH_CLIENT_ID: GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1,
121
+ GOOGLE_OAUTH_CLIENT_SECRET: GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1,
122
+ HF_TOKEN: HF_TOKEN_SPEC_0_0_1,
123
+ KAGGLE_TOKEN: KAGGLE_TOKEN_SPEC_0_0_1,
124
+ SLACK_BOT_TOKEN: SLACK_BOT_TOKEN_SPEC_0_0_1,
125
+ SLACK_CHANNEL_IDS: SLACK_CHANNEL_IDS_SPEC_0_0_1,
126
+ SLACK_TEAM_ID: SLACK_TEAM_ID_SPEC_0_0_1,
127
+ TAVILY_API_KEY: TAVILY_API_KEY_SPEC_0_0_1,
121
128
  };
122
129
  function resolveEnvvarId(envvarId) {
123
130
  if (envvarId in ENVVAR_CATALOG)
@@ -1,20 +1,21 @@
1
1
  /**
2
2
  * Eval Catalog
3
3
  *
4
- * Predefined evaluation benchmark configurations.
4
+ * Predefined built-in evaluator configurations.
5
5
  *
6
6
  * This file is AUTO-GENERATED from YAML specifications.
7
7
  * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.
8
8
  */
9
9
  import type { EvalSpec } from '../types';
10
- export declare const AGENTBENCH_EVAL_SPEC_0_0_1: EvalSpec;
11
- export declare const GPQA_DIAMOND_EVAL_SPEC_0_0_1: EvalSpec;
12
- export declare const HUMANEVAL_EVAL_SPEC_0_0_1: EvalSpec;
13
- export declare const MMLU_EVAL_SPEC_0_0_1: EvalSpec;
14
- export declare const SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1: EvalSpec;
15
- export declare const SWE_BENCH_EVAL_SPEC_0_0_1: EvalSpec;
16
- export declare const TOOLBENCH_EVAL_SPEC_0_0_1: EvalSpec;
17
- export declare const TRUTHFULQA_EVAL_SPEC_0_0_1: EvalSpec;
10
+ export declare const CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1: EvalSpec;
11
+ export declare const CONTAINS_EVAL_SPEC_0_0_1: EvalSpec;
12
+ export declare const EQUALS_EXPECTED_EVAL_SPEC_0_0_1: EvalSpec;
13
+ export declare const EQUALS_EVAL_SPEC_0_0_1: EvalSpec;
14
+ export declare const HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1: EvalSpec;
15
+ export declare const IS_INSTANCE_EVAL_SPEC_0_0_1: EvalSpec;
16
+ export declare const LLM_JUDGE_EVAL_SPEC_0_0_1: EvalSpec;
17
+ export declare const MAX_DURATION_EVAL_SPEC_0_0_1: EvalSpec;
18
+ export declare const PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1: EvalSpec;
18
19
  export declare const EVAL_CATALOG: Record<string, EvalSpec>;
19
20
  export declare function getEvalSpecs(): EvalSpec[];
20
21
  export declare function getEvalSpec(evalId: string): EvalSpec | undefined;
@@ -5,114 +5,154 @@
5
5
  // ============================================================================
6
6
  // Eval Definitions
7
7
  // ============================================================================
8
- export const AGENTBENCH_EVAL_SPEC_0_0_1 = {
9
- id: 'agentbench',
8
+ export const CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1 = {
9
+ id: 'confusion-matrix-evaluator',
10
10
  version: '0.0.1',
11
- name: 'AgentBench',
12
- description: 'Multi-dimensional LLM-as-agent evaluation across 8 diverse environments including web browsing, operating system interaction, database queries, digital card games, lateral thinking, and household tasks.',
13
- category: 'Agentic',
14
- task_count: 4080,
15
- metric: 'success_rate',
16
- source: 'https://github.com/THUDM/AgentBench',
17
- difficulty: 'hard',
18
- languages: ['python', 'sql', 'bash'],
11
+ name: 'Confusion Matrix Evaluator',
12
+ description: 'Aggregate evaluator for precision/recall style confusion-matrix reporting.',
13
+ category: 'Report',
14
+ evaluator_type: 'report',
15
+ pydantic_class: 'ConfusionMatrixEvaluator',
16
+ output_kind: 'report_table',
17
+ cost_tier: 'free',
18
+ latency: 'fast',
19
+ requires: ['expected_output'],
20
+ source: 'https://ai.pydantic.dev/evals/',
21
+ default_config: {},
19
22
  };
20
- export const GPQA_DIAMOND_EVAL_SPEC_0_0_1 = {
21
- id: 'gpqa-diamond',
23
+ export const CONTAINS_EVAL_SPEC_0_0_1 = {
24
+ id: 'contains',
22
25
  version: '0.0.1',
23
- name: 'GPQA Diamond',
24
- description: 'Graduate-level science questions crafted by domain experts. Tests advanced reasoning in physics, chemistry, and biology with questions that require PhD-level understanding to answer correctly.',
25
- category: 'Reasoning',
26
- task_count: 448,
27
- metric: 'accuracy',
28
- source: 'https://github.com/idavidrein/gpqa',
29
- difficulty: 'expert',
30
- languages: ['english'],
26
+ name: 'Contains',
27
+ description: 'Assert that expected content appears in the model output.',
28
+ category: 'Comparison',
29
+ evaluator_type: 'case',
30
+ pydantic_class: 'ContainsEvaluator',
31
+ output_kind: 'boolean',
32
+ cost_tier: 'free',
33
+ latency: 'instant',
34
+ requires: ['expected_output'],
35
+ source: 'https://ai.pydantic.dev/evals/',
36
+ default_config: {},
31
37
  };
32
- export const HUMANEVAL_EVAL_SPEC_0_0_1 = {
33
- id: 'humaneval',
38
+ export const EQUALS_EXPECTED_EVAL_SPEC_0_0_1 = {
39
+ id: 'equals-expected',
34
40
  version: '0.0.1',
35
- name: 'HumanEval',
36
- description: 'Python function implementation from docstrings. Measures functional correctness of code generation by testing against hand-written test cases. Widely used as a baseline for code generation benchmarks.',
37
- category: 'Coding',
38
- task_count: 164,
39
- metric: 'pass@k',
40
- source: 'https://github.com/openai/human-eval',
41
- difficulty: 'medium',
42
- languages: ['python'],
41
+ name: 'Equals Expected',
42
+ description: 'Compare model output against an expected value with strict matching.',
43
+ category: 'Comparison',
44
+ evaluator_type: 'case',
45
+ pydantic_class: 'EqualsExpectedEvaluator',
46
+ output_kind: 'boolean',
47
+ cost_tier: 'free',
48
+ latency: 'instant',
49
+ requires: ['expected_output'],
50
+ source: 'https://ai.pydantic.dev/evals/',
51
+ default_config: {},
43
52
  };
44
- export const MMLU_EVAL_SPEC_0_0_1 = {
45
- id: 'mmlu',
53
+ export const EQUALS_EVAL_SPEC_0_0_1 = {
54
+ id: 'equals',
46
55
  version: '0.0.1',
47
- name: 'MMLU',
48
- description: 'Massive Multitask Language Understanding: 57-subject knowledge benchmark spanning STEM, humanities, social sciences, and more. Tests broad knowledge and reasoning across diverse academic domains.',
49
- category: 'Knowledge',
50
- task_count: 15908,
51
- metric: 'accuracy',
52
- source: 'https://github.com/hendrycks/test',
53
- difficulty: 'medium',
54
- languages: ['english'],
56
+ name: 'Equals',
57
+ description: 'Assert exact equality between expected and actual values.',
58
+ category: 'Comparison',
59
+ evaluator_type: 'case',
60
+ pydantic_class: 'EqualsEvaluator',
61
+ output_kind: 'boolean',
62
+ cost_tier: 'free',
63
+ latency: 'instant',
64
+ requires: ['expected_output'],
65
+ source: 'https://ai.pydantic.dev/evals/',
66
+ default_config: {},
55
67
  };
56
- export const SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1 = {
57
- id: 'swe-bench-verified',
68
+ export const HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1 = {
69
+ id: 'has-matching-span',
58
70
  version: '0.0.1',
59
- name: 'SWE-bench Verified',
60
- description: 'Human-validated subset of SWE-bench with verified ground-truth patches. Provides higher confidence evaluation of software engineering capabilities by eliminating ambiguous or flawed test cases from the full benchmark.',
61
- category: 'Coding',
62
- task_count: 500,
63
- metric: 'pass@1',
64
- source: 'https://www.swebench.com/',
65
- difficulty: 'hard',
66
- languages: ['python'],
71
+ name: 'Has Matching Span',
72
+ description: 'Validate expected spans in structured traces and tool-call transcripts.',
73
+ category: 'Span-Based',
74
+ evaluator_type: 'case',
75
+ pydantic_class: 'HasMatchingSpanEvaluator',
76
+ output_kind: 'boolean',
77
+ cost_tier: 'free',
78
+ latency: 'fast',
79
+ requires: ['trace'],
80
+ source: 'https://ai.pydantic.dev/evals/',
81
+ default_config: {},
67
82
  };
68
- export const SWE_BENCH_EVAL_SPEC_0_0_1 = {
69
- id: 'swe-bench',
83
+ export const IS_INSTANCE_EVAL_SPEC_0_0_1 = {
84
+ id: 'is-instance',
70
85
  version: '0.0.1',
71
- name: 'SWE-bench',
72
- description: 'Real-world software engineering tasks from GitHub issues. Tests an agent\'s ability to understand bug reports and feature requests, then produce working code patches that pass existing test suites.',
73
- category: 'Coding',
74
- task_count: 2294,
75
- metric: 'pass@1',
76
- source: 'https://www.swebench.com/',
77
- difficulty: 'hard',
78
- languages: ['python'],
86
+ name: 'Is Instance',
87
+ description: 'Validate output type against an expected Python/JSON schema type.',
88
+ category: 'Type Validation',
89
+ evaluator_type: 'case',
90
+ pydantic_class: 'IsInstanceEvaluator',
91
+ output_kind: 'boolean',
92
+ cost_tier: 'free',
93
+ latency: 'instant',
94
+ requires: ['expected_type'],
95
+ source: 'https://ai.pydantic.dev/evals/',
96
+ default_config: {},
79
97
  };
80
- export const TOOLBENCH_EVAL_SPEC_0_0_1 = {
81
- id: 'toolbench',
98
+ export const LLM_JUDGE_EVAL_SPEC_0_0_1 = {
99
+ id: 'llm-judge',
82
100
  version: '0.0.1',
83
- name: 'ToolBench',
84
- description: 'Large-scale benchmark for tool-augmented LLMs covering 16000+ real-world APIs across 49 categories. Evaluates multi-step tool usage, API selection, argument generation, and response parsing in complex, chained workflows.',
85
- category: 'Agentic',
86
- task_count: 12657,
87
- metric: 'pass_rate',
88
- source: 'https://github.com/OpenBMB/ToolBench',
89
- difficulty: 'hard',
90
- languages: ['python', 'json'],
101
+ name: 'LLM Judge',
102
+ description: 'Use an LLM-as-a-judge prompt to score quality and provide rationale.',
103
+ category: 'LLM-as-a-Judge',
104
+ evaluator_type: 'case',
105
+ pydantic_class: 'LLMJudgeEvaluator',
106
+ output_kind: 'score_and_assertion',
107
+ cost_tier: 'llm',
108
+ latency: 'slow',
109
+ requires: ['model'],
110
+ source: 'https://ai.pydantic.dev/evals/',
111
+ default_config: { threshold: 0.7 },
91
112
  };
92
- export const TRUTHFULQA_EVAL_SPEC_0_0_1 = {
93
- id: 'truthfulqa',
113
+ export const MAX_DURATION_EVAL_SPEC_0_0_1 = {
114
+ id: 'max-duration',
94
115
  version: '0.0.1',
95
- name: 'TruthfulQA',
96
- description: 'Benchmark measuring whether a language model generates truthful answers to questions spanning 38 categories including health, law, finance, and politics. Designed to test resilience against common human misconceptions and falsehoods that models may have learned from training data.',
97
- category: 'Safety',
98
- task_count: 817,
99
- metric: 'truthful_informative',
100
- source: 'https://github.com/sylinrl/TruthfulQA',
101
- difficulty: 'medium',
102
- languages: ['english'],
116
+ name: 'Max Duration',
117
+ description: 'Assert response latency remains below a configured duration threshold.',
118
+ category: 'Performance',
119
+ evaluator_type: 'case',
120
+ pydantic_class: 'MaxDurationEvaluator',
121
+ output_kind: 'boolean_with_reason',
122
+ cost_tier: 'free',
123
+ latency: 'instant',
124
+ requires: ['duration_ms'],
125
+ source: 'https://ai.pydantic.dev/evals/',
126
+ default_config: { max_duration_ms: 5000 },
127
+ };
128
+ export const PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1 = {
129
+ id: 'precision-recall-evaluator',
130
+ version: '0.0.1',
131
+ name: 'Precision Recall Evaluator',
132
+ description: 'Aggregate evaluator for precision, recall, and pass-rate style benchmark reporting.',
133
+ category: 'Report',
134
+ evaluator_type: 'report',
135
+ pydantic_class: 'PrecisionRecallEvaluator',
136
+ output_kind: 'report_curve',
137
+ cost_tier: 'free',
138
+ latency: 'fast',
139
+ requires: ['expected_output'],
140
+ source: 'https://ai.pydantic.dev/evals/',
141
+ default_config: {},
103
142
  };
104
143
  // ============================================================================
105
144
  // Eval Catalog
106
145
  // ============================================================================
107
146
  export const EVAL_CATALOG = {
108
- 'agentbench': AGENTBENCH_EVAL_SPEC_0_0_1,
109
- 'gpqa-diamond': GPQA_DIAMOND_EVAL_SPEC_0_0_1,
110
- 'humaneval': HUMANEVAL_EVAL_SPEC_0_0_1,
111
- 'mmlu': MMLU_EVAL_SPEC_0_0_1,
112
- 'swe-bench-verified': SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1,
113
- 'swe-bench': SWE_BENCH_EVAL_SPEC_0_0_1,
114
- 'toolbench': TOOLBENCH_EVAL_SPEC_0_0_1,
115
- 'truthfulqa': TRUTHFULQA_EVAL_SPEC_0_0_1,
147
+ 'confusion-matrix-evaluator': CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1,
148
+ contains: CONTAINS_EVAL_SPEC_0_0_1,
149
+ 'equals-expected': EQUALS_EXPECTED_EVAL_SPEC_0_0_1,
150
+ equals: EQUALS_EVAL_SPEC_0_0_1,
151
+ 'has-matching-span': HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1,
152
+ 'is-instance': IS_INSTANCE_EVAL_SPEC_0_0_1,
153
+ 'llm-judge': LLM_JUDGE_EVAL_SPEC_0_0_1,
154
+ 'max-duration': MAX_DURATION_EVAL_SPEC_0_0_1,
155
+ 'precision-recall-evaluator': PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1,
116
156
  };
117
157
  export function getEvalSpecs() {
118
158
  return Object.values(EVAL_CATALOG);