@datalayer/agent-runtimes 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. package/README.md +182 -1
  2. package/lib/AgentNode.d.ts +3 -0
  3. package/lib/AgentNode.js +676 -0
  4. package/lib/App.js +1 -1
  5. package/lib/agent-node/themeStore.d.ts +3 -0
  6. package/lib/agent-node/themeStore.js +156 -0
  7. package/lib/agent-node-main.d.ts +1 -0
  8. package/lib/agent-node-main.js +14 -0
  9. package/lib/agents/AgentDetails.d.ts +22 -1
  10. package/lib/agents/AgentDetails.js +34 -47
  11. package/lib/api/index.d.ts +0 -1
  12. package/lib/api/index.js +4 -2
  13. package/lib/chat/Chat.d.ts +5 -106
  14. package/lib/chat/Chat.js +20 -14
  15. package/lib/chat/ChatFloating.d.ts +7 -140
  16. package/lib/chat/ChatFloating.js +3 -3
  17. package/lib/chat/ChatPopupStandalone.d.ts +8 -47
  18. package/lib/chat/ChatPopupStandalone.js +3 -3
  19. package/lib/chat/ChatSidebar.d.ts +4 -69
  20. package/lib/chat/ChatSidebar.js +83 -51
  21. package/lib/chat/ChatStandalone.d.ts +4 -54
  22. package/lib/chat/ChatStandalone.js +3 -3
  23. package/lib/chat/base/ChatBase.js +1414 -174
  24. package/lib/chat/display/FloatingBrandButton.js +8 -1
  25. package/lib/chat/header/ChatHeader.d.ts +3 -1
  26. package/lib/chat/header/ChatHeader.js +15 -12
  27. package/lib/chat/header/ChatHeaderBase.d.ts +30 -5
  28. package/lib/chat/header/ChatHeaderBase.js +41 -16
  29. package/lib/chat/indicators/McpStatusIndicator.d.ts +7 -4
  30. package/lib/chat/indicators/McpStatusIndicator.js +7 -32
  31. package/lib/chat/indicators/SandboxStatusIndicator.d.ts +4 -1
  32. package/lib/chat/indicators/SandboxStatusIndicator.js +91 -56
  33. package/lib/chat/indicators/SkillsStatusIndicator.d.ts +7 -0
  34. package/lib/chat/indicators/SkillsStatusIndicator.js +88 -0
  35. package/lib/chat/indicators/index.d.ts +1 -0
  36. package/lib/chat/indicators/index.js +1 -0
  37. package/lib/chat/messages/ChatMessageList.d.ts +1 -1
  38. package/lib/chat/messages/ChatMessageList.js +154 -114
  39. package/lib/chat/messages/ChatMessages.js +6 -2
  40. package/lib/chat/prompt/InputFooter.d.ts +21 -6
  41. package/lib/chat/prompt/InputFooter.js +76 -20
  42. package/lib/chat/prompt/InputPrompt.d.ts +5 -1
  43. package/lib/chat/prompt/InputPrompt.js +4 -4
  44. package/lib/chat/prompt/InputPromptFooter.d.ts +3 -1
  45. package/lib/chat/prompt/InputPromptFooter.js +3 -3
  46. package/lib/chat/prompt/InputPromptLexical.d.ts +3 -1
  47. package/lib/chat/prompt/InputPromptLexical.js +12 -5
  48. package/lib/chat/prompt/InputPromptText.d.ts +3 -1
  49. package/lib/chat/prompt/InputPromptText.js +2 -2
  50. package/lib/chat/tools/ToolApprovalBanner.js +1 -1
  51. package/lib/chat/tools/ToolCallDisplay.d.ts +3 -1
  52. package/lib/chat/tools/ToolCallDisplay.js +2 -2
  53. package/lib/chat/usage/TokenUsageBar.js +20 -2
  54. package/lib/client/AgentRuntimesClientContext.d.ts +53 -0
  55. package/lib/client/AgentRuntimesClientContext.js +55 -0
  56. package/lib/client/AgentsMixin.d.ts +0 -18
  57. package/lib/client/AgentsMixin.js +20 -30
  58. package/lib/client/IAgentRuntimesClient.d.ts +215 -0
  59. package/lib/client/IAgentRuntimesClient.js +5 -0
  60. package/lib/client/SdkAgentRuntimesClient.d.ts +151 -0
  61. package/lib/client/SdkAgentRuntimesClient.js +134 -0
  62. package/lib/client/index.d.ts +4 -1
  63. package/lib/client/index.js +3 -1
  64. package/lib/components/NotificationEventCard.js +5 -1
  65. package/lib/config/AgentConfiguration.d.ts +22 -0
  66. package/lib/config/AgentConfiguration.js +319 -64
  67. package/lib/context/ContextDistribution.d.ts +3 -1
  68. package/lib/context/ContextDistribution.js +8 -27
  69. package/lib/context/ContextInspector.d.ts +3 -1
  70. package/lib/context/ContextInspector.js +19 -67
  71. package/lib/context/ContextPanel.d.ts +3 -1
  72. package/lib/context/ContextPanel.js +104 -64
  73. package/lib/context/ContextUsage.d.ts +3 -1
  74. package/lib/context/ContextUsage.js +3 -3
  75. package/lib/context/CostTracker.d.ts +9 -3
  76. package/lib/context/CostTracker.js +26 -47
  77. package/lib/context/CostUsageChart.d.ts +12 -0
  78. package/lib/context/CostUsageChart.js +378 -0
  79. package/lib/context/GraphFlowChart.d.ts +16 -0
  80. package/lib/context/GraphFlowChart.js +182 -0
  81. package/lib/context/TokenUsageChart.d.ts +8 -1
  82. package/lib/context/TokenUsageChart.js +349 -211
  83. package/lib/context/TurnGraphChart.d.ts +39 -0
  84. package/lib/context/TurnGraphChart.js +538 -0
  85. package/lib/context/otelWsPool.d.ts +20 -0
  86. package/lib/context/otelWsPool.js +69 -0
  87. package/lib/examples/A2UiComponentGalleryExample.d.ts +0 -17
  88. package/lib/examples/A2UiComponentGalleryExample.js +315 -522
  89. package/lib/examples/A2UiContactCardExample.d.ts +0 -18
  90. package/lib/examples/A2UiContactCardExample.js +154 -411
  91. package/lib/examples/A2UiRestaurantExample.d.ts +0 -30
  92. package/lib/examples/A2UiRestaurantExample.js +114 -212
  93. package/lib/examples/A2UiViewerExample.d.ts +0 -18
  94. package/lib/examples/A2UiViewerExample.js +283 -532
  95. package/lib/examples/AgUiBackendToolRenderingExample.js +1 -1
  96. package/lib/examples/AgUiHaikuGenUiExample.d.ts +1 -1
  97. package/lib/examples/AgUiHaikuGenUiExample.js +1 -1
  98. package/lib/examples/AgUiSharedStateExample.js +2 -1
  99. package/lib/examples/AgentCheckpointsExample.js +14 -28
  100. package/lib/examples/AgentCodemodeExample.d.ts +4 -6
  101. package/lib/examples/AgentCodemodeExample.js +603 -169
  102. package/lib/examples/AgentEvalsExample.js +339 -53
  103. package/lib/examples/AgentGuardrailsExample.js +383 -66
  104. package/lib/examples/AgentHooksExample.d.ts +3 -0
  105. package/lib/examples/AgentHooksExample.js +122 -0
  106. package/lib/examples/AgentInferenceProviderExample.d.ts +3 -0
  107. package/lib/examples/AgentInferenceProviderExample.js +329 -0
  108. package/lib/examples/AgentMCPExample.d.ts +3 -0
  109. package/lib/examples/AgentMCPExample.js +481 -0
  110. package/lib/examples/AgentMemoryExample.d.ts +1 -2
  111. package/lib/examples/AgentMemoryExample.js +78 -33
  112. package/lib/examples/AgentMonitoringExample.js +261 -200
  113. package/lib/examples/AgentNotificationsExample.d.ts +1 -2
  114. package/lib/examples/AgentNotificationsExample.js +114 -33
  115. package/lib/examples/AgentOtelExample.js +32 -42
  116. package/lib/examples/AgentOutputsExample.d.ts +11 -6
  117. package/lib/examples/AgentOutputsExample.js +433 -81
  118. package/lib/examples/AgentParametersExample.d.ts +3 -0
  119. package/lib/examples/AgentParametersExample.js +248 -0
  120. package/lib/examples/AgentSandboxExample.d.ts +3 -3
  121. package/lib/examples/AgentSandboxExample.js +74 -45
  122. package/lib/examples/AgentSkillsExample.js +95 -103
  123. package/lib/examples/AgentSubagentsExample.d.ts +14 -0
  124. package/lib/examples/AgentSubagentsExample.js +228 -0
  125. package/lib/examples/AgentToolApprovalsExample.js +49 -561
  126. package/lib/examples/AgentTriggersExample.js +823 -569
  127. package/lib/examples/{AgentspecExample.d.ts → AgentspecsExample.d.ts} +2 -2
  128. package/lib/examples/AgentspecsExample.js +1096 -0
  129. package/lib/examples/ChatCustomExample.js +16 -28
  130. package/lib/examples/ChatExample.js +13 -29
  131. package/lib/examples/CopilotKitLexicalExample.js +2 -1
  132. package/lib/examples/CopilotKitNotebookExample.js +2 -1
  133. package/lib/examples/HomeExample.d.ts +15 -0
  134. package/lib/examples/HomeExample.js +77 -0
  135. package/lib/examples/Lexical2Example.js +4 -2
  136. package/lib/examples/{LexicalExample.d.ts → LexicalAgentExample.d.ts} +4 -4
  137. package/lib/examples/{LexicalExample.js → LexicalAgentExample.js} +66 -17
  138. package/lib/examples/{LexicalSidebarExample.d.ts → LexicalAgentSidebarExample.d.ts} +5 -5
  139. package/lib/examples/LexicalAgentSidebarExample.js +261 -0
  140. package/lib/examples/NotebookAgentExample.d.ts +9 -0
  141. package/lib/examples/NotebookAgentExample.js +192 -0
  142. package/lib/examples/{NotebookSidebarExample.d.ts → NotebookAgentSidebarExample.d.ts} +2 -2
  143. package/lib/examples/NotebookAgentSidebarExample.js +221 -0
  144. package/lib/examples/{DatalayerNotebookExample.d.ts → NotebookCollaborationExample.d.ts} +4 -4
  145. package/lib/examples/{DatalayerNotebookExample.js → NotebookCollaborationExample.js} +3 -3
  146. package/lib/examples/NotebookExample.d.ts +4 -7
  147. package/lib/examples/NotebookExample.js +14 -146
  148. package/lib/examples/components/AuthRequiredView.d.ts +6 -0
  149. package/lib/examples/components/AuthRequiredView.js +33 -0
  150. package/lib/examples/components/ExampleWrapper.d.ts +9 -3
  151. package/lib/examples/components/ExampleWrapper.js +45 -9
  152. package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.js +1 -1
  153. package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.js +1 -1
  154. package/lib/examples/{ag-ui → components}/haiku/index.d.ts +1 -1
  155. package/lib/examples/{ag-ui → components}/haiku/index.js +1 -1
  156. package/lib/examples/components/index.d.ts +3 -0
  157. package/lib/examples/components/index.js +4 -0
  158. package/lib/examples/{ag-ui → components}/weather/index.d.ts +1 -1
  159. package/lib/examples/{ag-ui → components}/weather/index.js +1 -1
  160. package/lib/examples/example-selector.d.ts +17 -4
  161. package/lib/examples/example-selector.js +108 -41
  162. package/lib/examples/index.d.ts +10 -6
  163. package/lib/examples/index.js +10 -6
  164. package/lib/examples/lexical/initial-content.json +6 -6
  165. package/lib/examples/main.js +257 -27
  166. package/lib/examples/utils/a2ui.d.ts +18 -0
  167. package/lib/examples/utils/a2ui.js +69 -0
  168. package/lib/examples/utils/a2uiMarkdownProvider.d.ts +7 -0
  169. package/lib/examples/utils/a2uiMarkdownProvider.js +9 -0
  170. package/lib/examples/utils/agentId.d.ts +18 -0
  171. package/lib/examples/utils/agentId.js +54 -0
  172. package/lib/examples/utils/agents/earthquake-detector.json +11 -11
  173. package/lib/examples/utils/agents/sales-forecaster.json +11 -11
  174. package/lib/examples/utils/agents/social-post-generator.json +11 -11
  175. package/lib/examples/utils/agents/stock-market.json +11 -11
  176. package/lib/examples/utils/examplesStore.js +82 -27
  177. package/lib/examples/utils/useExampleAgentRuntimesUrl.d.ts +5 -0
  178. package/lib/examples/utils/useExampleAgentRuntimesUrl.js +19 -0
  179. package/lib/hooks/index.d.ts +8 -8
  180. package/lib/hooks/index.js +7 -7
  181. package/lib/hooks/useA2A.d.ts +2 -3
  182. package/lib/hooks/useAIAgentsWebSocket.d.ts +43 -4
  183. package/lib/hooks/useAIAgentsWebSocket.js +153 -12
  184. package/lib/hooks/useAcp.d.ts +1 -2
  185. package/lib/hooks/useAgUi.d.ts +1 -1
  186. package/lib/hooks/{useAgents.d.ts → useAgentRuntimes.d.ts} +70 -4
  187. package/lib/hooks/{useAgents.js → useAgentRuntimes.js} +237 -32
  188. package/lib/hooks/useAgentsCatalog.js +1 -1
  189. package/lib/hooks/useAgentsService.d.ts +2 -2
  190. package/lib/hooks/useAgentsService.js +7 -7
  191. package/lib/hooks/useCheckpoints.js +1 -1
  192. package/lib/hooks/useConfig.d.ts +4 -1
  193. package/lib/hooks/useConfig.js +10 -3
  194. package/lib/hooks/useContextSnapshot.d.ts +9 -4
  195. package/lib/hooks/useContextSnapshot.js +9 -37
  196. package/lib/hooks/useMonitoring.js +3 -0
  197. package/lib/hooks/useSandbox.d.ts +20 -8
  198. package/lib/hooks/useSandbox.js +105 -40
  199. package/lib/hooks/useSkills.d.ts +23 -5
  200. package/lib/hooks/useSkills.js +94 -39
  201. package/lib/hooks/useToolApprovals.d.ts +60 -36
  202. package/lib/hooks/useToolApprovals.js +318 -69
  203. package/lib/hooks/useVercelAI.d.ts +1 -1
  204. package/lib/index.d.ts +2 -1
  205. package/lib/index.js +1 -0
  206. package/lib/inference/index.d.ts +0 -1
  207. package/lib/middleware/index.d.ts +0 -1
  208. package/lib/protocols/AGUIAdapter.js +6 -0
  209. package/lib/protocols/VercelAIAdapter.d.ts +7 -0
  210. package/lib/protocols/VercelAIAdapter.js +59 -7
  211. package/lib/specs/agents/agents.d.ts +21 -4
  212. package/lib/specs/agents/agents.js +2879 -316
  213. package/lib/specs/agents/index.js +3 -1
  214. package/lib/specs/benchmarks.d.ts +20 -0
  215. package/lib/specs/benchmarks.js +205 -0
  216. package/lib/specs/envvars.js +27 -20
  217. package/lib/specs/evals.d.ts +10 -9
  218. package/lib/specs/evals.js +128 -88
  219. package/lib/specs/events.d.ts +3 -10
  220. package/lib/specs/events.js +127 -84
  221. package/lib/specs/frontendTools.js +2 -2
  222. package/lib/specs/guardrails.d.ts +0 -7
  223. package/lib/specs/guardrails.js +240 -159
  224. package/lib/specs/mcpServers.js +35 -6
  225. package/lib/specs/memory.d.ts +0 -2
  226. package/lib/specs/memory.js +4 -17
  227. package/lib/specs/models.d.ts +0 -2
  228. package/lib/specs/models.js +20 -15
  229. package/lib/specs/notifications.js +102 -18
  230. package/lib/specs/outputs.js +15 -9
  231. package/lib/specs/personas.d.ts +41 -0
  232. package/lib/specs/personas.js +168 -0
  233. package/lib/specs/skills.d.ts +1 -1
  234. package/lib/specs/skills.js +23 -23
  235. package/lib/specs/teams/index.js +3 -1
  236. package/lib/specs/teams/teams.js +468 -348
  237. package/lib/specs/tools.js +4 -4
  238. package/lib/specs/triggers.js +61 -11
  239. package/lib/stores/agentRuntimeStore.d.ts +208 -0
  240. package/lib/stores/agentRuntimeStore.js +650 -0
  241. package/lib/stores/conversationStore.js +2 -2
  242. package/lib/stores/index.d.ts +1 -1
  243. package/lib/stores/index.js +1 -1
  244. package/lib/tools/adapters/copilotkit/lexicalHooks.d.ts +1 -2
  245. package/lib/tools/adapters/copilotkit/lexicalHooks.js +1 -3
  246. package/lib/tools/adapters/copilotkit/notebookHooks.d.ts +1 -2
  247. package/lib/tools/adapters/copilotkit/notebookHooks.js +1 -3
  248. package/lib/tools/index.d.ts +0 -2
  249. package/lib/tools/index.js +0 -1
  250. package/lib/types/agents-lifecycle.d.ts +18 -0
  251. package/lib/types/agents.d.ts +6 -0
  252. package/lib/types/agentspecs.d.ts +54 -1
  253. package/lib/types/benchmarks.d.ts +43 -0
  254. package/lib/types/benchmarks.js +5 -0
  255. package/lib/types/chat.d.ts +325 -8
  256. package/lib/types/context.d.ts +27 -0
  257. package/lib/types/cost.d.ts +2 -2
  258. package/lib/types/evals.d.ts +26 -17
  259. package/lib/types/index.d.ts +3 -0
  260. package/lib/types/index.js +3 -0
  261. package/lib/types/mcp.d.ts +8 -0
  262. package/lib/types/models.d.ts +2 -2
  263. package/lib/types/personas.d.ts +25 -0
  264. package/lib/types/personas.js +5 -0
  265. package/lib/types/skills.d.ts +43 -1
  266. package/lib/types/stream.d.ts +110 -0
  267. package/lib/types/stream.js +36 -0
  268. package/lib/utils/utils.d.ts +9 -5
  269. package/lib/utils/utils.js +9 -5
  270. package/package.json +19 -11
  271. package/scripts/codegen/__pycache__/generate_agents.cpython-313.pyc +0 -0
  272. package/scripts/codegen/__pycache__/generate_benchmarks.cpython-313.pyc +0 -0
  273. package/scripts/codegen/__pycache__/generate_evals.cpython-313.pyc +0 -0
  274. package/scripts/codegen/__pycache__/generate_events.cpython-313.pyc +0 -0
  275. package/scripts/codegen/__pycache__/versioning.cpython-313.pyc +0 -0
  276. package/scripts/codegen/generate_agents.py +187 -45
  277. package/scripts/codegen/generate_benchmarks.py +441 -0
  278. package/scripts/codegen/generate_evals.py +94 -16
  279. package/scripts/codegen/generate_events.py +35 -14
  280. package/scripts/codegen/generate_personas.py +319 -0
  281. package/scripts/codegen/generate_skills.py +9 -9
  282. package/scripts/sync-jupyter.sh +26 -7
  283. package/lib/api/tool-approvals.d.ts +0 -62
  284. package/lib/api/tool-approvals.js +0 -145
  285. package/lib/examples/AgentspecExample.js +0 -705
  286. package/lib/examples/LexicalSidebarExample.js +0 -163
  287. package/lib/examples/NotebookSidebarExample.js +0 -119
  288. package/lib/examples/NotebookSimpleExample.d.ts +0 -6
  289. package/lib/examples/NotebookSimpleExample.js +0 -22
  290. package/lib/examples/ag-ui/index.d.ts +0 -10
  291. package/lib/examples/ag-ui/index.js +0 -16
  292. package/lib/hooks/useAgentsRegistry.d.ts +0 -10
  293. package/lib/hooks/useAgentsRegistry.js +0 -20
  294. package/lib/stores/agentsStore.d.ts +0 -123
  295. package/lib/stores/agentsStore.js +0 -270
  296. /package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.d.ts +0 -0
  297. /package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.d.ts +0 -0
  298. /package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.d.ts +0 -0
  299. /package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.js +0 -0
@@ -0,0 +1,441 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2025-2026 Datalayer, Inc.
3
+ # Distributed under the terms of the Modified BSD License.
4
+
5
+ """
6
+ Generate Python and TypeScript code from YAML benchmark specifications.
7
+
8
+ Usage:
9
+ python generate_benchmarks.py \
10
+ --specs-dir specs/benchmarks \
11
+ --eval-specs-dir specs/evals \
12
+ --python-output agent_runtimes/specs/benchmarks.py \
13
+ --typescript-output src/specs/benchmarks.ts
14
+ """
15
+
16
+ import argparse
17
+ import sys
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ import yaml
22
+ from versioning import ensure_spec_version, version_suffix
23
+
24
# Closed vocabularies that benchmark spec fields are validated against.

# Accepted values for the required "category" field.
ALLOWED_BENCHMARK_CATEGORIES = {"Coding", "Knowledge", "Reasoning", "Agentic", "Safety"}

# Accepted values for "difficulty".
ALLOWED_DIFFICULTY = {
    "easy",
    "medium",
    "hard",
    "expert",
}

# Accepted values for "dataset_source".
ALLOWED_DATASET_SOURCE = {
    "hosted",
    "local",
    "hybrid",
}

# Accepted values for "dataset_editability".
ALLOWED_DATASET_EDITABILITY = {
    "read-only",
    "editable",
}

# Accepted values for "sdk_support".
ALLOWED_SDK_SUPPORT = {
    "none",
    "experimental",
    "stable",
}

# Accepted items inside the "evaluator_shapes" list.
ALLOWED_EVALUATOR_SHAPES = {"pass_rate", "numeric", "categorical", "error_only"}
46
+
47
+
48
def _required_str(spec: dict[str, Any], key: str) -> str:
    """Fetch ``spec[key]`` as a stripped, non-blank string.

    Args:
        spec: Benchmark spec mapping.
        key: Name of the field that must be present.

    Returns:
        The field value with surrounding whitespace removed.

    Raises:
        ValueError: If the field is absent, not a string, or only whitespace.
            The message names the spec id (or ``<unknown>``) and the field.
    """
    raw = spec.get(key)
    if isinstance(raw, str):
        cleaned = raw.strip()
        if cleaned:
            return cleaned
    raise ValueError(
        f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': missing required field '{key}'"
    )
56
+
57
+
58
def _required_int(spec: dict[str, Any], key: str) -> int:
    """Return the required integer field ``key`` from ``spec``.

    Args:
        spec: Benchmark spec mapping.
        key: Name of the field that must hold an integer.

    Returns:
        The integer value, unchanged.

    Raises:
        ValueError: If the field is absent or is not a true integer. Booleans
            are rejected even though ``bool`` subclasses ``int``.
    """
    value = spec.get(key)
    # bool is a subclass of int, so a YAML `true`/`false` would otherwise be
    # accepted here and emitted as a count.
    if isinstance(value, bool) or not isinstance(value, int):
        raise ValueError(
            f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': field '{key}' must be an integer"
        )
    return value
66
+
67
+
68
def _normalize_eval_ref(eval_ref: str) -> str:
    """Strip a trailing ``:version`` from an evaluator reference.

    A reference counts as versioned only when the text after its last colon
    contains a dot and the text before that colon is non-empty. Anything else
    is returned untouched.

    Args:
        eval_ref: Evaluator reference, either ``id`` or ``id:version``.

    Returns:
        The base id for versioned references, otherwise ``eval_ref`` itself.
    """
    base, colon, suffix = eval_ref.rpartition(":")
    looks_versioned = bool(colon) and bool(base) and "." in suffix
    return base if looks_versioned else eval_ref
76
+
77
+
78
def _required_string_list(spec: dict[str, Any], key: str) -> list[str]:
    """Fetch ``spec[key]`` as a non-empty list of stripped, non-blank strings.

    Args:
        spec: Benchmark spec mapping.
        key: Name of the list field that must be present.

    Returns:
        A new list holding each item with surrounding whitespace removed.

    Raises:
        ValueError: If the field is absent, not a list, or empty; or if any
            item is not a string or is only whitespace.
    """
    entries = spec.get(key)
    if not (isinstance(entries, list) and entries):
        raise ValueError(
            f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': missing required non-empty field '{key}'"
        )

    cleaned: list[str] = []
    for entry in entries:
        if not isinstance(entry, str) or not entry.strip():
            raise ValueError(
                f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': field '{key}' must contain non-empty strings"
            )
        cleaned.append(entry.strip())
    return cleaned
90
+
91
+
92
def _validate_benchmark_spec(
    spec: dict[str, Any], eval_ids: set[str]
) -> dict[str, Any]:
    """Validate a benchmark spec and return its checked, normalized fields.

    Required fields (``category``, ``task_count``, ``metric``, ``evaluators``)
    are read through the ``_required_*`` helpers. Optional enumerated fields
    fall back to defaults (``difficulty="medium"``, ``dataset_source="local"``,
    ``dataset_editability="read-only"``, ``sdk_support="experimental"``) and
    are then checked against their ``ALLOWED_*`` vocabulary.

    Args:
        spec: Benchmark spec mapping as loaded from YAML.
        eval_ids: Known evaluator ids. Each entry of ``evaluators`` must
            resolve to one of them after ``_normalize_eval_ref``.

    Returns:
        A dict with keys ``category``, ``task_count``, ``metric``,
        ``difficulty``, ``dataset_source``, ``dataset_editability``,
        ``sdk_support``, ``evaluators`` and ``evaluator_shapes``.

    Raises:
        ValueError: On the first failing check. The message names the spec id.
    """
    spec_id = str(spec.get("id") or "<unknown>")
    prefix = f"Invalid benchmark spec '{spec_id}'"

    category = _required_str(spec, "category")
    task_count = _required_int(spec, "task_count")
    # _required_str already rejects blank values, so no further emptiness
    # check on the metric is needed.
    metric = _required_str(spec, "metric")
    difficulty = str(spec.get("difficulty", "medium"))
    dataset_source = str(spec.get("dataset_source", "local"))
    dataset_editability = str(spec.get("dataset_editability", "read-only"))
    sdk_support = str(spec.get("sdk_support", "experimental"))
    evaluators = _required_string_list(spec, "evaluators")
    evaluator_shapes = spec.get("evaluator_shapes", [])

    if category not in ALLOWED_BENCHMARK_CATEGORIES:
        raise ValueError(
            f"{prefix}: category '{category}' not in {sorted(ALLOWED_BENCHMARK_CATEGORIES)}"
        )
    if task_count < 0:
        raise ValueError(f"{prefix}: task_count must be >= 0")

    # Remaining enumerated fields, in the order their errors are reported.
    enum_checks = (
        ("difficulty", difficulty, ALLOWED_DIFFICULTY),
        ("dataset_source", dataset_source, ALLOWED_DATASET_SOURCE),
        ("dataset_editability", dataset_editability, ALLOWED_DATASET_EDITABILITY),
        ("sdk_support", sdk_support, ALLOWED_SDK_SUPPORT),
    )
    for field, value, allowed in enum_checks:
        if value not in allowed:
            raise ValueError(f"{prefix}: {field} '{value}' not in {sorted(allowed)}")

    if not isinstance(evaluator_shapes, list):
        raise ValueError(f"{prefix}: evaluator_shapes must be a list")
    for shape in evaluator_shapes:
        # Type check first: an unhashable item (e.g. a nested list) would make
        # the set membership test raise TypeError instead of ValueError.
        if not isinstance(shape, str) or shape not in ALLOWED_EVALUATOR_SHAPES:
            raise ValueError(
                f"{prefix}: evaluator_shapes item '{shape}' not in {sorted(ALLOWED_EVALUATOR_SHAPES)}"
            )

    for evaluator_ref in evaluators:
        # References may carry a ":version" suffix; eval_ids holds bare ids.
        if _normalize_eval_ref(evaluator_ref) not in eval_ids:
            raise ValueError(
                f"{prefix}: evaluator '{evaluator_ref}' not found in eval specs"
            )

    return {
        "category": category,
        "task_count": task_count,
        "metric": metric,
        "difficulty": difficulty,
        "dataset_source": dataset_source,
        "dataset_editability": dataset_editability,
        "sdk_support": sdk_support,
        "evaluators": evaluators,
        "evaluator_shapes": evaluator_shapes,
    }
158
+
159
+
160
def _fmt_list(items: list[str]) -> str:
    """Format ``items`` as the source text of a Python list of strings.

    Items are wrapped in double quotes (for ruff compliance). Backslashes and
    double quotes inside an item are escaped so the emitted literal reads back
    as the original value.

    Args:
        items: Values to render; each is converted with ``str()``.

    Returns:
        ``"[]"`` for an empty list, otherwise ``["a", "b", ...]``.
    """
    if not items:
        return "[]"
    quoted = []
    for item in items:
        # Backslashes first, so the escapes added for quotes are not doubled.
        escaped = str(item).replace("\\", "\\\\").replace('"', '\\"')
        quoted.append(f'"{escaped}"')
    return "[" + ", ".join(quoted) + "]"
165
+
166
+
167
def _ts_list(items: list[str]) -> str:
    """Format ``items`` as the source text of a TypeScript string array.

    Items are wrapped in single quotes. Backslashes and single quotes inside
    an item are escaped so the emitted literal stays well-formed.

    Args:
        items: Values to render; each is converted with ``str()``.

    Returns:
        ``"[]"`` for an empty list, otherwise ``['a', 'b', ...]``.
    """
    if not items:
        return "[]"
    quoted = []
    for item in items:
        # Backslashes first, so the escapes added for quotes are not doubled.
        escaped = str(item).replace("\\", "\\\\").replace("'", "\\'")
        quoted.append(f"'{escaped}'")
    return "[" + ", ".join(quoted) + "]"
172
+
173
+
174
def _esc(text: str) -> str:
    """Escape ``text`` for use inside a single-quoted TypeScript string literal.

    Backslashes are escaped before single quotes; otherwise a backslash in the
    input could combine with the following character, or swallow the closing
    quote when it is the last character.

    Args:
        text: Raw text to embed.

    Returns:
        ``text`` with ``\\`` and ``'`` backslash-escaped.
    """
    return text.replace("\\", "\\\\").replace("'", "\\'")
177
+
178
+
179
def _esc_dq(text: str) -> str:
    """Escape ``text`` for use inside a double-quoted Python string literal.

    Backslashes are escaped before double quotes; otherwise a backslash in the
    input could form an unintended escape sequence, or swallow the closing
    quote when it is the last character.

    Args:
        text: Raw text to embed.

    Returns:
        ``text`` with ``\\`` and ``"`` backslash-escaped.
    """
    return text.replace("\\", "\\\\").replace('"', '\\"')
182
+
183
+
184
def load_specs(specs_dir: Path) -> list[dict[str, Any]]:
    """Load every ``*.yaml`` file under ``specs_dir``, recursing into subdirectories.

    Files are visited in sorted path order so the result is deterministic.
    Each parsed document is passed to ``ensure_spec_version`` before being
    collected.

    Args:
        specs_dir: Root directory to search.

    Returns:
        The parsed documents, one per YAML file, in sorted path order. An
        empty list is returned when no file matches.
    """
    specs = []
    for yaml_file in sorted(specs_dir.rglob("*.yaml")):
        # Read as UTF-8 explicitly rather than relying on the platform's
        # default text encoding.
        with open(yaml_file, encoding="utf-8") as f:
            spec = yaml.safe_load(f)
        # NOTE(review): presumably validates or fills in the spec's version
        # field in place; confirm against versioning.ensure_spec_version.
        ensure_spec_version(spec)
        specs.append(spec)
    return specs
193
+
194
+
195
def generate_python_code(specs: list[dict[str, Any]], eval_ids: set[str]) -> str:
    """Render the Python benchmark catalog module for ``specs``.

    Each spec is validated with ``_validate_benchmark_spec`` and emitted as a
    module-level ``BenchmarkSpec(...)`` constant. These are followed by a
    ``BENCHMARK_CATALOG`` dict keyed by benchmark id and the
    ``get_benchmark_spec`` / ``list_benchmark_specs`` helpers.

    Free-form text fields (``name``, ``description``, ``metric``, ``source``)
    are escaped with ``_esc_dq`` before being placed in string literals.

    Args:
        specs: Loaded benchmark spec mappings. Each must provide ``id``,
            ``version`` and ``name``.
        eval_ids: Known evaluator ids used to validate ``evaluators`` refs.

    Returns:
        The generated module source, ending with a newline.

    Raises:
        ValueError: Propagated from spec validation.
        KeyError: If a spec lacks ``id``, ``version`` or ``name``.
    """
    lines = [
        "# Copyright (c) 2025-2026 Datalayer, Inc.",
        "# Distributed under the terms of the Modified BSD License.",
        '"""',
        "Benchmark Catalog.",
        "",
        "Predefined evaluation benchmark configurations.",
        "",
        "This file is AUTO-GENERATED from YAML specifications.",
        "DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
        '"""',
        "",
        "from typing import Dict, List",
        "",
        "from agent_runtimes.types import BenchmarkSpec",
        "",
        "",
        "# " + "=" * 76,
        "# Benchmark Definitions",
        "# " + "=" * 76,
        "",
    ]

    # (benchmark id, constant name) pairs, reused when emitting the catalog.
    catalog_entries: list[tuple[str, str]] = []

    for spec in specs:
        benchmark_id = spec["id"]
        version = spec["version"]
        validated = _validate_benchmark_spec(spec, eval_ids)
        const_name = f"{benchmark_id.upper().replace('-', '_')}_BENCHMARK_SPEC{version_suffix(version)}"
        catalog_entries.append((benchmark_id, const_name))

        # `description: null` in YAML loads as None; treat it as missing.
        desc = _esc_dq((spec.get("description") or "").strip().replace("\n", " "))
        name = _esc_dq(str(spec["name"]))
        metric = _esc_dq(validated["metric"])
        source = _esc_dq(str(spec.get("source", "")))

        lines.extend(
            [
                f"{const_name} = BenchmarkSpec(",
                f'    id="{benchmark_id}",',
                f'    version="{version}",',
                f'    name="{name}",',
                f'    description="{desc}",',
                f'    category="{validated["category"]}",',
                f"    task_count={validated['task_count']},",
                f'    metric="{metric}",',
                f'    source="{source}",',
                f'    difficulty="{validated["difficulty"]}",',
                f"    languages={_fmt_list(spec.get('languages', []))},",
                f'    dataset_source="{validated["dataset_source"]}",',
                f"    supports_live_monitoring={str(spec.get('supports_live_monitoring', False))},",
                f"    supports_experiment_comparison={str(spec.get('supports_experiment_comparison', True))},",
                f"    evaluator_shapes={_fmt_list(validated['evaluator_shapes'])},",
                f"    evaluators={_fmt_list(validated['evaluators'])},",
                f"    recommended_windows={_fmt_list(spec.get('recommended_windows', ['1h', '6h', '24h', '7d', '30d']))},",
                f"    trace_integration={str(spec.get('trace_integration', True))},",
                f'    dataset_editability="{validated["dataset_editability"]}",',
                f'    sdk_support="{validated["sdk_support"]}",',
                ")",
                "",
            ]
        )

    lines.extend(
        [
            "# " + "=" * 76,
            "# Benchmark Catalog",
            "# " + "=" * 76,
            "",
            "BENCHMARK_CATALOG: Dict[str, BenchmarkSpec] = {",
        ]
    )
    lines.extend(
        f'    "{benchmark_id}": {const_name},'
        for benchmark_id, const_name in catalog_entries
    )
    # The emitted function bodies need real indentation to be valid Python.
    lines.extend(
        [
            "}",
            "",
            "",
            "def get_benchmark_spec(benchmark_id: str) -> BenchmarkSpec | None:",
            '    """Get a benchmark specification by ID (accepts both bare and versioned refs)."""',
            "    spec = BENCHMARK_CATALOG.get(benchmark_id)",
            "    if spec is not None:",
            "        return spec",
            "    base, _, ver = benchmark_id.rpartition(':')",
            "    if base and '.' in ver:",
            "        return BENCHMARK_CATALOG.get(base)",
            "    return None",
            "",
            "",
            "def list_benchmark_specs() -> List[BenchmarkSpec]:",
            '    """List all benchmark specifications."""',
            "    return list(BENCHMARK_CATALOG.values())",
            "",
        ]
    )
    return "\n".join(lines)
291
+
292
+
293
def generate_typescript_code(specs: list[dict[str, Any]], eval_ids: set[str]) -> str:
    """Render the TypeScript benchmark catalog module for ``specs``.

    Each spec is validated with ``_validate_benchmark_spec`` and emitted as an
    exported ``BenchmarkSpec`` constant. These are followed by a
    ``BENCHMARK_CATALOG`` record keyed by benchmark id and the
    ``getBenchmarkSpecs`` / ``getBenchmarkSpec`` helpers.

    Free-form text fields (``name``, ``description``, ``metric``, ``source``)
    are escaped with ``_esc`` before being placed in string literals.

    Args:
        specs: Loaded benchmark spec mappings. Each must provide ``id``,
            ``version`` and ``name``.
        eval_ids: Known evaluator ids used to validate ``evaluators`` refs.

    Returns:
        The generated module source, ending with a newline.

    Raises:
        ValueError: Propagated from spec validation.
        KeyError: If a spec lacks ``id``, ``version`` or ``name``.
    """
    lines = [
        "/*",
        " * Copyright (c) 2025-2026 Datalayer, Inc.",
        " * Distributed under the terms of the Modified BSD License.",
        " */",
        "",
        "/**",
        " * Benchmark Catalog",
        " *",
        " * Predefined evaluation benchmark configurations.",
        " *",
        " * This file is AUTO-GENERATED from YAML specifications.",
        " * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
        " */",
        "",
        "import type { BenchmarkSpec } from '../types';",
        "",
        "// " + "=" * 76,
        "// Benchmark Definitions",
        "// " + "=" * 76,
        "",
    ]

    # (benchmark id, constant name) pairs, reused when emitting the catalog.
    catalog_entries: list[tuple[str, str]] = []

    for spec in specs:
        benchmark_id = spec["id"]
        version = spec["version"]
        validated = _validate_benchmark_spec(spec, eval_ids)
        const_name = f"{benchmark_id.upper().replace('-', '_')}_BENCHMARK_SPEC{version_suffix(version)}"
        catalog_entries.append((benchmark_id, const_name))

        # `description: null` in YAML loads as None; treat it as missing.
        desc = _esc((spec.get("description") or "").strip().replace("\n", " "))
        name = _esc(str(spec["name"]))
        metric = _esc(validated["metric"])
        source = _esc(str(spec.get("source", "")))

        lines.extend(
            [
                f"export const {const_name}: BenchmarkSpec = {{",
                f"  id: '{benchmark_id}',",
                f"  version: '{version}',",
                f"  name: '{name}',",
                f"  description: '{desc}',",
                f"  category: '{validated['category']}',",
                f"  task_count: {validated['task_count']},",
                f"  metric: '{metric}',",
                f"  source: '{source}',",
                f"  difficulty: '{validated['difficulty']}',",
                f"  languages: {_ts_list(spec.get('languages', []))},",
                f"  dataset_source: '{validated['dataset_source']}',",
                # Python renders booleans as True/False; TypeScript needs true/false.
                f"  supports_live_monitoring: {str(spec.get('supports_live_monitoring', False)).lower()},",
                f"  supports_experiment_comparison: {str(spec.get('supports_experiment_comparison', True)).lower()},",
                f"  evaluator_shapes: {_ts_list(validated['evaluator_shapes'])},",
                f"  evaluators: {_ts_list(validated['evaluators'])},",
                f"  recommended_windows: {_ts_list(spec.get('recommended_windows', ['1h', '6h', '24h', '7d', '30d']))},",
                f"  trace_integration: {str(spec.get('trace_integration', True)).lower()},",
                f"  dataset_editability: '{validated['dataset_editability']}',",
                f"  sdk_support: '{validated['sdk_support']}',",
                "};",
                "",
            ]
        )

    lines.extend(
        [
            "// " + "=" * 76,
            "// Benchmark Catalog",
            "// " + "=" * 76,
            "",
            "export const BENCHMARK_CATALOG: Record<string, BenchmarkSpec> = {",
        ]
    )
    lines.extend(
        f"  '{benchmark_id}': {const_name},"
        for benchmark_id, const_name in catalog_entries
    )
    lines.extend(
        [
            "};",
            "",
            "export function getBenchmarkSpecs(): BenchmarkSpec[] {",
            "  return Object.values(BENCHMARK_CATALOG);",
            "}",
            "",
            "function resolveBenchmarkId(benchmarkId: string): string {",
            "  if (benchmarkId in BENCHMARK_CATALOG) return benchmarkId;",
            "  const idx = benchmarkId.lastIndexOf(':');",
            "  if (idx > 0) {",
            "    const base = benchmarkId.slice(0, idx);",
            "    if (base in BENCHMARK_CATALOG) return base;",
            "  }",
            "  return benchmarkId;",
            "}",
            "",
            "export function getBenchmarkSpec(benchmarkId: string): BenchmarkSpec | undefined {",
            "  return BENCHMARK_CATALOG[resolveBenchmarkId(benchmarkId)];",
            "}",
            "",
        ]
    )
    return "\n".join(lines)
391
+
392
+
393
def main() -> None:
    """Command-line entry point: load specs and write both generated modules.

    Reads benchmark specs from ``--specs-dir`` and eval specs from
    ``--eval-specs-dir``. The eval specs supply the evaluator ids that
    benchmark specs are validated against. The generated Python and
    TypeScript sources are written to ``--python-output`` and
    ``--typescript-output``, creating parent directories as needed.

    Exits with status 1 when an input directory is missing or when no eval
    spec with a string ``id`` is found. Validation errors raised by the
    generators are not caught here.
    """
    parser = argparse.ArgumentParser(
        description="Generate Python and TypeScript code from YAML benchmark specifications"
    )
    parser.add_argument("--specs-dir", type=Path, required=True)
    parser.add_argument("--eval-specs-dir", type=Path, required=True)
    parser.add_argument("--python-output", type=Path, required=True)
    parser.add_argument("--typescript-output", type=Path, required=True)
    args = parser.parse_args()

    # Diagnostics go to stderr so they are not mixed into captured stdout.
    if not args.specs_dir.exists():
        print(f"Error: Specs directory does not exist: {args.specs_dir}", file=sys.stderr)
        sys.exit(1)
    if not args.eval_specs_dir.exists():
        print(
            f"Error: Eval specs directory does not exist: {args.eval_specs_dir}",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Loading benchmark specs from {args.specs_dir}...")
    specs = load_specs(args.specs_dir)
    print(f"Loaded {len(specs)} benchmark specifications")
    print(f"Loading eval specs from {args.eval_specs_dir}...")
    eval_specs = load_specs(args.eval_specs_dir)
    # Only well-formed eval specs (mappings with a string id) contribute ids.
    eval_ids = {
        str(spec["id"])
        for spec in eval_specs
        if isinstance(spec, dict) and isinstance(spec.get("id"), str)
    }
    if not eval_ids:
        print(
            "Error: No eval specifications found for benchmark evaluator validation",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Generating Python code...")
    python_code = generate_python_code(specs, eval_ids)
    args.python_output.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly; spec text may contain non-ASCII characters.
    args.python_output.write_text(python_code, encoding="utf-8")
    print(f"✓ Generated {args.python_output}")

    print("Generating TypeScript code...")
    typescript_code = generate_typescript_code(specs, eval_ids)
    args.typescript_output.parent.mkdir(parents=True, exist_ok=True)
    args.typescript_output.write_text(typescript_code, encoding="utf-8")
    print(f"✓ Generated {args.typescript_output}")

    print(f"\n✓ Successfully generated code from {len(specs)} benchmark specs")


if __name__ == "__main__":
    main()
@@ -6,13 +6,14 @@
6
6
  Generate Python and TypeScript code from YAML eval specifications.
7
7
 
8
8
  Usage:
9
- python generate_evals.py \\
10
- --specs-dir specs/evals \\
11
- --python-output agent_runtimes/specs/evals.py \\
9
+ python generate_evals.py \
10
+ --specs-dir specs/evals \
11
+ --python-output agent_runtimes/specs/evals.py \
12
12
  --typescript-output src/specs/evals.ts
13
13
  """
14
14
 
15
15
  import argparse
16
+ import json
16
17
  import sys
17
18
  from pathlib import Path
18
19
  from typing import Any
@@ -20,6 +21,65 @@ from typing import Any
20
21
  import yaml
21
22
  from versioning import ensure_spec_version, version_suffix
22
23
 
24
# Values an eval spec may declare in its ``category`` field.
ALLOWED_EVAL_CATEGORIES = {
    "Comparison",
    "LLM-as-a-Judge",
    "Performance",
    "Report",
    "Span-Based",
    "Type Validation",
}

# Values accepted for ``evaluator_type``.
ALLOWED_EVALUATOR_TYPES = {"case", "report"}

# Values accepted for ``output_kind``.
ALLOWED_OUTPUT_KINDS = {
    "boolean",
    "boolean_with_reason",
    "report_curve",
    "report_table",
    "score",
    "score_and_assertion",
}
43
+
44
+
45
+ def _required_str(spec: dict[str, Any], key: str) -> str:
46
+ """Return required non-empty string key or raise with actionable context."""
47
+ value = spec.get(key)
48
+ if not isinstance(value, str) or not value.strip():
49
+ raise ValueError(
50
+ f"Invalid eval spec '{spec.get('id', '<unknown>')}': missing required field '{key}'"
51
+ )
52
+ return value.strip()
53
+
54
+
55
def _validate_eval_spec(spec: dict[str, Any]) -> dict[str, str]:
    """Check an eval spec's classification fields and return the cleaned values.

    The four fields ``category``, ``evaluator_type``, ``pydantic_class`` and
    ``output_kind`` are read through ``_required_str``; ``evaluator_type`` is
    additionally lower-cased. ``category``, ``evaluator_type`` and
    ``output_kind`` must each belong to their module-level allow-list.

    Args:
        spec: Parsed eval specification mapping.

    Returns:
        A dict with the keys ``category``, ``evaluator_type``,
        ``pydantic_class`` and ``output_kind`` holding the normalized strings.

    Raises:
        ValueError: If a required field is rejected by ``_required_str`` or a
            value is not in its allow-list.
    """
    label = str(spec.get("id") or "<unknown>")

    # Read all required fields first so a missing field is reported before
    # any allow-list violation.
    normalized = {
        "category": _required_str(spec, "category"),
        "evaluator_type": _required_str(spec, "evaluator_type").lower(),
        "pydantic_class": _required_str(spec, "pydantic_class"),
        "output_kind": _required_str(spec, "output_kind"),
    }

    # Checked in this order; the first violation raises.
    membership_rules = (
        ("category", ALLOWED_EVAL_CATEGORIES),
        ("evaluator_type", ALLOWED_EVALUATOR_TYPES),
        ("output_kind", ALLOWED_OUTPUT_KINDS),
    )
    for field, allowed in membership_rules:
        candidate = normalized[field]
        if candidate not in allowed:
            raise ValueError(
                f"Invalid eval spec '{label}': {field} '{candidate}' not in {sorted(allowed)}"
            )

    return normalized
82
+
23
83
 
24
84
  def _fmt_list(items: list[str]) -> str:
25
85
  """Format a list of strings with double quotes for ruff compliance."""
@@ -64,7 +124,7 @@ def generate_python_code(specs: list[dict[str, Any]]) -> str:
64
124
  '"""',
65
125
  "Eval Catalog.",
66
126
  "",
67
- "Predefined evaluation benchmark configurations.",
127
+ "Predefined built-in evaluator configurations.",
68
128
  "",
69
129
  "This file is AUTO-GENERATED from YAML specifications.",
70
130
  "DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
@@ -84,24 +144,33 @@ def generate_python_code(specs: list[dict[str, Any]]) -> str:
84
144
  for spec in specs:
85
145
  eval_id = spec["id"]
86
146
  version = spec["version"]
147
+ validated = _validate_eval_spec(spec)
148
+ evaluator_type = validated["evaluator_type"]
149
+ output_kind = validated["output_kind"]
150
+ pydantic_class = validated["pydantic_class"]
151
+ category = validated["category"]
87
152
  const_name = (
88
153
  f"{eval_id.upper().replace('-', '_')}_EVAL_SPEC{version_suffix(version)}"
89
154
  )
90
155
  desc = _esc_dq(spec.get("description", "").strip().replace("\n", " "))
156
+ default_config = repr(spec.get("default_config", {}))
91
157
 
92
158
  lines.extend(
93
159
  [
94
160
  f"{const_name} = EvalSpec(",
95
161
  f' id="{eval_id}",',
96
162
  f' version="{version}",',
97
- f' name="{spec["name"]}",',
163
+ f' name="{_esc_dq(spec["name"])}",',
98
164
  f' description="{desc}",',
99
- f' category="{spec["category"]}",',
100
- f" task_count={spec['task_count']},",
101
- f' metric="{spec["metric"]}",',
165
+ f' category="{category}",',
166
+ f' evaluator_type="{evaluator_type}",',
167
+ f' pydantic_class="{pydantic_class}",',
168
+ f' output_kind="{output_kind}",',
169
+ f' cost_tier="{spec.get("cost_tier", "free")}",',
170
+ f' latency="{spec.get("latency", "instant")}",',
171
+ f" requires={_fmt_list(spec.get('requires', []))},",
102
172
  f' source="{spec.get("source", "")}",',
103
- f' difficulty="{spec.get("difficulty", "medium")}",',
104
- f" languages={_fmt_list(spec.get('languages', []))},",
173
+ f" default_config={default_config},",
105
174
  ")",
106
175
  "",
107
176
  ]
@@ -159,7 +228,7 @@ def generate_typescript_code(specs: list[dict[str, Any]]) -> str:
159
228
  "/**",
160
229
  " * Eval Catalog",
161
230
  " *",
162
- " * Predefined evaluation benchmark configurations.",
231
+ " * Predefined built-in evaluator configurations.",
163
232
  " *",
164
233
  " * This file is AUTO-GENERATED from YAML specifications.",
165
234
  " * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
@@ -176,10 +245,16 @@ def generate_typescript_code(specs: list[dict[str, Any]]) -> str:
176
245
  for spec in specs:
177
246
  eval_id = spec["id"]
178
247
  version = spec["version"]
248
+ validated = _validate_eval_spec(spec)
249
+ evaluator_type = validated["evaluator_type"]
250
+ output_kind = validated["output_kind"]
251
+ pydantic_class = validated["pydantic_class"]
252
+ category = validated["category"]
179
253
  const_name = (
180
254
  f"{eval_id.upper().replace('-', '_')}_EVAL_SPEC{version_suffix(version)}"
181
255
  )
182
256
  desc = _esc(spec.get("description", "").strip().replace("\n", " "))
257
+ default_config = json.dumps(spec.get("default_config", {}), ensure_ascii=True)
183
258
 
184
259
  lines.extend(
185
260
  [
@@ -188,12 +263,15 @@ def generate_typescript_code(specs: list[dict[str, Any]]) -> str:
188
263
  f" version: '{version}',",
189
264
  f" name: '{_esc(spec['name'])}',",
190
265
  f" description: '{desc}',",
191
- f" category: '{spec['category']}',",
192
- f" task_count: {spec['task_count']},",
193
- f" metric: '{spec['metric']}',",
266
+ f" category: '{category}',",
267
+ f" evaluator_type: '{evaluator_type}',",
268
+ f" pydantic_class: '{pydantic_class}',",
269
+ f" output_kind: '{output_kind}',",
270
+ f" cost_tier: '{spec.get('cost_tier', 'free')}',",
271
+ f" latency: '{spec.get('latency', 'instant')}',",
272
+ f" requires: {_ts_list(spec.get('requires', []))},",
194
273
  f" source: '{spec.get('source', '')}',",
195
- f" difficulty: '{spec.get('difficulty', 'medium')}',",
196
- f" languages: {_ts_list(spec.get('languages', []))},",
274
+ f" default_config: {default_config},",
197
275
  "};",
198
276
  "",
199
277
  ]