@datalayer/agent-runtimes 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +182 -1
- package/lib/AgentNode.d.ts +3 -0
- package/lib/AgentNode.js +676 -0
- package/lib/App.js +1 -1
- package/lib/agent-node/themeStore.d.ts +3 -0
- package/lib/agent-node/themeStore.js +156 -0
- package/lib/agent-node-main.d.ts +1 -0
- package/lib/agent-node-main.js +14 -0
- package/lib/agents/AgentDetails.d.ts +22 -1
- package/lib/agents/AgentDetails.js +34 -47
- package/lib/api/index.d.ts +0 -1
- package/lib/api/index.js +4 -2
- package/lib/chat/Chat.d.ts +5 -106
- package/lib/chat/Chat.js +20 -14
- package/lib/chat/ChatFloating.d.ts +7 -140
- package/lib/chat/ChatFloating.js +3 -3
- package/lib/chat/ChatPopupStandalone.d.ts +8 -47
- package/lib/chat/ChatPopupStandalone.js +3 -3
- package/lib/chat/ChatSidebar.d.ts +4 -69
- package/lib/chat/ChatSidebar.js +83 -51
- package/lib/chat/ChatStandalone.d.ts +4 -54
- package/lib/chat/ChatStandalone.js +3 -3
- package/lib/chat/base/ChatBase.js +1414 -174
- package/lib/chat/display/FloatingBrandButton.js +8 -1
- package/lib/chat/header/ChatHeader.d.ts +3 -1
- package/lib/chat/header/ChatHeader.js +15 -12
- package/lib/chat/header/ChatHeaderBase.d.ts +30 -5
- package/lib/chat/header/ChatHeaderBase.js +41 -16
- package/lib/chat/indicators/McpStatusIndicator.d.ts +7 -4
- package/lib/chat/indicators/McpStatusIndicator.js +7 -32
- package/lib/chat/indicators/SandboxStatusIndicator.d.ts +4 -1
- package/lib/chat/indicators/SandboxStatusIndicator.js +91 -56
- package/lib/chat/indicators/SkillsStatusIndicator.d.ts +7 -0
- package/lib/chat/indicators/SkillsStatusIndicator.js +88 -0
- package/lib/chat/indicators/index.d.ts +1 -0
- package/lib/chat/indicators/index.js +1 -0
- package/lib/chat/messages/ChatMessageList.d.ts +1 -1
- package/lib/chat/messages/ChatMessageList.js +154 -114
- package/lib/chat/messages/ChatMessages.js +6 -2
- package/lib/chat/prompt/InputFooter.d.ts +21 -6
- package/lib/chat/prompt/InputFooter.js +76 -20
- package/lib/chat/prompt/InputPrompt.d.ts +5 -1
- package/lib/chat/prompt/InputPrompt.js +4 -4
- package/lib/chat/prompt/InputPromptFooter.d.ts +3 -1
- package/lib/chat/prompt/InputPromptFooter.js +3 -3
- package/lib/chat/prompt/InputPromptLexical.d.ts +3 -1
- package/lib/chat/prompt/InputPromptLexical.js +12 -5
- package/lib/chat/prompt/InputPromptText.d.ts +3 -1
- package/lib/chat/prompt/InputPromptText.js +2 -2
- package/lib/chat/tools/ToolApprovalBanner.js +1 -1
- package/lib/chat/tools/ToolCallDisplay.d.ts +3 -1
- package/lib/chat/tools/ToolCallDisplay.js +2 -2
- package/lib/chat/usage/TokenUsageBar.js +20 -2
- package/lib/client/AgentRuntimesClientContext.d.ts +53 -0
- package/lib/client/AgentRuntimesClientContext.js +55 -0
- package/lib/client/AgentsMixin.d.ts +0 -18
- package/lib/client/AgentsMixin.js +20 -30
- package/lib/client/IAgentRuntimesClient.d.ts +215 -0
- package/lib/client/IAgentRuntimesClient.js +5 -0
- package/lib/client/SdkAgentRuntimesClient.d.ts +151 -0
- package/lib/client/SdkAgentRuntimesClient.js +134 -0
- package/lib/client/index.d.ts +4 -1
- package/lib/client/index.js +3 -1
- package/lib/components/NotificationEventCard.js +5 -1
- package/lib/config/AgentConfiguration.d.ts +22 -0
- package/lib/config/AgentConfiguration.js +319 -64
- package/lib/context/ContextDistribution.d.ts +3 -1
- package/lib/context/ContextDistribution.js +8 -27
- package/lib/context/ContextInspector.d.ts +3 -1
- package/lib/context/ContextInspector.js +19 -67
- package/lib/context/ContextPanel.d.ts +3 -1
- package/lib/context/ContextPanel.js +104 -64
- package/lib/context/ContextUsage.d.ts +3 -1
- package/lib/context/ContextUsage.js +3 -3
- package/lib/context/CostTracker.d.ts +9 -3
- package/lib/context/CostTracker.js +26 -47
- package/lib/context/CostUsageChart.d.ts +12 -0
- package/lib/context/CostUsageChart.js +378 -0
- package/lib/context/GraphFlowChart.d.ts +16 -0
- package/lib/context/GraphFlowChart.js +182 -0
- package/lib/context/TokenUsageChart.d.ts +8 -1
- package/lib/context/TokenUsageChart.js +349 -211
- package/lib/context/TurnGraphChart.d.ts +39 -0
- package/lib/context/TurnGraphChart.js +538 -0
- package/lib/context/otelWsPool.d.ts +20 -0
- package/lib/context/otelWsPool.js +69 -0
- package/lib/examples/A2UiComponentGalleryExample.d.ts +0 -17
- package/lib/examples/A2UiComponentGalleryExample.js +315 -522
- package/lib/examples/A2UiContactCardExample.d.ts +0 -18
- package/lib/examples/A2UiContactCardExample.js +154 -411
- package/lib/examples/A2UiRestaurantExample.d.ts +0 -30
- package/lib/examples/A2UiRestaurantExample.js +114 -212
- package/lib/examples/A2UiViewerExample.d.ts +0 -18
- package/lib/examples/A2UiViewerExample.js +283 -532
- package/lib/examples/AgUiBackendToolRenderingExample.js +1 -1
- package/lib/examples/AgUiHaikuGenUiExample.d.ts +1 -1
- package/lib/examples/AgUiHaikuGenUiExample.js +1 -1
- package/lib/examples/AgUiSharedStateExample.js +2 -1
- package/lib/examples/AgentCheckpointsExample.js +14 -28
- package/lib/examples/AgentCodemodeExample.d.ts +4 -6
- package/lib/examples/AgentCodemodeExample.js +603 -169
- package/lib/examples/AgentEvalsExample.js +339 -53
- package/lib/examples/AgentGuardrailsExample.js +383 -66
- package/lib/examples/AgentHooksExample.d.ts +3 -0
- package/lib/examples/AgentHooksExample.js +122 -0
- package/lib/examples/AgentInferenceProviderExample.d.ts +3 -0
- package/lib/examples/AgentInferenceProviderExample.js +329 -0
- package/lib/examples/AgentMCPExample.d.ts +3 -0
- package/lib/examples/AgentMCPExample.js +481 -0
- package/lib/examples/AgentMemoryExample.d.ts +1 -2
- package/lib/examples/AgentMemoryExample.js +78 -33
- package/lib/examples/AgentMonitoringExample.js +261 -200
- package/lib/examples/AgentNotificationsExample.d.ts +1 -2
- package/lib/examples/AgentNotificationsExample.js +114 -33
- package/lib/examples/AgentOtelExample.js +32 -42
- package/lib/examples/AgentOutputsExample.d.ts +11 -6
- package/lib/examples/AgentOutputsExample.js +433 -81
- package/lib/examples/AgentParametersExample.d.ts +3 -0
- package/lib/examples/AgentParametersExample.js +248 -0
- package/lib/examples/AgentSandboxExample.d.ts +3 -3
- package/lib/examples/AgentSandboxExample.js +74 -45
- package/lib/examples/AgentSkillsExample.js +95 -103
- package/lib/examples/AgentSubagentsExample.d.ts +14 -0
- package/lib/examples/AgentSubagentsExample.js +228 -0
- package/lib/examples/AgentToolApprovalsExample.js +49 -561
- package/lib/examples/AgentTriggersExample.js +823 -569
- package/lib/examples/{AgentspecExample.d.ts → AgentspecsExample.d.ts} +2 -2
- package/lib/examples/AgentspecsExample.js +1096 -0
- package/lib/examples/ChatCustomExample.js +16 -28
- package/lib/examples/ChatExample.js +13 -29
- package/lib/examples/CopilotKitLexicalExample.js +2 -1
- package/lib/examples/CopilotKitNotebookExample.js +2 -1
- package/lib/examples/HomeExample.d.ts +15 -0
- package/lib/examples/HomeExample.js +77 -0
- package/lib/examples/Lexical2Example.js +4 -2
- package/lib/examples/{LexicalExample.d.ts → LexicalAgentExample.d.ts} +4 -4
- package/lib/examples/{LexicalExample.js → LexicalAgentExample.js} +66 -17
- package/lib/examples/{LexicalSidebarExample.d.ts → LexicalAgentSidebarExample.d.ts} +5 -5
- package/lib/examples/LexicalAgentSidebarExample.js +261 -0
- package/lib/examples/NotebookAgentExample.d.ts +9 -0
- package/lib/examples/NotebookAgentExample.js +192 -0
- package/lib/examples/{NotebookSidebarExample.d.ts → NotebookAgentSidebarExample.d.ts} +2 -2
- package/lib/examples/NotebookAgentSidebarExample.js +221 -0
- package/lib/examples/{DatalayerNotebookExample.d.ts → NotebookCollaborationExample.d.ts} +4 -4
- package/lib/examples/{DatalayerNotebookExample.js → NotebookCollaborationExample.js} +3 -3
- package/lib/examples/NotebookExample.d.ts +4 -7
- package/lib/examples/NotebookExample.js +14 -146
- package/lib/examples/components/AuthRequiredView.d.ts +6 -0
- package/lib/examples/components/AuthRequiredView.js +33 -0
- package/lib/examples/components/ExampleWrapper.d.ts +9 -3
- package/lib/examples/components/ExampleWrapper.js +45 -9
- package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.js +1 -1
- package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.js +1 -1
- package/lib/examples/{ag-ui → components}/haiku/index.d.ts +1 -1
- package/lib/examples/{ag-ui → components}/haiku/index.js +1 -1
- package/lib/examples/components/index.d.ts +3 -0
- package/lib/examples/components/index.js +4 -0
- package/lib/examples/{ag-ui → components}/weather/index.d.ts +1 -1
- package/lib/examples/{ag-ui → components}/weather/index.js +1 -1
- package/lib/examples/example-selector.d.ts +17 -4
- package/lib/examples/example-selector.js +108 -41
- package/lib/examples/index.d.ts +10 -6
- package/lib/examples/index.js +10 -6
- package/lib/examples/lexical/initial-content.json +6 -6
- package/lib/examples/main.js +257 -27
- package/lib/examples/utils/a2ui.d.ts +18 -0
- package/lib/examples/utils/a2ui.js +69 -0
- package/lib/examples/utils/a2uiMarkdownProvider.d.ts +7 -0
- package/lib/examples/utils/a2uiMarkdownProvider.js +9 -0
- package/lib/examples/utils/agentId.d.ts +18 -0
- package/lib/examples/utils/agentId.js +54 -0
- package/lib/examples/utils/agents/earthquake-detector.json +11 -11
- package/lib/examples/utils/agents/sales-forecaster.json +11 -11
- package/lib/examples/utils/agents/social-post-generator.json +11 -11
- package/lib/examples/utils/agents/stock-market.json +11 -11
- package/lib/examples/utils/examplesStore.js +82 -27
- package/lib/examples/utils/useExampleAgentRuntimesUrl.d.ts +5 -0
- package/lib/examples/utils/useExampleAgentRuntimesUrl.js +19 -0
- package/lib/hooks/index.d.ts +8 -8
- package/lib/hooks/index.js +7 -7
- package/lib/hooks/useA2A.d.ts +2 -3
- package/lib/hooks/useAIAgentsWebSocket.d.ts +43 -4
- package/lib/hooks/useAIAgentsWebSocket.js +153 -12
- package/lib/hooks/useAcp.d.ts +1 -2
- package/lib/hooks/useAgUi.d.ts +1 -1
- package/lib/hooks/{useAgents.d.ts → useAgentRuntimes.d.ts} +70 -4
- package/lib/hooks/{useAgents.js → useAgentRuntimes.js} +237 -32
- package/lib/hooks/useAgentsCatalog.js +1 -1
- package/lib/hooks/useAgentsService.d.ts +2 -2
- package/lib/hooks/useAgentsService.js +7 -7
- package/lib/hooks/useCheckpoints.js +1 -1
- package/lib/hooks/useConfig.d.ts +4 -1
- package/lib/hooks/useConfig.js +10 -3
- package/lib/hooks/useContextSnapshot.d.ts +9 -4
- package/lib/hooks/useContextSnapshot.js +9 -37
- package/lib/hooks/useMonitoring.js +3 -0
- package/lib/hooks/useSandbox.d.ts +20 -8
- package/lib/hooks/useSandbox.js +105 -40
- package/lib/hooks/useSkills.d.ts +23 -5
- package/lib/hooks/useSkills.js +94 -39
- package/lib/hooks/useToolApprovals.d.ts +60 -36
- package/lib/hooks/useToolApprovals.js +318 -69
- package/lib/hooks/useVercelAI.d.ts +1 -1
- package/lib/index.d.ts +2 -1
- package/lib/index.js +1 -0
- package/lib/inference/index.d.ts +0 -1
- package/lib/middleware/index.d.ts +0 -1
- package/lib/protocols/AGUIAdapter.js +6 -0
- package/lib/protocols/VercelAIAdapter.d.ts +7 -0
- package/lib/protocols/VercelAIAdapter.js +59 -7
- package/lib/specs/agents/agents.d.ts +21 -4
- package/lib/specs/agents/agents.js +2879 -316
- package/lib/specs/agents/index.js +3 -1
- package/lib/specs/benchmarks.d.ts +20 -0
- package/lib/specs/benchmarks.js +205 -0
- package/lib/specs/envvars.js +27 -20
- package/lib/specs/evals.d.ts +10 -9
- package/lib/specs/evals.js +128 -88
- package/lib/specs/events.d.ts +3 -10
- package/lib/specs/events.js +127 -84
- package/lib/specs/frontendTools.js +2 -2
- package/lib/specs/guardrails.d.ts +0 -7
- package/lib/specs/guardrails.js +240 -159
- package/lib/specs/mcpServers.js +35 -6
- package/lib/specs/memory.d.ts +0 -2
- package/lib/specs/memory.js +4 -17
- package/lib/specs/models.d.ts +0 -2
- package/lib/specs/models.js +20 -15
- package/lib/specs/notifications.js +102 -18
- package/lib/specs/outputs.js +15 -9
- package/lib/specs/personas.d.ts +41 -0
- package/lib/specs/personas.js +168 -0
- package/lib/specs/skills.d.ts +1 -1
- package/lib/specs/skills.js +23 -23
- package/lib/specs/teams/index.js +3 -1
- package/lib/specs/teams/teams.js +468 -348
- package/lib/specs/tools.js +4 -4
- package/lib/specs/triggers.js +61 -11
- package/lib/stores/agentRuntimeStore.d.ts +208 -0
- package/lib/stores/agentRuntimeStore.js +650 -0
- package/lib/stores/conversationStore.js +2 -2
- package/lib/stores/index.d.ts +1 -1
- package/lib/stores/index.js +1 -1
- package/lib/tools/adapters/copilotkit/lexicalHooks.d.ts +1 -2
- package/lib/tools/adapters/copilotkit/lexicalHooks.js +1 -3
- package/lib/tools/adapters/copilotkit/notebookHooks.d.ts +1 -2
- package/lib/tools/adapters/copilotkit/notebookHooks.js +1 -3
- package/lib/tools/index.d.ts +0 -2
- package/lib/tools/index.js +0 -1
- package/lib/types/agents-lifecycle.d.ts +18 -0
- package/lib/types/agents.d.ts +6 -0
- package/lib/types/agentspecs.d.ts +54 -1
- package/lib/types/benchmarks.d.ts +43 -0
- package/lib/types/benchmarks.js +5 -0
- package/lib/types/chat.d.ts +325 -8
- package/lib/types/context.d.ts +27 -0
- package/lib/types/cost.d.ts +2 -2
- package/lib/types/evals.d.ts +26 -17
- package/lib/types/index.d.ts +3 -0
- package/lib/types/index.js +3 -0
- package/lib/types/mcp.d.ts +8 -0
- package/lib/types/models.d.ts +2 -2
- package/lib/types/personas.d.ts +25 -0
- package/lib/types/personas.js +5 -0
- package/lib/types/skills.d.ts +43 -1
- package/lib/types/stream.d.ts +110 -0
- package/lib/types/stream.js +36 -0
- package/lib/utils/utils.d.ts +9 -5
- package/lib/utils/utils.js +9 -5
- package/package.json +19 -11
- package/scripts/codegen/__pycache__/generate_agents.cpython-313.pyc +0 -0
- package/scripts/codegen/__pycache__/generate_benchmarks.cpython-313.pyc +0 -0
- package/scripts/codegen/__pycache__/generate_evals.cpython-313.pyc +0 -0
- package/scripts/codegen/__pycache__/generate_events.cpython-313.pyc +0 -0
- package/scripts/codegen/__pycache__/versioning.cpython-313.pyc +0 -0
- package/scripts/codegen/generate_agents.py +187 -45
- package/scripts/codegen/generate_benchmarks.py +441 -0
- package/scripts/codegen/generate_evals.py +94 -16
- package/scripts/codegen/generate_events.py +35 -14
- package/scripts/codegen/generate_personas.py +319 -0
- package/scripts/codegen/generate_skills.py +9 -9
- package/scripts/sync-jupyter.sh +26 -7
- package/lib/api/tool-approvals.d.ts +0 -62
- package/lib/api/tool-approvals.js +0 -145
- package/lib/examples/AgentspecExample.js +0 -705
- package/lib/examples/LexicalSidebarExample.js +0 -163
- package/lib/examples/NotebookSidebarExample.js +0 -119
- package/lib/examples/NotebookSimpleExample.d.ts +0 -6
- package/lib/examples/NotebookSimpleExample.js +0 -22
- package/lib/examples/ag-ui/index.d.ts +0 -10
- package/lib/examples/ag-ui/index.js +0 -16
- package/lib/hooks/useAgentsRegistry.d.ts +0 -10
- package/lib/hooks/useAgentsRegistry.js +0 -20
- package/lib/stores/agentsStore.d.ts +0 -123
- package/lib/stores/agentsStore.js +0 -270
- /package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.d.ts +0 -0
- /package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.d.ts +0 -0
- /package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.d.ts +0 -0
- /package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.js +0 -0
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Copyright (c) 2025-2026 Datalayer, Inc.
|
|
3
|
+
# Distributed under the terms of the Modified BSD License.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Generate Python and TypeScript code from YAML benchmark specifications.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python generate_benchmarks.py \
|
|
10
|
+
--specs-dir specs/benchmarks \
|
|
11
|
+
--eval-specs-dir specs/evals \
|
|
12
|
+
--python-output agent_runtimes/specs/benchmarks.py \
|
|
13
|
+
--typescript-output src/specs/benchmarks.ts
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
import yaml
|
|
22
|
+
from versioning import ensure_spec_version, version_suffix
|
|
23
|
+
|
|
24
|
+
# Closed vocabularies that benchmark spec fields are validated against.
# They are plain sets so membership tests are O(1); error messages sort
# them before printing so the output is deterministic.

# Accepted values for the spec's `category` field.
ALLOWED_BENCHMARK_CATEGORIES = {"Coding", "Knowledge", "Reasoning", "Agentic", "Safety"}

# Accepted values for the spec's `difficulty` field.
ALLOWED_DIFFICULTY = {
    "easy",
    "medium",
    "hard",
    "expert",
}

# Accepted values for the spec's `dataset_source` field.
ALLOWED_DATASET_SOURCE = {
    "hosted",
    "local",
    "hybrid",
}

# Accepted values for the spec's `dataset_editability` field.
ALLOWED_DATASET_EDITABILITY = {
    "read-only",
    "editable",
}

# Accepted values for the spec's `sdk_support` field.
ALLOWED_SDK_SUPPORT = {
    "none",
    "experimental",
    "stable",
}

# Accepted entries of the spec's `evaluator_shapes` list.
ALLOWED_EVALUATOR_SHAPES = {"pass_rate", "numeric", "categorical", "error_only"}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _required_str(spec: dict[str, Any], key: str) -> str:
|
|
49
|
+
"""Return required non-empty string key or raise with actionable context."""
|
|
50
|
+
value = spec.get(key)
|
|
51
|
+
if not isinstance(value, str) or not value.strip():
|
|
52
|
+
raise ValueError(
|
|
53
|
+
f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': missing required field '{key}'"
|
|
54
|
+
)
|
|
55
|
+
return value.strip()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _required_int(spec: dict[str, Any], key: str) -> int:
|
|
59
|
+
"""Return required integer key or raise with actionable context."""
|
|
60
|
+
value = spec.get(key)
|
|
61
|
+
if not isinstance(value, int):
|
|
62
|
+
raise ValueError(
|
|
63
|
+
f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': field '{key}' must be an integer"
|
|
64
|
+
)
|
|
65
|
+
return value
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _normalize_eval_ref(eval_ref: str) -> str:
|
|
69
|
+
"""Normalize evaluator references from id:version to base id."""
|
|
70
|
+
if ":" not in eval_ref:
|
|
71
|
+
return eval_ref
|
|
72
|
+
base, _, suffix = eval_ref.rpartition(":")
|
|
73
|
+
if base and "." in suffix:
|
|
74
|
+
return base
|
|
75
|
+
return eval_ref
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _required_string_list(spec: dict[str, Any], key: str) -> list[str]:
|
|
79
|
+
"""Return required non-empty list of strings or raise."""
|
|
80
|
+
value = spec.get(key)
|
|
81
|
+
if not isinstance(value, list) or not value:
|
|
82
|
+
raise ValueError(
|
|
83
|
+
f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': missing required non-empty field '{key}'"
|
|
84
|
+
)
|
|
85
|
+
if not all(isinstance(item, str) and item.strip() for item in value):
|
|
86
|
+
raise ValueError(
|
|
87
|
+
f"Invalid benchmark spec '{spec.get('id', '<unknown>')}': field '{key}' must contain non-empty strings"
|
|
88
|
+
)
|
|
89
|
+
return [item.strip() for item in value]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _validate_benchmark_spec(
    spec: dict[str, Any], eval_ids: set[str]
) -> dict[str, Any]:
    """Validate benchmark spec fields and evaluator dependencies.

    Args:
        spec: Parsed benchmark specification mapping.
        eval_ids: Base ids of the known evaluator specs; every entry of the
            spec's ``evaluators`` list must resolve to one of them after any
            ``:version`` suffix is stripped.

    Returns:
        A dict with the validated/defaulted values for ``category``,
        ``task_count``, ``metric``, ``difficulty``, ``dataset_source``,
        ``dataset_editability``, ``sdk_support``, ``evaluators`` and
        ``evaluator_shapes``.

    Raises:
        ValueError: If a required field is missing or malformed, an
            enumerated field holds a value outside its allowed set,
            ``task_count`` is negative, or an evaluator is unknown.
    """
    spec_id = str(spec.get("id") or "<unknown>")
    category = _required_str(spec, "category")
    task_count = _required_int(spec, "task_count")
    # _required_str already guarantees a non-empty value, so no further
    # emptiness check on `metric` is needed.
    metric = _required_str(spec, "metric")
    difficulty = str(spec.get("difficulty", "medium"))
    dataset_source = str(spec.get("dataset_source", "local"))
    dataset_editability = str(spec.get("dataset_editability", "read-only"))
    sdk_support = str(spec.get("sdk_support", "experimental"))
    evaluators = _required_string_list(spec, "evaluators")
    evaluator_shapes = spec.get("evaluator_shapes", [])

    if category not in ALLOWED_BENCHMARK_CATEGORIES:
        raise ValueError(
            f"Invalid benchmark spec '{spec_id}': category '{category}' not in {sorted(ALLOWED_BENCHMARK_CATEGORIES)}"
        )
    if task_count < 0:
        raise ValueError(f"Invalid benchmark spec '{spec_id}': task_count must be >= 0")

    # The remaining enumerated fields share one message format; they are
    # checked in a fixed order so the first reported error is stable.
    enum_checks = (
        ("difficulty", difficulty, ALLOWED_DIFFICULTY),
        ("dataset_source", dataset_source, ALLOWED_DATASET_SOURCE),
        ("dataset_editability", dataset_editability, ALLOWED_DATASET_EDITABILITY),
        ("sdk_support", sdk_support, ALLOWED_SDK_SUPPORT),
    )
    for field_name, field_value, allowed in enum_checks:
        if field_value not in allowed:
            raise ValueError(
                f"Invalid benchmark spec '{spec_id}': {field_name} '{field_value}' not in {sorted(allowed)}"
            )

    if not isinstance(evaluator_shapes, list):
        raise ValueError(
            f"Invalid benchmark spec '{spec_id}': evaluator_shapes must be a list"
        )
    for shape in evaluator_shapes:
        # The isinstance guard keeps unhashable entries (e.g. a nested list
        # or mapping) from raising a bare TypeError in the set lookup.
        if not isinstance(shape, str) or shape not in ALLOWED_EVALUATOR_SHAPES:
            raise ValueError(
                f"Invalid benchmark spec '{spec_id}': evaluator_shapes item '{shape}' not in {sorted(ALLOWED_EVALUATOR_SHAPES)}"
            )

    for evaluator_ref in evaluators:
        evaluator_id = _normalize_eval_ref(evaluator_ref)
        if evaluator_id not in eval_ids:
            raise ValueError(
                f"Invalid benchmark spec '{spec_id}': evaluator '{evaluator_ref}' not found in eval specs"
            )

    return {
        "category": category,
        "task_count": task_count,
        "metric": metric,
        "difficulty": difficulty,
        "dataset_source": dataset_source,
        "dataset_editability": dataset_editability,
        "sdk_support": sdk_support,
        "evaluators": evaluators,
        "evaluator_shapes": evaluator_shapes,
    }
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _fmt_list(items: list[str]) -> str:
|
|
161
|
+
"""Format a list of strings with double quotes for ruff compliance."""
|
|
162
|
+
if not items:
|
|
163
|
+
return "[]"
|
|
164
|
+
return "[" + ", ".join(f'"{item}"' for item in items) + "]"
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _ts_list(items: list[str]) -> str:
|
|
168
|
+
"""Format a list of strings for TypeScript."""
|
|
169
|
+
if not items:
|
|
170
|
+
return "[]"
|
|
171
|
+
return "[" + ", ".join(f"'{item}'" for item in items) + "]"
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _esc(text: str) -> str:
|
|
175
|
+
"""Escape single quotes for TypeScript string literals."""
|
|
176
|
+
return text.replace("'", "\\'")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _esc_dq(text: str) -> str:
|
|
180
|
+
"""Escape double quotes for Python string literals."""
|
|
181
|
+
return text.replace('"', '\\"')
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def load_specs(specs_dir: Path) -> list[dict[str, Any]]:
    """Load all YAML specifications from a directory (including subdirectories).

    Files are visited in sorted path order so the generated output is
    deterministic across runs.

    Args:
        specs_dir: Root directory searched recursively for ``*.yaml`` files.

    Returns:
        One parsed document per file, in sorted path order.
    """
    specs = []
    for yaml_file in sorted(specs_dir.rglob("*.yaml")):
        # Decode explicitly as UTF-8: without an encoding, open() uses the
        # locale default, which breaks non-ASCII specs on some platforms.
        with open(yaml_file, encoding="utf-8") as f:
            spec = yaml.safe_load(f)
        # NOTE(review): ensure_spec_version comes from the local `versioning`
        # module; presumably it validates/fills spec["version"] in place —
        # confirm, including how it handles an empty (None) document.
        ensure_spec_version(spec)
        specs.append(spec)
    return specs
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def generate_python_code(specs: list[dict[str, Any]], eval_ids: set[str]) -> str:
    """Generate Python code from benchmark specifications.

    Emits one ``BenchmarkSpec(...)`` constant per spec, a ``BENCHMARK_CATALOG``
    dict keyed by benchmark id, and the ``get_benchmark_spec`` /
    ``list_benchmark_specs`` helpers.

    Args:
        specs: Parsed benchmark specs; each needs ``id``, ``version`` and
            ``name`` plus the fields checked by ``_validate_benchmark_spec``.
        eval_ids: Known evaluator ids used to validate each spec's evaluators.

    Returns:
        The full source text of the generated Python module.

    Raises:
        KeyError: If a spec lacks ``id``, ``version`` or ``name``.
        ValueError: If a spec fails validation.
    """
    lines = [
        "# Copyright (c) 2025-2026 Datalayer, Inc.",
        "# Distributed under the terms of the Modified BSD License.",
        '"""',
        "Benchmark Catalog.",
        "",
        "Predefined evaluation benchmark configurations.",
        "",
        "This file is AUTO-GENERATED from YAML specifications.",
        "DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
        '"""',
        "",
        "from typing import Dict, List",
        "",
        "from agent_runtimes.types import BenchmarkSpec",
        "",
        "",
        "# " + "=" * 76,
        "# Benchmark Definitions",
        "# " + "=" * 76,
        "",
    ]

    # (benchmark id, constant name) pairs, reused when emitting the catalog so
    # the constant name is derived in exactly one place.
    catalog_entries: list[tuple[str, str]] = []

    for spec in specs:
        benchmark_id = spec["id"]
        version = spec["version"]
        validated = _validate_benchmark_spec(spec, eval_ids)
        const_name = f"{benchmark_id.upper().replace('-', '_')}_BENCHMARK_SPEC{version_suffix(version)}"
        catalog_entries.append((benchmark_id, const_name))

        # Free-text fields are escaped so quotes in the YAML cannot break the
        # generated double-quoted literals (description was already escaped;
        # name, metric and source now match it).
        name = _esc_dq(str(spec["name"]))
        desc = _esc_dq(spec.get("description", "").strip().replace("\n", " "))
        metric = _esc_dq(validated["metric"])
        source = _esc_dq(str(spec.get("source", "")))

        lines.extend(
            [
                f"{const_name} = BenchmarkSpec(",
                f'    id="{benchmark_id}",',
                f'    version="{version}",',
                f'    name="{name}",',
                f'    description="{desc}",',
                f'    category="{validated["category"]}",',
                f"    task_count={validated['task_count']},",
                f'    metric="{metric}",',
                f'    source="{source}",',
                f'    difficulty="{validated["difficulty"]}",',
                f"    languages={_fmt_list(spec.get('languages', []))},",
                f'    dataset_source="{validated["dataset_source"]}",',
                f"    supports_live_monitoring={str(spec.get('supports_live_monitoring', False))},",
                f"    supports_experiment_comparison={str(spec.get('supports_experiment_comparison', True))},",
                f"    evaluator_shapes={_fmt_list(validated['evaluator_shapes'])},",
                f"    evaluators={_fmt_list(validated['evaluators'])},",
                f"    recommended_windows={_fmt_list(spec.get('recommended_windows', ['1h', '6h', '24h', '7d', '30d']))},",
                f"    trace_integration={str(spec.get('trace_integration', True))},",
                f'    dataset_editability="{validated["dataset_editability"]}",',
                f'    sdk_support="{validated["sdk_support"]}",',
                ")",
                "",
            ]
        )

    lines.extend(
        [
            "# " + "=" * 76,
            "# Benchmark Catalog",
            "# " + "=" * 76,
            "",
            "BENCHMARK_CATALOG: Dict[str, BenchmarkSpec] = {",
        ]
    )
    lines.extend(
        f'    "{benchmark_id}": {const_name},'
        for benchmark_id, const_name in catalog_entries
    )
    # The emitted function bodies need real nested indentation (4/8 spaces)
    # to be valid Python.
    lines.extend(
        [
            "}",
            "",
            "",
            "def get_benchmark_spec(benchmark_id: str) -> BenchmarkSpec | None:",
            '    """Get a benchmark specification by ID (accepts both bare and versioned refs)."""',
            "    spec = BENCHMARK_CATALOG.get(benchmark_id)",
            "    if spec is not None:",
            "        return spec",
            "    base, _, ver = benchmark_id.rpartition(':')",
            "    if base and '.' in ver:",
            "        return BENCHMARK_CATALOG.get(base)",
            "    return None",
            "",
            "",
            "def list_benchmark_specs() -> List[BenchmarkSpec]:",
            '    """List all benchmark specifications."""',
            "    return list(BENCHMARK_CATALOG.values())",
            "",
        ]
    )
    return "\n".join(lines)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def generate_typescript_code(specs: list[dict[str, Any]], eval_ids: set[str]) -> str:
    """Generate TypeScript code from benchmark specifications.

    Emits one exported ``BenchmarkSpec`` constant per spec, an exported
    ``BENCHMARK_CATALOG`` record keyed by benchmark id, and the
    ``getBenchmarkSpecs`` / ``getBenchmarkSpec`` helpers.

    Args:
        specs: Parsed benchmark specs; each needs ``id``, ``version`` and
            ``name`` plus the fields checked by ``_validate_benchmark_spec``.
        eval_ids: Known evaluator ids used to validate each spec's evaluators.

    Returns:
        The full source text of the generated TypeScript module.

    Raises:
        KeyError: If a spec lacks ``id``, ``version`` or ``name``.
        ValueError: If a spec fails validation.
    """
    lines = [
        "/*",
        " * Copyright (c) 2025-2026 Datalayer, Inc.",
        " * Distributed under the terms of the Modified BSD License.",
        " */",
        "",
        "/**",
        " * Benchmark Catalog",
        " *",
        " * Predefined evaluation benchmark configurations.",
        " *",
        " * This file is AUTO-GENERATED from YAML specifications.",
        " * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
        " */",
        "",
        "import type { BenchmarkSpec } from '../types';",
        "",
        "// " + "=" * 76,
        "// Benchmark Definitions",
        "// " + "=" * 76,
        "",
    ]

    # (benchmark id, constant name) pairs, reused when emitting the catalog so
    # the constant name is derived in exactly one place.
    catalog_entries: list[tuple[str, str]] = []

    for spec in specs:
        benchmark_id = spec["id"]
        version = spec["version"]
        validated = _validate_benchmark_spec(spec, eval_ids)
        const_name = f"{benchmark_id.upper().replace('-', '_')}_BENCHMARK_SPEC{version_suffix(version)}"
        catalog_entries.append((benchmark_id, const_name))

        # Free-text fields are escaped so quotes in the YAML cannot break the
        # generated single-quoted literals (name and description were already
        # escaped; metric and source now match them).
        desc = _esc(spec.get("description", "").strip().replace("\n", " "))
        metric = _esc(validated["metric"])
        source = _esc(str(spec.get("source", "")))

        lines.extend(
            [
                f"export const {const_name}: BenchmarkSpec = {{",
                f"  id: '{benchmark_id}',",
                f"  version: '{version}',",
                f"  name: '{_esc(spec['name'])}',",
                f"  description: '{desc}',",
                f"  category: '{validated['category']}',",
                f"  task_count: {validated['task_count']},",
                f"  metric: '{metric}',",
                f"  source: '{source}',",
                f"  difficulty: '{validated['difficulty']}',",
                f"  languages: {_ts_list(spec.get('languages', []))},",
                f"  dataset_source: '{validated['dataset_source']}',",
                # Python's True/False are lower-cased into TS true/false.
                f"  supports_live_monitoring: {str(spec.get('supports_live_monitoring', False)).lower()},",
                f"  supports_experiment_comparison: {str(spec.get('supports_experiment_comparison', True)).lower()},",
                f"  evaluator_shapes: {_ts_list(validated['evaluator_shapes'])},",
                f"  evaluators: {_ts_list(validated['evaluators'])},",
                f"  recommended_windows: {_ts_list(spec.get('recommended_windows', ['1h', '6h', '24h', '7d', '30d']))},",
                f"  trace_integration: {str(spec.get('trace_integration', True)).lower()},",
                f"  dataset_editability: '{validated['dataset_editability']}',",
                f"  sdk_support: '{validated['sdk_support']}',",
                "};",
                "",
            ]
        )

    lines.extend(
        [
            "// " + "=" * 76,
            "// Benchmark Catalog",
            "// " + "=" * 76,
            "",
            "export const BENCHMARK_CATALOG: Record<string, BenchmarkSpec> = {",
        ]
    )
    lines.extend(
        f"  '{benchmark_id}': {const_name},"
        for benchmark_id, const_name in catalog_entries
    )
    lines.extend(
        [
            "};",
            "",
            "export function getBenchmarkSpecs(): BenchmarkSpec[] {",
            "  return Object.values(BENCHMARK_CATALOG);",
            "}",
            "",
            "function resolveBenchmarkId(benchmarkId: string): string {",
            "  if (benchmarkId in BENCHMARK_CATALOG) return benchmarkId;",
            "  const idx = benchmarkId.lastIndexOf(':');",
            "  if (idx > 0) {",
            "    const base = benchmarkId.slice(0, idx);",
            "    if (base in BENCHMARK_CATALOG) return base;",
            "  }",
            "  return benchmarkId;",
            "}",
            "",
            "export function getBenchmarkSpec(benchmarkId: string): BenchmarkSpec | undefined {",
            "  return BENCHMARK_CATALOG[resolveBenchmarkId(benchmarkId)];",
            "}",
            "",
        ]
    )
    return "\n".join(lines)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def main() -> None:
    """Generate the Python and TypeScript benchmark modules from YAML specs.

    Command-line options (all required):
        --specs-dir          directory holding the benchmark specs
        --eval-specs-dir     directory holding the eval specs
        --python-output      path of the Python module to write
        --typescript-output  path of the TypeScript module to write

    The ids of the loaded eval specs are collected and handed to both code
    generators together with the benchmark specs. Parent directories of the
    output files are created when missing.

    Exits with status 1 when either specs directory does not exist or when
    no eval spec with a string ``id`` could be loaded.
    """
    parser = argparse.ArgumentParser(
        description="Generate Python and TypeScript code from YAML benchmark specifications"
    )
    parser.add_argument("--specs-dir", type=Path, required=True)
    parser.add_argument("--eval-specs-dir", type=Path, required=True)
    parser.add_argument("--python-output", type=Path, required=True)
    parser.add_argument("--typescript-output", type=Path, required=True)
    args = parser.parse_args()

    # Diagnostics go to stderr so they are not mixed into the progress output
    # and stay visible when stdout is redirected.
    if not args.specs_dir.exists():
        print(
            f"Error: Specs directory does not exist: {args.specs_dir}",
            file=sys.stderr,
        )
        sys.exit(1)
    if not args.eval_specs_dir.exists():
        print(
            f"Error: Eval specs directory does not exist: {args.eval_specs_dir}",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"Loading benchmark specs from {args.specs_dir}...")
    specs = load_specs(args.specs_dir)
    print(f"Loaded {len(specs)} benchmark specifications")
    print(f"Loading eval specs from {args.eval_specs_dir}...")
    eval_specs = load_specs(args.eval_specs_dir)
    # Only mapping entries that carry a string id contribute a known eval id.
    eval_ids = {
        str(spec["id"])
        for spec in eval_specs
        if isinstance(spec, dict) and isinstance(spec.get("id"), str)
    }
    if not eval_ids:
        print(
            "Error: No eval specifications found for benchmark evaluator validation",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Generating Python code...")
    python_code = generate_python_code(specs, eval_ids)
    args.python_output.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: the platform default may be unable to represent
    # non-ASCII text coming from the specs.
    args.python_output.write_text(python_code, encoding="utf-8")
    print(f"✓ Generated {args.python_output}")

    print("Generating TypeScript code...")
    typescript_code = generate_typescript_code(specs, eval_ids)
    args.typescript_output.parent.mkdir(parents=True, exist_ok=True)
    args.typescript_output.write_text(typescript_code, encoding="utf-8")
    print(f"✓ Generated {args.typescript_output}")

    print(f"\n✓ Successfully generated code from {len(specs)} benchmark specs")


if __name__ == "__main__":
    main()
|
|
@@ -6,13 +6,14 @@
|
|
|
6
6
|
Generate Python and TypeScript code from YAML eval specifications.
|
|
7
7
|
|
|
8
8
|
Usage:
|
|
9
|
-
python generate_evals.py
|
|
10
|
-
--specs-dir specs/evals
|
|
11
|
-
--python-output agent_runtimes/specs/evals.py
|
|
9
|
+
python generate_evals.py \
|
|
10
|
+
--specs-dir specs/evals \
|
|
11
|
+
--python-output agent_runtimes/specs/evals.py \
|
|
12
12
|
--typescript-output src/specs/evals.ts
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
15
|
import argparse
|
|
16
|
+
import json
|
|
16
17
|
import sys
|
|
17
18
|
from pathlib import Path
|
|
18
19
|
from typing import Any
|
|
@@ -20,6 +21,65 @@ from typing import Any
|
|
|
20
21
|
import yaml
|
|
21
22
|
from versioning import ensure_spec_version, version_suffix
|
|
22
23
|
|
|
24
|
+
# Values accepted for the ``category`` field of an eval spec
# (checked by ``_validate_eval_spec``).
ALLOWED_EVAL_CATEGORIES = {
    "Comparison", "Type Validation", "Performance",
    "LLM-as-a-Judge", "Span-Based", "Report",
}

# Values accepted for the ``evaluator_type`` field; the validator lowercases
# the field before comparing against this set.
ALLOWED_EVALUATOR_TYPES = {"case", "report"}

# Values accepted for the ``output_kind`` field of an eval spec.
ALLOWED_OUTPUT_KINDS = {
    "boolean", "boolean_with_reason",
    "score", "score_and_assertion",
    "report_table", "report_curve",
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _required_str(spec: dict[str, Any], key: str) -> str:
|
|
46
|
+
"""Return required non-empty string key or raise with actionable context."""
|
|
47
|
+
value = spec.get(key)
|
|
48
|
+
if not isinstance(value, str) or not value.strip():
|
|
49
|
+
raise ValueError(
|
|
50
|
+
f"Invalid eval spec '{spec.get('id', '<unknown>')}': missing required field '{key}'"
|
|
51
|
+
)
|
|
52
|
+
return value.strip()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _validate_eval_spec(spec: dict[str, Any]) -> dict[str, str]:
    """Check an eval spec's required fields and return them normalized.

    Args:
        spec: Eval spec mapping to validate.

    Returns:
        A dict with the keys ``category``, ``evaluator_type``,
        ``pydantic_class`` and ``output_kind``. Every value is stripped of
        surrounding whitespace; ``evaluator_type`` is also lowercased.

    Raises:
        ValueError: If a required field is missing or blank, or if
            ``category``, ``evaluator_type`` or ``output_kind`` is not one
            of the allowed values.
    """
    spec_id = str(spec.get("id") or "<unknown>")

    # All required fields are read before any membership check, so a missing
    # field is reported ahead of a disallowed value.
    normalized = {
        "category": _required_str(spec, "category"),
        "evaluator_type": _required_str(spec, "evaluator_type").lower(),
        "pydantic_class": _required_str(spec, "pydantic_class"),
        "output_kind": _required_str(spec, "output_kind"),
    }

    # Fields restricted to a closed vocabulary, in the order they are checked.
    constrained = (
        ("category", ALLOWED_EVAL_CATEGORIES),
        ("evaluator_type", ALLOWED_EVALUATOR_TYPES),
        ("output_kind", ALLOWED_OUTPUT_KINDS),
    )
    for field, allowed in constrained:
        value = normalized[field]
        if value not in allowed:
            raise ValueError(
                f"Invalid eval spec '{spec_id}': {field} '{value}' not in {sorted(allowed)}"
            )

    return normalized
|
|
82
|
+
|
|
23
83
|
|
|
24
84
|
def _fmt_list(items: list[str]) -> str:
|
|
25
85
|
"""Format a list of strings with double quotes for ruff compliance."""
|
|
@@ -64,7 +124,7 @@ def generate_python_code(specs: list[dict[str, Any]]) -> str:
|
|
|
64
124
|
'"""',
|
|
65
125
|
"Eval Catalog.",
|
|
66
126
|
"",
|
|
67
|
-
"Predefined
|
|
127
|
+
"Predefined built-in evaluator configurations.",
|
|
68
128
|
"",
|
|
69
129
|
"This file is AUTO-GENERATED from YAML specifications.",
|
|
70
130
|
"DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
|
|
@@ -84,24 +144,33 @@ def generate_python_code(specs: list[dict[str, Any]]) -> str:
|
|
|
84
144
|
for spec in specs:
|
|
85
145
|
eval_id = spec["id"]
|
|
86
146
|
version = spec["version"]
|
|
147
|
+
validated = _validate_eval_spec(spec)
|
|
148
|
+
evaluator_type = validated["evaluator_type"]
|
|
149
|
+
output_kind = validated["output_kind"]
|
|
150
|
+
pydantic_class = validated["pydantic_class"]
|
|
151
|
+
category = validated["category"]
|
|
87
152
|
const_name = (
|
|
88
153
|
f"{eval_id.upper().replace('-', '_')}_EVAL_SPEC{version_suffix(version)}"
|
|
89
154
|
)
|
|
90
155
|
desc = _esc_dq(spec.get("description", "").strip().replace("\n", " "))
|
|
156
|
+
default_config = repr(spec.get("default_config", {}))
|
|
91
157
|
|
|
92
158
|
lines.extend(
|
|
93
159
|
[
|
|
94
160
|
f"{const_name} = EvalSpec(",
|
|
95
161
|
f' id="{eval_id}",',
|
|
96
162
|
f' version="{version}",',
|
|
97
|
-
f' name="{spec["name"]}",',
|
|
163
|
+
f' name="{_esc_dq(spec["name"])}",',
|
|
98
164
|
f' description="{desc}",',
|
|
99
|
-
f' category="{
|
|
100
|
-
f
|
|
101
|
-
f'
|
|
165
|
+
f' category="{category}",',
|
|
166
|
+
f' evaluator_type="{evaluator_type}",',
|
|
167
|
+
f' pydantic_class="{pydantic_class}",',
|
|
168
|
+
f' output_kind="{output_kind}",',
|
|
169
|
+
f' cost_tier="{spec.get("cost_tier", "free")}",',
|
|
170
|
+
f' latency="{spec.get("latency", "instant")}",',
|
|
171
|
+
f" requires={_fmt_list(spec.get('requires', []))},",
|
|
102
172
|
f' source="{spec.get("source", "")}",',
|
|
103
|
-
f
|
|
104
|
-
f" languages={_fmt_list(spec.get('languages', []))},",
|
|
173
|
+
f" default_config={default_config},",
|
|
105
174
|
")",
|
|
106
175
|
"",
|
|
107
176
|
]
|
|
@@ -159,7 +228,7 @@ def generate_typescript_code(specs: list[dict[str, Any]]) -> str:
|
|
|
159
228
|
"/**",
|
|
160
229
|
" * Eval Catalog",
|
|
161
230
|
" *",
|
|
162
|
-
" * Predefined
|
|
231
|
+
" * Predefined built-in evaluator configurations.",
|
|
163
232
|
" *",
|
|
164
233
|
" * This file is AUTO-GENERATED from YAML specifications.",
|
|
165
234
|
" * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.",
|
|
@@ -176,10 +245,16 @@ def generate_typescript_code(specs: list[dict[str, Any]]) -> str:
|
|
|
176
245
|
for spec in specs:
|
|
177
246
|
eval_id = spec["id"]
|
|
178
247
|
version = spec["version"]
|
|
248
|
+
validated = _validate_eval_spec(spec)
|
|
249
|
+
evaluator_type = validated["evaluator_type"]
|
|
250
|
+
output_kind = validated["output_kind"]
|
|
251
|
+
pydantic_class = validated["pydantic_class"]
|
|
252
|
+
category = validated["category"]
|
|
179
253
|
const_name = (
|
|
180
254
|
f"{eval_id.upper().replace('-', '_')}_EVAL_SPEC{version_suffix(version)}"
|
|
181
255
|
)
|
|
182
256
|
desc = _esc(spec.get("description", "").strip().replace("\n", " "))
|
|
257
|
+
default_config = json.dumps(spec.get("default_config", {}), ensure_ascii=True)
|
|
183
258
|
|
|
184
259
|
lines.extend(
|
|
185
260
|
[
|
|
@@ -188,12 +263,15 @@ def generate_typescript_code(specs: list[dict[str, Any]]) -> str:
|
|
|
188
263
|
f" version: '{version}',",
|
|
189
264
|
f" name: '{_esc(spec['name'])}',",
|
|
190
265
|
f" description: '{desc}',",
|
|
191
|
-
f" category: '{
|
|
192
|
-
f"
|
|
193
|
-
f"
|
|
266
|
+
f" category: '{category}',",
|
|
267
|
+
f" evaluator_type: '{evaluator_type}',",
|
|
268
|
+
f" pydantic_class: '{pydantic_class}',",
|
|
269
|
+
f" output_kind: '{output_kind}',",
|
|
270
|
+
f" cost_tier: '{spec.get('cost_tier', 'free')}',",
|
|
271
|
+
f" latency: '{spec.get('latency', 'instant')}',",
|
|
272
|
+
f" requires: {_ts_list(spec.get('requires', []))},",
|
|
194
273
|
f" source: '{spec.get('source', '')}',",
|
|
195
|
-
f"
|
|
196
|
-
f" languages: {_ts_list(spec.get('languages', []))},",
|
|
274
|
+
f" default_config: {default_config},",
|
|
197
275
|
"};",
|
|
198
276
|
"",
|
|
199
277
|
]
|