npm - @datalayer/agent-runtimes - Versions diffs - 1.0.4 → 1.0.6 - Mend

@datalayer/agent-runtimes 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

package/README.md +182 -1
package/lib/AgentNode.d.ts +3 -0
package/lib/AgentNode.js +676 -0
package/lib/App.js +1 -1
package/lib/agent-node/themeStore.d.ts +3 -0
package/lib/agent-node/themeStore.js +156 -0
package/lib/agent-node-main.d.ts +1 -0
package/lib/agent-node-main.js +14 -0
package/lib/agents/AgentDetails.d.ts +22 -1
package/lib/agents/AgentDetails.js +34 -47
package/lib/api/index.d.ts +0 -1
package/lib/api/index.js +4 -2
package/lib/chat/Chat.d.ts +5 -106
package/lib/chat/Chat.js +20 -14
package/lib/chat/ChatFloating.d.ts +7 -140
package/lib/chat/ChatFloating.js +3 -3
package/lib/chat/ChatPopupStandalone.d.ts +8 -47
package/lib/chat/ChatPopupStandalone.js +3 -3
package/lib/chat/ChatSidebar.d.ts +4 -69
package/lib/chat/ChatSidebar.js +83 -51
package/lib/chat/ChatStandalone.d.ts +4 -54
package/lib/chat/ChatStandalone.js +3 -3
package/lib/chat/base/ChatBase.js +1414 -174
package/lib/chat/display/FloatingBrandButton.js +8 -1
package/lib/chat/header/ChatHeader.d.ts +3 -1
package/lib/chat/header/ChatHeader.js +15 -12
package/lib/chat/header/ChatHeaderBase.d.ts +30 -5
package/lib/chat/header/ChatHeaderBase.js +41 -16
package/lib/chat/indicators/McpStatusIndicator.d.ts +7 -4
package/lib/chat/indicators/McpStatusIndicator.js +7 -32
package/lib/chat/indicators/SandboxStatusIndicator.d.ts +4 -1
package/lib/chat/indicators/SandboxStatusIndicator.js +91 -56
package/lib/chat/indicators/SkillsStatusIndicator.d.ts +7 -0
package/lib/chat/indicators/SkillsStatusIndicator.js +88 -0
package/lib/chat/indicators/index.d.ts +1 -0
package/lib/chat/indicators/index.js +1 -0
package/lib/chat/messages/ChatMessageList.d.ts +1 -1
package/lib/chat/messages/ChatMessageList.js +154 -114
package/lib/chat/messages/ChatMessages.js +6 -2
package/lib/chat/prompt/InputFooter.d.ts +21 -6
package/lib/chat/prompt/InputFooter.js +76 -20
package/lib/chat/prompt/InputPrompt.d.ts +5 -1
package/lib/chat/prompt/InputPrompt.js +4 -4
package/lib/chat/prompt/InputPromptFooter.d.ts +3 -1
package/lib/chat/prompt/InputPromptFooter.js +3 -3
package/lib/chat/prompt/InputPromptLexical.d.ts +3 -1
package/lib/chat/prompt/InputPromptLexical.js +12 -5
package/lib/chat/prompt/InputPromptText.d.ts +3 -1
package/lib/chat/prompt/InputPromptText.js +2 -2
package/lib/chat/tools/ToolApprovalBanner.js +1 -1
package/lib/chat/tools/ToolCallDisplay.d.ts +3 -1
package/lib/chat/tools/ToolCallDisplay.js +2 -2
package/lib/chat/usage/TokenUsageBar.js +20 -2
package/lib/client/AgentRuntimesClientContext.d.ts +53 -0
package/lib/client/AgentRuntimesClientContext.js +55 -0
package/lib/client/AgentsMixin.d.ts +0 -18
package/lib/client/AgentsMixin.js +20 -30
package/lib/client/IAgentRuntimesClient.d.ts +215 -0
package/lib/client/IAgentRuntimesClient.js +5 -0
package/lib/client/SdkAgentRuntimesClient.d.ts +151 -0
package/lib/client/SdkAgentRuntimesClient.js +134 -0
package/lib/client/index.d.ts +4 -1
package/lib/client/index.js +3 -1
package/lib/components/NotificationEventCard.js +5 -1
package/lib/config/AgentConfiguration.d.ts +22 -0
package/lib/config/AgentConfiguration.js +319 -64
package/lib/context/ContextDistribution.d.ts +3 -1
package/lib/context/ContextDistribution.js +8 -27
package/lib/context/ContextInspector.d.ts +3 -1
package/lib/context/ContextInspector.js +19 -67
package/lib/context/ContextPanel.d.ts +3 -1
package/lib/context/ContextPanel.js +104 -64
package/lib/context/ContextUsage.d.ts +3 -1
package/lib/context/ContextUsage.js +3 -3
package/lib/context/CostTracker.d.ts +9 -3
package/lib/context/CostTracker.js +26 -47
package/lib/context/CostUsageChart.d.ts +12 -0
package/lib/context/CostUsageChart.js +378 -0
package/lib/context/GraphFlowChart.d.ts +16 -0
package/lib/context/GraphFlowChart.js +182 -0
package/lib/context/TokenUsageChart.d.ts +8 -1
package/lib/context/TokenUsageChart.js +349 -211
package/lib/context/TurnGraphChart.d.ts +39 -0
package/lib/context/TurnGraphChart.js +538 -0
package/lib/context/otelWsPool.d.ts +20 -0
package/lib/context/otelWsPool.js +69 -0
package/lib/examples/A2UiComponentGalleryExample.d.ts +0 -17
package/lib/examples/A2UiComponentGalleryExample.js +315 -522
package/lib/examples/A2UiContactCardExample.d.ts +0 -18
package/lib/examples/A2UiContactCardExample.js +154 -411
package/lib/examples/A2UiRestaurantExample.d.ts +0 -30
package/lib/examples/A2UiRestaurantExample.js +114 -212
package/lib/examples/A2UiViewerExample.d.ts +0 -18
package/lib/examples/A2UiViewerExample.js +283 -532
package/lib/examples/AgUiBackendToolRenderingExample.js +1 -1
package/lib/examples/AgUiHaikuGenUiExample.d.ts +1 -1
package/lib/examples/AgUiHaikuGenUiExample.js +1 -1
package/lib/examples/AgUiSharedStateExample.js +2 -1
package/lib/examples/AgentCheckpointsExample.js +14 -28
package/lib/examples/AgentCodemodeExample.d.ts +4 -6
package/lib/examples/AgentCodemodeExample.js +603 -169
package/lib/examples/AgentEvalsExample.js +339 -53
package/lib/examples/AgentGuardrailsExample.js +383 -66
package/lib/examples/AgentHooksExample.d.ts +3 -0
package/lib/examples/AgentHooksExample.js +122 -0
package/lib/examples/AgentInferenceProviderExample.d.ts +3 -0
package/lib/examples/AgentInferenceProviderExample.js +329 -0
package/lib/examples/AgentMCPExample.d.ts +3 -0
package/lib/examples/AgentMCPExample.js +481 -0
package/lib/examples/AgentMemoryExample.d.ts +1 -2
package/lib/examples/AgentMemoryExample.js +78 -33
package/lib/examples/AgentMonitoringExample.js +261 -200
package/lib/examples/AgentNotificationsExample.d.ts +1 -2
package/lib/examples/AgentNotificationsExample.js +114 -33
package/lib/examples/AgentOtelExample.js +32 -42
package/lib/examples/AgentOutputsExample.d.ts +11 -6
package/lib/examples/AgentOutputsExample.js +433 -81
package/lib/examples/AgentParametersExample.d.ts +3 -0
package/lib/examples/AgentParametersExample.js +248 -0
package/lib/examples/AgentSandboxExample.d.ts +3 -3
package/lib/examples/AgentSandboxExample.js +74 -45
package/lib/examples/AgentSkillsExample.js +95 -103
package/lib/examples/AgentSubagentsExample.d.ts +14 -0
package/lib/examples/AgentSubagentsExample.js +228 -0
package/lib/examples/AgentToolApprovalsExample.js +49 -561
package/lib/examples/AgentTriggersExample.js +823 -569
package/lib/examples/{AgentspecExample.d.ts → AgentspecsExample.d.ts} +2 -2
package/lib/examples/AgentspecsExample.js +1096 -0
package/lib/examples/ChatCustomExample.js +16 -28
package/lib/examples/ChatExample.js +13 -29
package/lib/examples/CopilotKitLexicalExample.js +2 -1
package/lib/examples/CopilotKitNotebookExample.js +2 -1
package/lib/examples/HomeExample.d.ts +15 -0
package/lib/examples/HomeExample.js +77 -0
package/lib/examples/Lexical2Example.js +4 -2
package/lib/examples/{LexicalExample.d.ts → LexicalAgentExample.d.ts} +4 -4
package/lib/examples/{LexicalExample.js → LexicalAgentExample.js} +66 -17
package/lib/examples/{LexicalSidebarExample.d.ts → LexicalAgentSidebarExample.d.ts} +5 -5
package/lib/examples/LexicalAgentSidebarExample.js +261 -0
package/lib/examples/NotebookAgentExample.d.ts +9 -0
package/lib/examples/NotebookAgentExample.js +192 -0
package/lib/examples/{NotebookSidebarExample.d.ts → NotebookAgentSidebarExample.d.ts} +2 -2
package/lib/examples/NotebookAgentSidebarExample.js +221 -0
package/lib/examples/{DatalayerNotebookExample.d.ts → NotebookCollaborationExample.d.ts} +4 -4
package/lib/examples/{DatalayerNotebookExample.js → NotebookCollaborationExample.js} +3 -3
package/lib/examples/NotebookExample.d.ts +4 -7
package/lib/examples/NotebookExample.js +14 -146
package/lib/examples/components/AuthRequiredView.d.ts +6 -0
package/lib/examples/components/AuthRequiredView.js +33 -0
package/lib/examples/components/ExampleWrapper.d.ts +9 -3
package/lib/examples/components/ExampleWrapper.js +45 -9
package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.js +1 -1
package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.js +1 -1
package/lib/examples/{ag-ui → components}/haiku/index.d.ts +1 -1
package/lib/examples/{ag-ui → components}/haiku/index.js +1 -1
package/lib/examples/components/index.d.ts +3 -0
package/lib/examples/components/index.js +4 -0
package/lib/examples/{ag-ui → components}/weather/index.d.ts +1 -1
package/lib/examples/{ag-ui → components}/weather/index.js +1 -1
package/lib/examples/example-selector.d.ts +17 -4
package/lib/examples/example-selector.js +108 -41
package/lib/examples/index.d.ts +10 -6
package/lib/examples/index.js +10 -6
package/lib/examples/lexical/initial-content.json +6 -6
package/lib/examples/main.js +257 -27
package/lib/examples/utils/a2ui.d.ts +18 -0
package/lib/examples/utils/a2ui.js +69 -0
package/lib/examples/utils/a2uiMarkdownProvider.d.ts +7 -0
package/lib/examples/utils/a2uiMarkdownProvider.js +9 -0
package/lib/examples/utils/agentId.d.ts +18 -0
package/lib/examples/utils/agentId.js +54 -0
package/lib/examples/utils/agents/earthquake-detector.json +11 -11
package/lib/examples/utils/agents/sales-forecaster.json +11 -11
package/lib/examples/utils/agents/social-post-generator.json +11 -11
package/lib/examples/utils/agents/stock-market.json +11 -11
package/lib/examples/utils/examplesStore.js +82 -27
package/lib/examples/utils/useExampleAgentRuntimesUrl.d.ts +5 -0
package/lib/examples/utils/useExampleAgentRuntimesUrl.js +19 -0
package/lib/hooks/index.d.ts +8 -8
package/lib/hooks/index.js +7 -7
package/lib/hooks/useA2A.d.ts +2 -3
package/lib/hooks/useAIAgentsWebSocket.d.ts +43 -4
package/lib/hooks/useAIAgentsWebSocket.js +153 -12
package/lib/hooks/useAcp.d.ts +1 -2
package/lib/hooks/useAgUi.d.ts +1 -1
package/lib/hooks/{useAgents.d.ts → useAgentRuntimes.d.ts} +70 -4
package/lib/hooks/{useAgents.js → useAgentRuntimes.js} +237 -32
package/lib/hooks/useAgentsCatalog.js +1 -1
package/lib/hooks/useAgentsService.d.ts +2 -2
package/lib/hooks/useAgentsService.js +7 -7
package/lib/hooks/useCheckpoints.js +1 -1
package/lib/hooks/useConfig.d.ts +4 -1
package/lib/hooks/useConfig.js +10 -3
package/lib/hooks/useContextSnapshot.d.ts +9 -4
package/lib/hooks/useContextSnapshot.js +9 -37
package/lib/hooks/useMonitoring.js +3 -0
package/lib/hooks/useSandbox.d.ts +20 -8
package/lib/hooks/useSandbox.js +105 -40
package/lib/hooks/useSkills.d.ts +23 -5
package/lib/hooks/useSkills.js +94 -39
package/lib/hooks/useToolApprovals.d.ts +60 -36
package/lib/hooks/useToolApprovals.js +318 -69
package/lib/hooks/useVercelAI.d.ts +1 -1
package/lib/index.d.ts +2 -1
package/lib/index.js +1 -0
package/lib/inference/index.d.ts +0 -1
package/lib/middleware/index.d.ts +0 -1
package/lib/protocols/AGUIAdapter.js +6 -0
package/lib/protocols/VercelAIAdapter.d.ts +7 -0
package/lib/protocols/VercelAIAdapter.js +59 -7
package/lib/specs/agents/agents.d.ts +21 -4
package/lib/specs/agents/agents.js +2879 -316
package/lib/specs/agents/index.js +3 -1
package/lib/specs/benchmarks.d.ts +20 -0
package/lib/specs/benchmarks.js +205 -0
package/lib/specs/envvars.js +27 -20
package/lib/specs/evals.d.ts +10 -9
package/lib/specs/evals.js +128 -88
package/lib/specs/events.d.ts +3 -10
package/lib/specs/events.js +127 -84
package/lib/specs/frontendTools.js +2 -2
package/lib/specs/guardrails.d.ts +0 -7
package/lib/specs/guardrails.js +240 -159
package/lib/specs/mcpServers.js +35 -6
package/lib/specs/memory.d.ts +0 -2
package/lib/specs/memory.js +4 -17
package/lib/specs/models.d.ts +0 -2
package/lib/specs/models.js +20 -15
package/lib/specs/notifications.js +102 -18
package/lib/specs/outputs.js +15 -9
package/lib/specs/personas.d.ts +41 -0
package/lib/specs/personas.js +168 -0
package/lib/specs/skills.d.ts +1 -1
package/lib/specs/skills.js +23 -23
package/lib/specs/teams/index.js +3 -1
package/lib/specs/teams/teams.js +468 -348
package/lib/specs/tools.js +4 -4
package/lib/specs/triggers.js +61 -11
package/lib/stores/agentRuntimeStore.d.ts +208 -0
package/lib/stores/agentRuntimeStore.js +650 -0
package/lib/stores/conversationStore.js +2 -2
package/lib/stores/index.d.ts +1 -1
package/lib/stores/index.js +1 -1
package/lib/tools/adapters/copilotkit/lexicalHooks.d.ts +1 -2
package/lib/tools/adapters/copilotkit/lexicalHooks.js +1 -3
package/lib/tools/adapters/copilotkit/notebookHooks.d.ts +1 -2
package/lib/tools/adapters/copilotkit/notebookHooks.js +1 -3
package/lib/tools/index.d.ts +0 -2
package/lib/tools/index.js +0 -1
package/lib/types/agents-lifecycle.d.ts +18 -0
package/lib/types/agents.d.ts +6 -0
package/lib/types/agentspecs.d.ts +54 -1
package/lib/types/benchmarks.d.ts +43 -0
package/lib/types/benchmarks.js +5 -0
package/lib/types/chat.d.ts +325 -8
package/lib/types/context.d.ts +27 -0
package/lib/types/cost.d.ts +2 -2
package/lib/types/evals.d.ts +26 -17
package/lib/types/index.d.ts +3 -0
package/lib/types/index.js +3 -0
package/lib/types/mcp.d.ts +8 -0
package/lib/types/models.d.ts +2 -2
package/lib/types/personas.d.ts +25 -0
package/lib/types/personas.js +5 -0
package/lib/types/skills.d.ts +43 -1
package/lib/types/stream.d.ts +110 -0
package/lib/types/stream.js +36 -0
package/lib/utils/utils.d.ts +9 -5
package/lib/utils/utils.js +9 -5
package/package.json +19 -11
package/scripts/codegen/__pycache__/generate_agents.cpython-313.pyc +0 -0
package/scripts/codegen/__pycache__/generate_benchmarks.cpython-313.pyc +0 -0
package/scripts/codegen/__pycache__/generate_evals.cpython-313.pyc +0 -0
package/scripts/codegen/__pycache__/generate_events.cpython-313.pyc +0 -0
package/scripts/codegen/__pycache__/versioning.cpython-313.pyc +0 -0
package/scripts/codegen/generate_agents.py +187 -45
package/scripts/codegen/generate_benchmarks.py +441 -0
package/scripts/codegen/generate_evals.py +94 -16
package/scripts/codegen/generate_events.py +35 -14
package/scripts/codegen/generate_personas.py +319 -0
package/scripts/codegen/generate_skills.py +9 -9
package/scripts/sync-jupyter.sh +26 -7
package/lib/api/tool-approvals.d.ts +0 -62
package/lib/api/tool-approvals.js +0 -145
package/lib/examples/AgentspecExample.js +0 -705
package/lib/examples/LexicalSidebarExample.js +0 -163
package/lib/examples/NotebookSidebarExample.js +0 -119
package/lib/examples/NotebookSimpleExample.d.ts +0 -6
package/lib/examples/NotebookSimpleExample.js +0 -22
package/lib/examples/ag-ui/index.d.ts +0 -10
package/lib/examples/ag-ui/index.js +0 -16
package/lib/hooks/useAgentsRegistry.d.ts +0 -10
package/lib/hooks/useAgentsRegistry.js +0 -20
package/lib/stores/agentsStore.d.ts +0 -123
package/lib/stores/agentsStore.js +0 -270
/package/lib/examples/{ag-ui → components}/haiku/HaikuDisplay.d.ts +0 -0
/package/lib/examples/{ag-ui → components}/haiku/InlineHaikuCard.d.ts +0 -0
/package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.d.ts +0 -0
/package/lib/examples/{ag-ui → components}/weather/InlineWeatherCard.js +0 -0

package/lib/specs/agents/index.js CHANGED Viewed

@@ -31,7 +31,9 @@ export function getAgentSpecs(agentId) {
  */
 export function listAgentSpecs(prefix) {
     const specs = Object.values(AGENT_SPECS);
-    return prefix !== undefined ? specs.filter(s => s.id.startsWith(prefix)) : specs;
+    return prefix !== undefined
+        ? specs.filter(s => s.id.startsWith(prefix))
+        : specs;
 }
 /**
  * Collect all required environment variables for an agent spec.

package/lib/specs/benchmarks.d.ts ADDED Viewed

@@ -0,0 +1,20 @@
+/**
+ * Benchmark Catalog
+ *
+ * Predefined evaluation benchmark configurations.
+ *
+ * This file is AUTO-GENERATED from YAML specifications.
+ * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.
+ */
+import type { BenchmarkSpec } from '../types';
+export declare const AGENTBENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const HUMANEVAL_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const MMLU_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const SWE_BENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const TOOLBENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const TRUTHFULQA_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
+export declare const BENCHMARK_CATALOG: Record<string, BenchmarkSpec>;
+export declare function getBenchmarkSpecs(): BenchmarkSpec[];
+export declare function getBenchmarkSpec(benchmarkId: string): BenchmarkSpec | undefined;

package/lib/specs/benchmarks.js ADDED Viewed

@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2025-2026 Datalayer, Inc.
+ * Distributed under the terms of the Modified BSD License.
+ */
+// ============================================================================
+// Benchmark Definitions
+// ============================================================================
+export const AGENTBENCH_BENCHMARK_SPEC_0_0_1 = {
+    id: 'agentbench',
+    version: '0.0.1',
+    name: 'AgentBench',
+    description: 'Multi-dimensional LLM-as-agent evaluation across 8 diverse environments including web browsing, operating system interaction, database queries, digital card games, lateral thinking, and household tasks.',
+    category: 'Agentic',
+    task_count: 4080,
+    metric: 'success_rate',
+    source: 'https://github.com/THUDM/AgentBench',
+    difficulty: 'hard',
+    languages: ['python', 'sql', 'bash'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: true,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['pass_rate', 'numeric'],
+    evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+export const GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1 = {
+    id: 'gpqa-diamond',
+    version: '0.0.1',
+    name: 'GPQA Diamond',
+    description: 'Graduate-level science questions crafted by domain experts. Tests advanced reasoning in physics, chemistry, and biology with questions that require PhD-level understanding to answer correctly.',
+    category: 'Knowledge',
+    task_count: 448,
+    metric: 'accuracy',
+    source: 'https://github.com/idavidrein/gpqa',
+    difficulty: 'expert',
+    languages: ['english'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: false,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['numeric'],
+    evaluators: ['precision-recall-evaluator:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+export const HUMANEVAL_BENCHMARK_SPEC_0_0_1 = {
+    id: 'humaneval',
+    version: '0.0.1',
+    name: 'HumanEval',
+    description: 'Python function implementation from docstrings. Measures functional correctness of code generation by testing against hand-written test cases. Widely used as a baseline for code generation benchmarks.',
+    category: 'Coding',
+    task_count: 164,
+    metric: 'pass@k',
+    source: 'https://github.com/openai/human-eval',
+    difficulty: 'medium',
+    languages: ['python'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: false,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['pass_rate'],
+    evaluators: ['precision-recall-evaluator:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+export const MMLU_BENCHMARK_SPEC_0_0_1 = {
+    id: 'mmlu',
+    version: '0.0.1',
+    name: 'MMLU',
+    description: 'Massive Multitask Language Understanding: 57-subject knowledge benchmark spanning STEM, humanities, social sciences, and more. Tests broad knowledge and reasoning across diverse academic domains.',
+    category: 'Knowledge',
+    task_count: 15908,
+    metric: 'accuracy',
+    source: 'https://github.com/hendrycks/test',
+    difficulty: 'medium',
+    languages: ['english'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: false,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['numeric'],
+    evaluators: ['precision-recall-evaluator:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+export const SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1 = {
+    id: 'swe-bench-verified',
+    version: '0.0.1',
+    name: 'SWE-bench Verified',
+    description: 'Human-validated subset of SWE-bench with verified ground-truth patches. Provides higher confidence evaluation of software engineering capabilities by eliminating ambiguous or flawed test cases from the full benchmark.',
+    category: 'Coding',
+    task_count: 500,
+    metric: 'pass@1',
+    source: 'https://www.swebench.com/',
+    difficulty: 'hard',
+    languages: ['python'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: true,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['pass_rate'],
+    evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+export const SWE_BENCH_BENCHMARK_SPEC_0_0_1 = {
+    id: 'swe-bench',
+    version: '0.0.1',
+    name: 'SWE-bench',
+    description: "Real-world software engineering tasks from GitHub issues. Tests an agent's ability to understand bug reports and feature requests, then produce working code patches that pass existing test suites.",
+    category: 'Coding',
+    task_count: 2294,
+    metric: 'pass@1',
+    source: 'https://www.swebench.com/',
+    difficulty: 'hard',
+    languages: ['python'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: true,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['pass_rate'],
+    evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+export const TOOLBENCH_BENCHMARK_SPEC_0_0_1 = {
+    id: 'toolbench',
+    version: '0.0.1',
+    name: 'ToolBench',
+    description: 'Large-scale benchmark for tool-augmented LLMs covering 16000+ real-world APIs across 49 categories. Evaluates multi-step tool usage, API selection, argument generation, and response parsing in complex, chained workflows.',
+    category: 'Agentic',
+    task_count: 12657,
+    metric: 'pass_rate',
+    source: 'https://github.com/OpenBMB/ToolBench',
+    difficulty: 'hard',
+    languages: ['python', 'json'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: true,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['pass_rate', 'numeric'],
+    evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+export const TRUTHFULQA_BENCHMARK_SPEC_0_0_1 = {
+    id: 'truthfulqa',
+    version: '0.0.1',
+    name: 'TruthfulQA',
+    description: 'Benchmark measuring whether a language model generates truthful answers to questions spanning 38 categories including health, law, finance, and politics. Designed to test resilience against common human misconceptions and falsehoods that models may have learned from training data.',
+    category: 'Safety',
+    task_count: 817,
+    metric: 'truthful_informative',
+    source: 'https://github.com/sylinrl/TruthfulQA',
+    difficulty: 'medium',
+    languages: ['english'],
+    dataset_source: 'hosted',
+    supports_live_monitoring: false,
+    supports_experiment_comparison: true,
+    evaluator_shapes: ['categorical', 'numeric'],
+    evaluators: ['llm-judge:0.0.1'],
+    recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
+    trace_integration: true,
+    dataset_editability: 'read-only',
+    sdk_support: 'experimental',
+};
+// ============================================================================
+// Benchmark Catalog
+// ============================================================================
+export const BENCHMARK_CATALOG = {
+    agentbench: AGENTBENCH_BENCHMARK_SPEC_0_0_1,
+    'gpqa-diamond': GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1,
+    humaneval: HUMANEVAL_BENCHMARK_SPEC_0_0_1,
+    mmlu: MMLU_BENCHMARK_SPEC_0_0_1,
+    'swe-bench-verified': SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1,
+    'swe-bench': SWE_BENCH_BENCHMARK_SPEC_0_0_1,
+    toolbench: TOOLBENCH_BENCHMARK_SPEC_0_0_1,
+    truthfulqa: TRUTHFULQA_BENCHMARK_SPEC_0_0_1,
+};
+export function getBenchmarkSpecs() {
+    return Object.values(BENCHMARK_CATALOG);
+}
+function resolveBenchmarkId(benchmarkId) {
+    if (benchmarkId in BENCHMARK_CATALOG)
+        return benchmarkId;
+    const idx = benchmarkId.lastIndexOf(':');
+    if (idx > 0) {
+        const base = benchmarkId.slice(0, idx);
+        if (base in BENCHMARK_CATALOG)
+            return base;
+    }
+    return benchmarkId;
+}
+export function getBenchmarkSpec(benchmarkId) {
+    return BENCHMARK_CATALOG[resolveBenchmarkId(benchmarkId)];
+}

package/lib/specs/envvars.js CHANGED Viewed

@@ -11,7 +11,7 @@ export const ALPHAVANTAGE_API_KEY_SPEC_0_0_1 = {
     name: 'Alpha Vantage API Key',
     description: 'API key for accessing Alpha Vantage financial market data and stock information. Provides real-time and historical stock prices, forex data, and cryptocurrency information.',
     registrationUrl: 'https://www.alphavantage.co/support/#api-key',
-    tags: ["authentication", "api-key", "finance", "stocks", "market-data"],
+    tags: ['authentication', 'api-key', 'finance', 'stocks', 'market-data'],
     icon: 'key',
     emoji: '🔑',
 };
@@ -21,7 +21,7 @@ export const GITHUB_TOKEN_SPEC_0_0_1 = {
     name: 'GitHub Token',
     description: 'GitHub API token for repository management and code operations. Required for GitHub MCP server and GitHub skill to interact with GitHub repositories programmatically.',
     registrationUrl: 'https://github.com/settings/tokens',
-    tags: ["authentication", "token", "github", "git", "mcp-server", "skill"],
+    tags: ['authentication', 'token', 'github', 'git', 'mcp-server', 'skill'],
     icon: 'key',
     emoji: '🔑',
 };
@@ -31,7 +31,7 @@ export const GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1 = {
     name: 'Google OAuth Client ID',
     description: 'OAuth 2.0 client ID for Google Workspace authentication. Required for Google Drive, Gmail, Calendar, and Docs integration through the Google Workspace MCP server.',
     registrationUrl: 'https://console.cloud.google.com/apis/credentials',
-    tags: ["authentication", "oauth", "google", "workspace", "client-id"],
+    tags: ['authentication', 'oauth', 'google', 'workspace', 'client-id'],
     icon: 'key',
     emoji: '🔑',
 };
@@ -41,7 +41,14 @@ export const GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1 = {
     name: 'Google OAuth Client Secret',
     description: 'OAuth 2.0 client secret for Google Workspace authentication. Used in conjunction with client ID for secure API access to Google services.',
     registrationUrl: 'https://console.cloud.google.com/apis/credentials',
-    tags: ["authentication", "oauth", "google", "workspace", "client-secret", "security"],
+    tags: [
+        'authentication',
+        'oauth',
+        'google',
+        'workspace',
+        'client-secret',
+        'security',
+    ],
     icon: 'lock',
     emoji: '🔒',
 };
@@ -51,7 +58,7 @@ export const HF_TOKEN_SPEC_0_0_1 = {
     name: 'Hugging Face Token',
     description: 'Access token for Hugging Face API. Required for Hugging Face MCP server authentication. Create a READ token from your settings.',
     registrationUrl: 'https://huggingface.co/settings/tokens',
-    tags: ["authentication", "api-key", "huggingface", "machine-learning"],
+    tags: ['authentication', 'api-key', 'huggingface', 'machine-learning'],
     icon: 'key',
     emoji: '🔑',
 };
@@ -61,7 +68,7 @@ export const KAGGLE_TOKEN_SPEC_0_0_1 = {
     name: 'Kaggle API Token',
     description: 'API token for accessing Kaggle datasets, competitions, notebooks, and models. Required for Kaggle MCP server authentication.',
     registrationUrl: 'https://www.kaggle.com/settings/account',
-    tags: ["authentication", "api-key", "kaggle", "data"],
+    tags: ['authentication', 'api-key', 'kaggle', 'data'],
     icon: 'key',
     emoji: '🔑',
 };
@@ -71,7 +78,7 @@ export const SLACK_BOT_TOKEN_SPEC_0_0_1 = {
     name: 'Slack Bot Token',
     description: 'OAuth token for Slack bot authentication. Required for Slack MCP server to send messages, manage channels, and interact with workspace members.',
     registrationUrl: 'https://api.slack.com/apps',
-    tags: ["authentication", "oauth", "token", "slack", "messaging", "bot"],
+    tags: ['authentication', 'oauth', 'token', 'slack', 'messaging', 'bot'],
     icon: 'key',
     emoji: '🔑',
 };
@@ -80,7 +87,7 @@ export const SLACK_CHANNEL_IDS_SPEC_0_0_1 = {
     version: '0.0.1',
     name: 'Slack Channel IDs',
     description: 'Comma-separated list of Slack channel IDs that the bot is allowed to access. Restricts bot operations to specific channels for security and organization.',
-    tags: ["configuration", "slack", "channels", "identifier"],
+    tags: ['configuration', 'slack', 'channels', 'identifier'],
     icon: 'hash',
     emoji: undefined,
 };
@@ -90,7 +97,7 @@ export const SLACK_TEAM_ID_SPEC_0_0_1 = {
     name: 'Slack Team ID',
     description: 'Unique identifier for the Slack workspace (team). Required to specify which workspace the bot should connect to.',
     registrationUrl: 'https://api.slack.com/apps',
-    tags: ["configuration", "slack", "workspace", "identifier"],
+    tags: ['configuration', 'slack', 'workspace', 'identifier'],
     icon: 'organization',
     emoji: '🏢',
 };
@@ -100,7 +107,7 @@ export const TAVILY_API_KEY_SPEC_0_0_1 = {
     name: 'Tavily API Key',
     description: 'API key for Tavily web search and research capabilities. Required for web crawling, content extraction, and search operations.',
     registrationUrl: 'https://tavily.com/api-keys',
-    tags: ["authentication", "api-key", "search", "web", "research"],
+    tags: ['authentication', 'api-key', 'search', 'web', 'research'],
     icon: 'key',
     emoji: '🔑',
 };
@@ -108,16 +115,16 @@ export const TAVILY_API_KEY_SPEC_0_0_1 = {
 // Environment Variable Catalog
 // ============================================================================
 export const ENVVAR_CATALOG = {
-    'ALPHAVANTAGE_API_KEY': ALPHAVANTAGE_API_KEY_SPEC_0_0_1,
-    'GITHUB_TOKEN': GITHUB_TOKEN_SPEC_0_0_1,
-    'GOOGLE_OAUTH_CLIENT_ID': GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1,
-    'GOOGLE_OAUTH_CLIENT_SECRET': GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1,
-    'HF_TOKEN': HF_TOKEN_SPEC_0_0_1,
-    'KAGGLE_TOKEN': KAGGLE_TOKEN_SPEC_0_0_1,
-    'SLACK_BOT_TOKEN': SLACK_BOT_TOKEN_SPEC_0_0_1,
-    'SLACK_CHANNEL_IDS': SLACK_CHANNEL_IDS_SPEC_0_0_1,
-    'SLACK_TEAM_ID': SLACK_TEAM_ID_SPEC_0_0_1,
-    'TAVILY_API_KEY': TAVILY_API_KEY_SPEC_0_0_1,
+    ALPHAVANTAGE_API_KEY: ALPHAVANTAGE_API_KEY_SPEC_0_0_1,
+    GITHUB_TOKEN: GITHUB_TOKEN_SPEC_0_0_1,
+    GOOGLE_OAUTH_CLIENT_ID: GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1,
+    GOOGLE_OAUTH_CLIENT_SECRET: GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1,
+    HF_TOKEN: HF_TOKEN_SPEC_0_0_1,
+    KAGGLE_TOKEN: KAGGLE_TOKEN_SPEC_0_0_1,
+    SLACK_BOT_TOKEN: SLACK_BOT_TOKEN_SPEC_0_0_1,
+    SLACK_CHANNEL_IDS: SLACK_CHANNEL_IDS_SPEC_0_0_1,
+    SLACK_TEAM_ID: SLACK_TEAM_ID_SPEC_0_0_1,
+    TAVILY_API_KEY: TAVILY_API_KEY_SPEC_0_0_1,
 };
 function resolveEnvvarId(envvarId) {
     if (envvarId in ENVVAR_CATALOG)

package/lib/specs/evals.d.ts CHANGED Viewed

@@ -1,20 +1,21 @@
 /**
  * Eval Catalog
  *
- * Predefined evaluation benchmark configurations.
+ * Predefined built-in evaluator configurations.
  *
  * This file is AUTO-GENERATED from YAML specifications.
  * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.
  */
 import type { EvalSpec } from '../types';
-export declare const AGENTBENCH_EVAL_SPEC_0_0_1: EvalSpec;
-export declare const GPQA_DIAMOND_EVAL_SPEC_0_0_1: EvalSpec;
-export declare const HUMANEVAL_EVAL_SPEC_0_0_1: EvalSpec;
-export declare const MMLU_EVAL_SPEC_0_0_1: EvalSpec;
-export declare const SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1: EvalSpec;
-export declare const SWE_BENCH_EVAL_SPEC_0_0_1: EvalSpec;
-export declare const TOOLBENCH_EVAL_SPEC_0_0_1: EvalSpec;
-export declare const TRUTHFULQA_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const CONTAINS_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const EQUALS_EXPECTED_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const EQUALS_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const IS_INSTANCE_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const LLM_JUDGE_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const MAX_DURATION_EVAL_SPEC_0_0_1: EvalSpec;
+export declare const PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1: EvalSpec;
 export declare const EVAL_CATALOG: Record<string, EvalSpec>;
 export declare function getEvalSpecs(): EvalSpec[];
 export declare function getEvalSpec(evalId: string): EvalSpec | undefined;

package/lib/specs/evals.js CHANGED Viewed

@@ -5,114 +5,154 @@
 // ============================================================================
 // Eval Definitions
 // ============================================================================
-export const AGENTBENCH_EVAL_SPEC_0_0_1 = {
-    id: 'agentbench',
+export const CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1 = {
+    id: 'confusion-matrix-evaluator',
     version: '0.0.1',
-    name: 'AgentBench',
-    description: 'Multi-dimensional LLM-as-agent evaluation across 8 diverse environments including web browsing, operating system interaction, database queries, digital card games, lateral thinking, and household tasks.',
-    category: 'Agentic',
-    task_count: 4080,
-    metric: 'success_rate',
-    source: 'https://github.com/THUDM/AgentBench',
-    difficulty: 'hard',
-    languages: ['python', 'sql', 'bash'],
+    name: 'Confusion Matrix Evaluator',
+    description: 'Aggregate evaluator for precision/recall style confusion-matrix reporting.',
+    category: 'Report',
+    evaluator_type: 'report',
+    pydantic_class: 'ConfusionMatrixEvaluator',
+    output_kind: 'report_table',
+    cost_tier: 'free',
+    latency: 'fast',
+    requires: ['expected_output'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: {},
 };
-export const GPQA_DIAMOND_EVAL_SPEC_0_0_1 = {
-    id: 'gpqa-diamond',
+export const CONTAINS_EVAL_SPEC_0_0_1 = {
+    id: 'contains',
     version: '0.0.1',
-    name: 'GPQA Diamond',
-    description: 'Graduate-level science questions crafted by domain experts. Tests advanced reasoning in physics, chemistry, and biology with questions that require PhD-level understanding to answer correctly.',
-    category: 'Reasoning',
-    task_count: 448,
-    metric: 'accuracy',
-    source: 'https://github.com/idavidrein/gpqa',
-    difficulty: 'expert',
-    languages: ['english'],
+    name: 'Contains',
+    description: 'Assert that expected content appears in the model output.',
+    category: 'Comparison',
+    evaluator_type: 'case',
+    pydantic_class: 'ContainsEvaluator',
+    output_kind: 'boolean',
+    cost_tier: 'free',
+    latency: 'instant',
+    requires: ['expected_output'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: {},
 };
-export const HUMANEVAL_EVAL_SPEC_0_0_1 = {
-    id: 'humaneval',
+export const EQUALS_EXPECTED_EVAL_SPEC_0_0_1 = {
+    id: 'equals-expected',
     version: '0.0.1',
-    name: 'HumanEval',
-    description: 'Python function implementation from docstrings. Measures functional correctness of code generation by testing against hand-written test cases. Widely used as a baseline for code generation benchmarks.',
-    category: 'Coding',
-    task_count: 164,
-    metric: 'pass@k',
-    source: 'https://github.com/openai/human-eval',
-    difficulty: 'medium',
-    languages: ['python'],
+    name: 'Equals Expected',
+    description: 'Compare model output against an expected value with strict matching.',
+    category: 'Comparison',
+    evaluator_type: 'case',
+    pydantic_class: 'EqualsExpectedEvaluator',
+    output_kind: 'boolean',
+    cost_tier: 'free',
+    latency: 'instant',
+    requires: ['expected_output'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: {},
 };
-export const MMLU_EVAL_SPEC_0_0_1 = {
-    id: 'mmlu',
+export const EQUALS_EVAL_SPEC_0_0_1 = {
+    id: 'equals',
     version: '0.0.1',
-    name: 'MMLU',
-    description: 'Massive Multitask Language Understanding: 57-subject knowledge benchmark spanning STEM, humanities, social sciences, and more. Tests broad knowledge and reasoning across diverse academic domains.',
-    category: 'Knowledge',
-    task_count: 15908,
-    metric: 'accuracy',
-    source: 'https://github.com/hendrycks/test',
-    difficulty: 'medium',
-    languages: ['english'],
+    name: 'Equals',
+    description: 'Assert exact equality between expected and actual values.',
+    category: 'Comparison',
+    evaluator_type: 'case',
+    pydantic_class: 'EqualsEvaluator',
+    output_kind: 'boolean',
+    cost_tier: 'free',
+    latency: 'instant',
+    requires: ['expected_output'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: {},
 };
-export const SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1 = {
-    id: 'swe-bench-verified',
+export const HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1 = {
+    id: 'has-matching-span',
     version: '0.0.1',
-    name: 'SWE-bench Verified',
-    description: 'Human-validated subset of SWE-bench with verified ground-truth patches. Provides higher confidence evaluation of software engineering capabilities by eliminating ambiguous or flawed test cases from the full benchmark.',
-    category: 'Coding',
-    task_count: 500,
-    metric: 'pass@1',
-    source: 'https://www.swebench.com/',
-    difficulty: 'hard',
-    languages: ['python'],
+    name: 'Has Matching Span',
+    description: 'Validate expected spans in structured traces and tool-call transcripts.',
+    category: 'Span-Based',
+    evaluator_type: 'case',
+    pydantic_class: 'HasMatchingSpanEvaluator',
+    output_kind: 'boolean',
+    cost_tier: 'free',
+    latency: 'fast',
+    requires: ['trace'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: {},
 };
-export const SWE_BENCH_EVAL_SPEC_0_0_1 = {
-    id: 'swe-bench',
+export const IS_INSTANCE_EVAL_SPEC_0_0_1 = {
+    id: 'is-instance',
     version: '0.0.1',
-    name: 'SWE-bench',
-    description: 'Real-world software engineering tasks from GitHub issues. Tests an agent\'s ability to understand bug reports and feature requests, then produce working code patches that pass existing test suites.',
-    category: 'Coding',
-    task_count: 2294,
-    metric: 'pass@1',
-    source: 'https://www.swebench.com/',
-    difficulty: 'hard',
-    languages: ['python'],
+    name: 'Is Instance',
+    description: 'Validate output type against an expected Python/JSON schema type.',
+    category: 'Type Validation',
+    evaluator_type: 'case',
+    pydantic_class: 'IsInstanceEvaluator',
+    output_kind: 'boolean',
+    cost_tier: 'free',
+    latency: 'instant',
+    requires: ['expected_type'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: {},
 };
-export const TOOLBENCH_EVAL_SPEC_0_0_1 = {
-    id: 'toolbench',
+export const LLM_JUDGE_EVAL_SPEC_0_0_1 = {
+    id: 'llm-judge',
     version: '0.0.1',
-    name: 'ToolBench',
-    description: 'Large-scale benchmark for tool-augmented LLMs covering 16000+ real-world APIs across 49 categories. Evaluates multi-step tool usage, API selection, argument generation, and response parsing in complex, chained workflows.',
-    category: 'Agentic',
-    task_count: 12657,
-    metric: 'pass_rate',
-    source: 'https://github.com/OpenBMB/ToolBench',
-    difficulty: 'hard',
-    languages: ['python', 'json'],
+    name: 'LLM Judge',
+    description: 'Use an LLM-as-a-judge prompt to score quality and provide rationale.',
+    category: 'LLM-as-a-Judge',
+    evaluator_type: 'case',
+    pydantic_class: 'LLMJudgeEvaluator',
+    output_kind: 'score_and_assertion',
+    cost_tier: 'llm',
+    latency: 'slow',
+    requires: ['model'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: { threshold: 0.7 },
 };
-export const TRUTHFULQA_EVAL_SPEC_0_0_1 = {
-    id: 'truthfulqa',
+export const MAX_DURATION_EVAL_SPEC_0_0_1 = {
+    id: 'max-duration',
     version: '0.0.1',
-    name: 'TruthfulQA',
-    description: 'Benchmark measuring whether a language model generates truthful answers to questions spanning 38 categories including health, law, finance, and politics. Designed to test resilience against common human misconceptions and falsehoods that models may have learned from training data.',
-    category: 'Safety',
-    task_count: 817,
-    metric: 'truthful_informative',
-    source: 'https://github.com/sylinrl/TruthfulQA',
-    difficulty: 'medium',
-    languages: ['english'],
+    name: 'Max Duration',
+    description: 'Assert response latency remains below a configured duration threshold.',
+    category: 'Performance',
+    evaluator_type: 'case',
+    pydantic_class: 'MaxDurationEvaluator',
+    output_kind: 'boolean_with_reason',
+    cost_tier: 'free',
+    latency: 'instant',
+    requires: ['duration_ms'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: { max_duration_ms: 5000 },
+};
+export const PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1 = {
+    id: 'precision-recall-evaluator',
+    version: '0.0.1',
+    name: 'Precision Recall Evaluator',
+    description: 'Aggregate evaluator for precision, recall, and pass-rate style benchmark reporting.',
+    category: 'Report',
+    evaluator_type: 'report',
+    pydantic_class: 'PrecisionRecallEvaluator',
+    output_kind: 'report_curve',
+    cost_tier: 'free',
+    latency: 'fast',
+    requires: ['expected_output'],
+    source: 'https://ai.pydantic.dev/evals/',
+    default_config: {},
 };
 // ============================================================================
 // Eval Catalog
 // ============================================================================
 export const EVAL_CATALOG = {
-    'agentbench': AGENTBENCH_EVAL_SPEC_0_0_1,
-    'gpqa-diamond': GPQA_DIAMOND_EVAL_SPEC_0_0_1,
-    'humaneval': HUMANEVAL_EVAL_SPEC_0_0_1,
-    'mmlu': MMLU_EVAL_SPEC_0_0_1,
-    'swe-bench-verified': SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1,
-    'swe-bench': SWE_BENCH_EVAL_SPEC_0_0_1,
-    'toolbench': TOOLBENCH_EVAL_SPEC_0_0_1,
-    'truthfulqa': TRUTHFULQA_EVAL_SPEC_0_0_1,
+    'confusion-matrix-evaluator': CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1,
+    contains: CONTAINS_EVAL_SPEC_0_0_1,
+    'equals-expected': EQUALS_EXPECTED_EVAL_SPEC_0_0_1,
+    equals: EQUALS_EVAL_SPEC_0_0_1,
+    'has-matching-span': HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1,
+    'is-instance': IS_INSTANCE_EVAL_SPEC_0_0_1,
+    'llm-judge': LLM_JUDGE_EVAL_SPEC_0_0_1,
+    'max-duration': MAX_DURATION_EVAL_SPEC_0_0_1,
+    'precision-recall-evaluator': PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1,
 };
 export function getEvalSpecs() {
     return Object.values(EVAL_CATALOG);