@datalayer/agent-runtimes 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/README.md +157 -10
  2. package/lib/AgentNode.d.ts +3 -0
  3. package/lib/AgentNode.js +676 -0
  4. package/lib/agent-node/themeStore.d.ts +3 -0
  5. package/lib/agent-node/themeStore.js +156 -0
  6. package/lib/agent-node-main.d.ts +1 -0
  7. package/lib/agent-node-main.js +14 -0
  8. package/lib/chat/Chat.js +16 -10
  9. package/lib/chat/ChatFloating.js +1 -1
  10. package/lib/chat/ChatSidebar.js +81 -49
  11. package/lib/chat/base/ChatBase.js +388 -74
  12. package/lib/chat/display/FloatingBrandButton.js +8 -1
  13. package/lib/chat/header/ChatHeader.d.ts +3 -1
  14. package/lib/chat/header/ChatHeader.js +15 -12
  15. package/lib/chat/header/ChatHeaderBase.d.ts +29 -9
  16. package/lib/chat/header/ChatHeaderBase.js +26 -3
  17. package/lib/chat/indicators/SandboxStatusIndicator.js +82 -47
  18. package/lib/chat/messages/ChatMessageList.js +46 -1
  19. package/lib/chat/messages/ChatMessages.js +6 -2
  20. package/lib/chat/prompt/InputFooter.d.ts +3 -1
  21. package/lib/chat/prompt/InputFooter.js +8 -5
  22. package/lib/chat/prompt/InputPrompt.d.ts +3 -1
  23. package/lib/chat/prompt/InputPrompt.js +2 -2
  24. package/lib/chat/prompt/InputPromptFooter.d.ts +3 -1
  25. package/lib/chat/prompt/InputPromptFooter.js +3 -3
  26. package/lib/client/AgentsMixin.js +14 -0
  27. package/lib/config/AgentConfiguration.d.ts +22 -0
  28. package/lib/config/AgentConfiguration.js +319 -64
  29. package/lib/examples/AgUiSharedStateExample.js +2 -1
  30. package/lib/examples/AgentCheckpointsExample.js +3 -3
  31. package/lib/examples/AgentCodemodeExample.d.ts +3 -3
  32. package/lib/examples/AgentCodemodeExample.js +24 -12
  33. package/lib/examples/AgentEvalsExample.js +330 -40
  34. package/lib/examples/AgentGuardrailsExample.js +16 -5
  35. package/lib/examples/AgentHooksExample.js +27 -9
  36. package/lib/examples/AgentInferenceProviderExample.d.ts +3 -0
  37. package/lib/examples/AgentInferenceProviderExample.js +329 -0
  38. package/lib/examples/AgentMCPExample.js +6 -5
  39. package/lib/examples/AgentMemoryExample.d.ts +1 -2
  40. package/lib/examples/AgentMemoryExample.js +71 -22
  41. package/lib/examples/AgentMonitoringExample.js +5 -5
  42. package/lib/examples/AgentNotificationsExample.d.ts +1 -2
  43. package/lib/examples/AgentNotificationsExample.js +71 -22
  44. package/lib/examples/AgentOtelExample.js +31 -40
  45. package/lib/examples/AgentOutputsExample.d.ts +1 -1
  46. package/lib/examples/AgentOutputsExample.js +67 -16
  47. package/lib/examples/AgentParametersExample.js +10 -8
  48. package/lib/examples/AgentSandboxExample.d.ts +1 -1
  49. package/lib/examples/AgentSandboxExample.js +7 -6
  50. package/lib/examples/AgentSkillsExample.js +6 -6
  51. package/lib/examples/AgentSubagentsExample.d.ts +1 -1
  52. package/lib/examples/AgentSubagentsExample.js +6 -6
  53. package/lib/examples/AgentToolApprovalsExample.js +27 -11
  54. package/lib/examples/AgentTriggersExample.js +5 -5
  55. package/lib/examples/{AgentSpecsExample.d.ts → AgentspecsExample.d.ts} +2 -2
  56. package/lib/examples/AgentspecsExample.js +1096 -0
  57. package/lib/examples/ChatCustomExample.js +6 -5
  58. package/lib/examples/ChatExample.js +6 -5
  59. package/lib/examples/Lexical2Example.js +1 -1
  60. package/lib/examples/LexicalAgentExample.js +1 -1
  61. package/lib/examples/NotebookAgentExample.js +3 -3
  62. package/lib/examples/components/ExampleWrapper.d.ts +6 -7
  63. package/lib/examples/components/ExampleWrapper.js +27 -10
  64. package/lib/examples/example-selector.js +2 -1
  65. package/lib/examples/index.d.ts +2 -1
  66. package/lib/examples/index.js +2 -1
  67. package/lib/examples/lexical/initial-content.json +6 -6
  68. package/lib/examples/main.js +56 -16
  69. package/lib/examples/utils/agentId.d.ts +1 -1
  70. package/lib/examples/utils/agentId.js +1 -1
  71. package/lib/examples/utils/useExampleAgentRuntimesUrl.d.ts +5 -0
  72. package/lib/examples/utils/useExampleAgentRuntimesUrl.js +19 -0
  73. package/lib/hooks/useAIAgentsWebSocket.js +35 -0
  74. package/lib/hooks/useAgentRuntimes.d.ts +32 -3
  75. package/lib/hooks/useAgentRuntimes.js +114 -19
  76. package/lib/index.d.ts +1 -1
  77. package/lib/specs/agents/agents.d.ts +20 -13
  78. package/lib/specs/agents/agents.js +1267 -581
  79. package/lib/specs/benchmarks.d.ts +20 -0
  80. package/lib/specs/benchmarks.js +205 -0
  81. package/lib/specs/envvars.d.ts +0 -1
  82. package/lib/specs/envvars.js +0 -11
  83. package/lib/specs/evals.d.ts +10 -9
  84. package/lib/specs/evals.js +128 -88
  85. package/lib/specs/index.d.ts +0 -1
  86. package/lib/specs/index.js +0 -1
  87. package/lib/specs/models.d.ts +0 -2
  88. package/lib/specs/models.js +0 -15
  89. package/lib/specs/skills.d.ts +0 -1
  90. package/lib/specs/skills.js +0 -18
  91. package/lib/stores/agentRuntimeStore.d.ts +5 -1
  92. package/lib/stores/agentRuntimeStore.js +22 -8
  93. package/lib/stores/conversationStore.js +2 -2
  94. package/lib/types/agents-lifecycle.d.ts +18 -0
  95. package/lib/types/agents.d.ts +6 -0
  96. package/lib/types/agentspecs.d.ts +4 -0
  97. package/lib/types/benchmarks.d.ts +43 -0
  98. package/lib/types/benchmarks.js +5 -0
  99. package/lib/types/chat.d.ts +16 -0
  100. package/lib/types/evals.d.ts +26 -17
  101. package/lib/types/index.d.ts +1 -0
  102. package/lib/types/index.js +1 -0
  103. package/package.json +9 -5
  104. package/scripts/codegen/__pycache__/generate_agents.cpython-313.pyc +0 -0
  105. package/scripts/codegen/__pycache__/generate_benchmarks.cpython-313.pyc +0 -0
  106. package/scripts/codegen/__pycache__/generate_evals.cpython-313.pyc +0 -0
  107. package/scripts/codegen/generate_agents.py +89 -43
  108. package/scripts/codegen/generate_benchmarks.py +441 -0
  109. package/scripts/codegen/generate_evals.py +94 -16
  110. package/scripts/codegen/generate_events.py +0 -1
  111. package/lib/examples/AgentSpecsExample.js +0 -694
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Benchmark Catalog
3
+ *
4
+ * Predefined evaluation benchmark configurations.
5
+ *
6
+ * This file is AUTO-GENERATED from YAML specifications.
7
+ * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.
8
+ */
9
+ import type { BenchmarkSpec } from '../types';
10
+ export declare const AGENTBENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
11
+ export declare const GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
12
+ export declare const HUMANEVAL_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
13
+ export declare const MMLU_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
14
+ export declare const SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
15
+ export declare const SWE_BENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
16
+ export declare const TOOLBENCH_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
17
+ export declare const TRUTHFULQA_BENCHMARK_SPEC_0_0_1: BenchmarkSpec;
18
+ export declare const BENCHMARK_CATALOG: Record<string, BenchmarkSpec>;
19
+ export declare function getBenchmarkSpecs(): BenchmarkSpec[];
20
+ export declare function getBenchmarkSpec(benchmarkId: string): BenchmarkSpec | undefined;
@@ -0,0 +1,205 @@
1
+ /*
2
+ * Copyright (c) 2025-2026 Datalayer, Inc.
3
+ * Distributed under the terms of the Modified BSD License.
4
+ */
5
+ // ============================================================================
6
+ // Benchmark Definitions
7
+ // ============================================================================
8
+ export const AGENTBENCH_BENCHMARK_SPEC_0_0_1 = {
9
+ id: 'agentbench',
10
+ version: '0.0.1',
11
+ name: 'AgentBench',
12
+ description: 'Multi-dimensional LLM-as-agent evaluation across 8 diverse environments including web browsing, operating system interaction, database queries, digital card games, lateral thinking, and household tasks.',
13
+ category: 'Agentic',
14
+ task_count: 4080,
15
+ metric: 'success_rate',
16
+ source: 'https://github.com/THUDM/AgentBench',
17
+ difficulty: 'hard',
18
+ languages: ['python', 'sql', 'bash'],
19
+ dataset_source: 'hosted',
20
+ supports_live_monitoring: true,
21
+ supports_experiment_comparison: true,
22
+ evaluator_shapes: ['pass_rate', 'numeric'],
23
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
24
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
25
+ trace_integration: true,
26
+ dataset_editability: 'read-only',
27
+ sdk_support: 'experimental',
28
+ };
29
+ export const GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1 = {
30
+ id: 'gpqa-diamond',
31
+ version: '0.0.1',
32
+ name: 'GPQA Diamond',
33
+ description: 'Graduate-level science questions crafted by domain experts. Tests advanced reasoning in physics, chemistry, and biology with questions that require PhD-level understanding to answer correctly.',
34
+ category: 'Knowledge',
35
+ task_count: 448,
36
+ metric: 'accuracy',
37
+ source: 'https://github.com/idavidrein/gpqa',
38
+ difficulty: 'expert',
39
+ languages: ['english'],
40
+ dataset_source: 'hosted',
41
+ supports_live_monitoring: false,
42
+ supports_experiment_comparison: true,
43
+ evaluator_shapes: ['numeric'],
44
+ evaluators: ['precision-recall-evaluator:0.0.1'],
45
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
46
+ trace_integration: true,
47
+ dataset_editability: 'read-only',
48
+ sdk_support: 'experimental',
49
+ };
50
+ export const HUMANEVAL_BENCHMARK_SPEC_0_0_1 = {
51
+ id: 'humaneval',
52
+ version: '0.0.1',
53
+ name: 'HumanEval',
54
+ description: 'Python function implementation from docstrings. Measures functional correctness of code generation by testing against hand-written test cases. Widely used as a baseline for code generation benchmarks.',
55
+ category: 'Coding',
56
+ task_count: 164,
57
+ metric: 'pass@k',
58
+ source: 'https://github.com/openai/human-eval',
59
+ difficulty: 'medium',
60
+ languages: ['python'],
61
+ dataset_source: 'hosted',
62
+ supports_live_monitoring: false,
63
+ supports_experiment_comparison: true,
64
+ evaluator_shapes: ['pass_rate'],
65
+ evaluators: ['precision-recall-evaluator:0.0.1'],
66
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
67
+ trace_integration: true,
68
+ dataset_editability: 'read-only',
69
+ sdk_support: 'experimental',
70
+ };
71
+ export const MMLU_BENCHMARK_SPEC_0_0_1 = {
72
+ id: 'mmlu',
73
+ version: '0.0.1',
74
+ name: 'MMLU',
75
+ description: 'Massive Multitask Language Understanding: 57-subject knowledge benchmark spanning STEM, humanities, social sciences, and more. Tests broad knowledge and reasoning across diverse academic domains.',
76
+ category: 'Knowledge',
77
+ task_count: 15908,
78
+ metric: 'accuracy',
79
+ source: 'https://github.com/hendrycks/test',
80
+ difficulty: 'medium',
81
+ languages: ['english'],
82
+ dataset_source: 'hosted',
83
+ supports_live_monitoring: false,
84
+ supports_experiment_comparison: true,
85
+ evaluator_shapes: ['numeric'],
86
+ evaluators: ['precision-recall-evaluator:0.0.1'],
87
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
88
+ trace_integration: true,
89
+ dataset_editability: 'read-only',
90
+ sdk_support: 'experimental',
91
+ };
92
+ export const SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1 = {
93
+ id: 'swe-bench-verified',
94
+ version: '0.0.1',
95
+ name: 'SWE-bench Verified',
96
+ description: 'Human-validated subset of SWE-bench with verified ground-truth patches. Provides higher confidence evaluation of software engineering capabilities by eliminating ambiguous or flawed test cases from the full benchmark.',
97
+ category: 'Coding',
98
+ task_count: 500,
99
+ metric: 'pass@1',
100
+ source: 'https://www.swebench.com/',
101
+ difficulty: 'hard',
102
+ languages: ['python'],
103
+ dataset_source: 'hosted',
104
+ supports_live_monitoring: true,
105
+ supports_experiment_comparison: true,
106
+ evaluator_shapes: ['pass_rate'],
107
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
108
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
109
+ trace_integration: true,
110
+ dataset_editability: 'read-only',
111
+ sdk_support: 'experimental',
112
+ };
113
+ export const SWE_BENCH_BENCHMARK_SPEC_0_0_1 = {
114
+ id: 'swe-bench',
115
+ version: '0.0.1',
116
+ name: 'SWE-bench',
117
+ description: "Real-world software engineering tasks from GitHub issues. Tests an agent's ability to understand bug reports and feature requests, then produce working code patches that pass existing test suites.",
118
+ category: 'Coding',
119
+ task_count: 2294,
120
+ metric: 'pass@1',
121
+ source: 'https://www.swebench.com/',
122
+ difficulty: 'hard',
123
+ languages: ['python'],
124
+ dataset_source: 'hosted',
125
+ supports_live_monitoring: true,
126
+ supports_experiment_comparison: true,
127
+ evaluator_shapes: ['pass_rate'],
128
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
129
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
130
+ trace_integration: true,
131
+ dataset_editability: 'read-only',
132
+ sdk_support: 'experimental',
133
+ };
134
+ export const TOOLBENCH_BENCHMARK_SPEC_0_0_1 = {
135
+ id: 'toolbench',
136
+ version: '0.0.1',
137
+ name: 'ToolBench',
138
+ description: 'Large-scale benchmark for tool-augmented LLMs covering 16000+ real-world APIs across 49 categories. Evaluates multi-step tool usage, API selection, argument generation, and response parsing in complex, chained workflows.',
139
+ category: 'Agentic',
140
+ task_count: 12657,
141
+ metric: 'pass_rate',
142
+ source: 'https://github.com/OpenBMB/ToolBench',
143
+ difficulty: 'hard',
144
+ languages: ['python', 'json'],
145
+ dataset_source: 'hosted',
146
+ supports_live_monitoring: true,
147
+ supports_experiment_comparison: true,
148
+ evaluator_shapes: ['pass_rate', 'numeric'],
149
+ evaluators: ['precision-recall-evaluator:0.0.1', 'llm-judge:0.0.1'],
150
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
151
+ trace_integration: true,
152
+ dataset_editability: 'read-only',
153
+ sdk_support: 'experimental',
154
+ };
155
+ export const TRUTHFULQA_BENCHMARK_SPEC_0_0_1 = {
156
+ id: 'truthfulqa',
157
+ version: '0.0.1',
158
+ name: 'TruthfulQA',
159
+ description: 'Benchmark measuring whether a language model generates truthful answers to questions spanning 38 categories including health, law, finance, and politics. Designed to test resilience against common human misconceptions and falsehoods that models may have learned from training data.',
160
+ category: 'Safety',
161
+ task_count: 817,
162
+ metric: 'truthful_informative',
163
+ source: 'https://github.com/sylinrl/TruthfulQA',
164
+ difficulty: 'medium',
165
+ languages: ['english'],
166
+ dataset_source: 'hosted',
167
+ supports_live_monitoring: false,
168
+ supports_experiment_comparison: true,
169
+ evaluator_shapes: ['categorical', 'numeric'],
170
+ evaluators: ['llm-judge:0.0.1'],
171
+ recommended_windows: ['1h', '6h', '24h', '7d', '30d'],
172
+ trace_integration: true,
173
+ dataset_editability: 'read-only',
174
+ sdk_support: 'experimental',
175
+ };
176
+ // ============================================================================
177
+ // Benchmark Catalog
178
+ // ============================================================================
179
+ export const BENCHMARK_CATALOG = {
180
+ agentbench: AGENTBENCH_BENCHMARK_SPEC_0_0_1,
181
+ 'gpqa-diamond': GPQA_DIAMOND_BENCHMARK_SPEC_0_0_1,
182
+ humaneval: HUMANEVAL_BENCHMARK_SPEC_0_0_1,
183
+ mmlu: MMLU_BENCHMARK_SPEC_0_0_1,
184
+ 'swe-bench-verified': SWE_BENCH_VERIFIED_BENCHMARK_SPEC_0_0_1,
185
+ 'swe-bench': SWE_BENCH_BENCHMARK_SPEC_0_0_1,
186
+ toolbench: TOOLBENCH_BENCHMARK_SPEC_0_0_1,
187
+ truthfulqa: TRUTHFULQA_BENCHMARK_SPEC_0_0_1,
188
+ };
189
+ export function getBenchmarkSpecs() {
190
+ return Object.values(BENCHMARK_CATALOG);
191
+ }
192
+ function resolveBenchmarkId(benchmarkId) {
193
+ if (benchmarkId in BENCHMARK_CATALOG)
194
+ return benchmarkId;
195
+ const idx = benchmarkId.lastIndexOf(':');
196
+ if (idx > 0) {
197
+ const base = benchmarkId.slice(0, idx);
198
+ if (base in BENCHMARK_CATALOG)
199
+ return base;
200
+ }
201
+ return benchmarkId;
202
+ }
203
+ export function getBenchmarkSpec(benchmarkId) {
204
+ return BENCHMARK_CATALOG[resolveBenchmarkId(benchmarkId)];
205
+ }
@@ -8,7 +8,6 @@
8
8
  */
9
9
  import type { EnvvarSpec } from '../types';
10
10
  export declare const ALPHAVANTAGE_API_KEY_SPEC_0_0_1: EnvvarSpec;
11
- export declare const DATALAYER_API_KEY_SPEC_0_0_1: EnvvarSpec;
12
11
  export declare const GITHUB_TOKEN_SPEC_0_0_1: EnvvarSpec;
13
12
  export declare const GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1: EnvvarSpec;
14
13
  export declare const GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1: EnvvarSpec;
@@ -15,16 +15,6 @@ export const ALPHAVANTAGE_API_KEY_SPEC_0_0_1 = {
15
15
  icon: 'key',
16
16
  emoji: '🔑',
17
17
  };
18
- export const DATALAYER_API_KEY_SPEC_0_0_1 = {
19
- id: 'DATALAYER_API_KEY',
20
- version: '0.0.1',
21
- name: 'Datalayer API Key',
22
- description: 'API key for authenticating with Datalayer services, including datalayer-skills such as datalayer-whoami.',
23
- registrationUrl: 'https://datalayer.app',
24
- tags: ['authentication', 'api-key', 'datalayer', 'skill'],
25
- icon: 'key',
26
- emoji: '🔑',
27
- };
28
18
  export const GITHUB_TOKEN_SPEC_0_0_1 = {
29
19
  id: 'GITHUB_TOKEN',
30
20
  version: '0.0.1',
@@ -126,7 +116,6 @@ export const TAVILY_API_KEY_SPEC_0_0_1 = {
126
116
  // ============================================================================
127
117
  export const ENVVAR_CATALOG = {
128
118
  ALPHAVANTAGE_API_KEY: ALPHAVANTAGE_API_KEY_SPEC_0_0_1,
129
- DATALAYER_API_KEY: DATALAYER_API_KEY_SPEC_0_0_1,
130
119
  GITHUB_TOKEN: GITHUB_TOKEN_SPEC_0_0_1,
131
120
  GOOGLE_OAUTH_CLIENT_ID: GOOGLE_OAUTH_CLIENT_ID_SPEC_0_0_1,
132
121
  GOOGLE_OAUTH_CLIENT_SECRET: GOOGLE_OAUTH_CLIENT_SECRET_SPEC_0_0_1,
@@ -1,20 +1,21 @@
1
1
  /**
2
2
  * Eval Catalog
3
3
  *
4
- * Predefined evaluation benchmark configurations.
4
+ * Predefined built-in evaluator configurations.
5
5
  *
6
6
  * This file is AUTO-GENERATED from YAML specifications.
7
7
  * DO NOT EDIT MANUALLY - run 'make specs' to regenerate.
8
8
  */
9
9
  import type { EvalSpec } from '../types';
10
- export declare const AGENTBENCH_EVAL_SPEC_0_0_1: EvalSpec;
11
- export declare const GPQA_DIAMOND_EVAL_SPEC_0_0_1: EvalSpec;
12
- export declare const HUMANEVAL_EVAL_SPEC_0_0_1: EvalSpec;
13
- export declare const MMLU_EVAL_SPEC_0_0_1: EvalSpec;
14
- export declare const SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1: EvalSpec;
15
- export declare const SWE_BENCH_EVAL_SPEC_0_0_1: EvalSpec;
16
- export declare const TOOLBENCH_EVAL_SPEC_0_0_1: EvalSpec;
17
- export declare const TRUTHFULQA_EVAL_SPEC_0_0_1: EvalSpec;
10
+ export declare const CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1: EvalSpec;
11
+ export declare const CONTAINS_EVAL_SPEC_0_0_1: EvalSpec;
12
+ export declare const EQUALS_EXPECTED_EVAL_SPEC_0_0_1: EvalSpec;
13
+ export declare const EQUALS_EVAL_SPEC_0_0_1: EvalSpec;
14
+ export declare const HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1: EvalSpec;
15
+ export declare const IS_INSTANCE_EVAL_SPEC_0_0_1: EvalSpec;
16
+ export declare const LLM_JUDGE_EVAL_SPEC_0_0_1: EvalSpec;
17
+ export declare const MAX_DURATION_EVAL_SPEC_0_0_1: EvalSpec;
18
+ export declare const PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1: EvalSpec;
18
19
  export declare const EVAL_CATALOG: Record<string, EvalSpec>;
19
20
  export declare function getEvalSpecs(): EvalSpec[];
20
21
  export declare function getEvalSpec(evalId: string): EvalSpec | undefined;
@@ -5,114 +5,154 @@
5
5
  // ============================================================================
6
6
  // Eval Definitions
7
7
  // ============================================================================
8
- export const AGENTBENCH_EVAL_SPEC_0_0_1 = {
9
- id: 'agentbench',
8
+ export const CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1 = {
9
+ id: 'confusion-matrix-evaluator',
10
10
  version: '0.0.1',
11
- name: 'AgentBench',
12
- description: 'Multi-dimensional LLM-as-agent evaluation across 8 diverse environments including web browsing, operating system interaction, database queries, digital card games, lateral thinking, and household tasks.',
13
- category: 'Agentic',
14
- task_count: 4080,
15
- metric: 'success_rate',
16
- source: 'https://github.com/THUDM/AgentBench',
17
- difficulty: 'hard',
18
- languages: ['python', 'sql', 'bash'],
11
+ name: 'Confusion Matrix Evaluator',
12
+ description: 'Aggregate evaluator for precision/recall style confusion-matrix reporting.',
13
+ category: 'Report',
14
+ evaluator_type: 'report',
15
+ pydantic_class: 'ConfusionMatrixEvaluator',
16
+ output_kind: 'report_table',
17
+ cost_tier: 'free',
18
+ latency: 'fast',
19
+ requires: ['expected_output'],
20
+ source: 'https://ai.pydantic.dev/evals/',
21
+ default_config: {},
19
22
  };
20
- export const GPQA_DIAMOND_EVAL_SPEC_0_0_1 = {
21
- id: 'gpqa-diamond',
23
+ export const CONTAINS_EVAL_SPEC_0_0_1 = {
24
+ id: 'contains',
22
25
  version: '0.0.1',
23
- name: 'GPQA Diamond',
24
- description: 'Graduate-level science questions crafted by domain experts. Tests advanced reasoning in physics, chemistry, and biology with questions that require PhD-level understanding to answer correctly.',
25
- category: 'Reasoning',
26
- task_count: 448,
27
- metric: 'accuracy',
28
- source: 'https://github.com/idavidrein/gpqa',
29
- difficulty: 'expert',
30
- languages: ['english'],
26
+ name: 'Contains',
27
+ description: 'Assert that expected content appears in the model output.',
28
+ category: 'Comparison',
29
+ evaluator_type: 'case',
30
+ pydantic_class: 'ContainsEvaluator',
31
+ output_kind: 'boolean',
32
+ cost_tier: 'free',
33
+ latency: 'instant',
34
+ requires: ['expected_output'],
35
+ source: 'https://ai.pydantic.dev/evals/',
36
+ default_config: {},
31
37
  };
32
- export const HUMANEVAL_EVAL_SPEC_0_0_1 = {
33
- id: 'humaneval',
38
+ export const EQUALS_EXPECTED_EVAL_SPEC_0_0_1 = {
39
+ id: 'equals-expected',
34
40
  version: '0.0.1',
35
- name: 'HumanEval',
36
- description: 'Python function implementation from docstrings. Measures functional correctness of code generation by testing against hand-written test cases. Widely used as a baseline for code generation benchmarks.',
37
- category: 'Coding',
38
- task_count: 164,
39
- metric: 'pass@k',
40
- source: 'https://github.com/openai/human-eval',
41
- difficulty: 'medium',
42
- languages: ['python'],
41
+ name: 'Equals Expected',
42
+ description: 'Compare model output against an expected value with strict matching.',
43
+ category: 'Comparison',
44
+ evaluator_type: 'case',
45
+ pydantic_class: 'EqualsExpectedEvaluator',
46
+ output_kind: 'boolean',
47
+ cost_tier: 'free',
48
+ latency: 'instant',
49
+ requires: ['expected_output'],
50
+ source: 'https://ai.pydantic.dev/evals/',
51
+ default_config: {},
43
52
  };
44
- export const MMLU_EVAL_SPEC_0_0_1 = {
45
- id: 'mmlu',
53
+ export const EQUALS_EVAL_SPEC_0_0_1 = {
54
+ id: 'equals',
46
55
  version: '0.0.1',
47
- name: 'MMLU',
48
- description: 'Massive Multitask Language Understanding: 57-subject knowledge benchmark spanning STEM, humanities, social sciences, and more. Tests broad knowledge and reasoning across diverse academic domains.',
49
- category: 'Knowledge',
50
- task_count: 15908,
51
- metric: 'accuracy',
52
- source: 'https://github.com/hendrycks/test',
53
- difficulty: 'medium',
54
- languages: ['english'],
56
+ name: 'Equals',
57
+ description: 'Assert exact equality between expected and actual values.',
58
+ category: 'Comparison',
59
+ evaluator_type: 'case',
60
+ pydantic_class: 'EqualsEvaluator',
61
+ output_kind: 'boolean',
62
+ cost_tier: 'free',
63
+ latency: 'instant',
64
+ requires: ['expected_output'],
65
+ source: 'https://ai.pydantic.dev/evals/',
66
+ default_config: {},
55
67
  };
56
- export const SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1 = {
57
- id: 'swe-bench-verified',
68
+ export const HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1 = {
69
+ id: 'has-matching-span',
58
70
  version: '0.0.1',
59
- name: 'SWE-bench Verified',
60
- description: 'Human-validated subset of SWE-bench with verified ground-truth patches. Provides higher confidence evaluation of software engineering capabilities by eliminating ambiguous or flawed test cases from the full benchmark.',
61
- category: 'Coding',
62
- task_count: 500,
63
- metric: 'pass@1',
64
- source: 'https://www.swebench.com/',
65
- difficulty: 'hard',
66
- languages: ['python'],
71
+ name: 'Has Matching Span',
72
+ description: 'Validate expected spans in structured traces and tool-call transcripts.',
73
+ category: 'Span-Based',
74
+ evaluator_type: 'case',
75
+ pydantic_class: 'HasMatchingSpanEvaluator',
76
+ output_kind: 'boolean',
77
+ cost_tier: 'free',
78
+ latency: 'fast',
79
+ requires: ['trace'],
80
+ source: 'https://ai.pydantic.dev/evals/',
81
+ default_config: {},
67
82
  };
68
- export const SWE_BENCH_EVAL_SPEC_0_0_1 = {
69
- id: 'swe-bench',
83
+ export const IS_INSTANCE_EVAL_SPEC_0_0_1 = {
84
+ id: 'is-instance',
70
85
  version: '0.0.1',
71
- name: 'SWE-bench',
72
- description: "Real-world software engineering tasks from GitHub issues. Tests an agent's ability to understand bug reports and feature requests, then produce working code patches that pass existing test suites.",
73
- category: 'Coding',
74
- task_count: 2294,
75
- metric: 'pass@1',
76
- source: 'https://www.swebench.com/',
77
- difficulty: 'hard',
78
- languages: ['python'],
86
+ name: 'Is Instance',
87
+ description: 'Validate output type against an expected Python/JSON schema type.',
88
+ category: 'Type Validation',
89
+ evaluator_type: 'case',
90
+ pydantic_class: 'IsInstanceEvaluator',
91
+ output_kind: 'boolean',
92
+ cost_tier: 'free',
93
+ latency: 'instant',
94
+ requires: ['expected_type'],
95
+ source: 'https://ai.pydantic.dev/evals/',
96
+ default_config: {},
79
97
  };
80
- export const TOOLBENCH_EVAL_SPEC_0_0_1 = {
81
- id: 'toolbench',
98
+ export const LLM_JUDGE_EVAL_SPEC_0_0_1 = {
99
+ id: 'llm-judge',
82
100
  version: '0.0.1',
83
- name: 'ToolBench',
84
- description: 'Large-scale benchmark for tool-augmented LLMs covering 16000+ real-world APIs across 49 categories. Evaluates multi-step tool usage, API selection, argument generation, and response parsing in complex, chained workflows.',
85
- category: 'Agentic',
86
- task_count: 12657,
87
- metric: 'pass_rate',
88
- source: 'https://github.com/OpenBMB/ToolBench',
89
- difficulty: 'hard',
90
- languages: ['python', 'json'],
101
+ name: 'LLM Judge',
102
+ description: 'Use an LLM-as-a-judge prompt to score quality and provide rationale.',
103
+ category: 'LLM-as-a-Judge',
104
+ evaluator_type: 'case',
105
+ pydantic_class: 'LLMJudgeEvaluator',
106
+ output_kind: 'score_and_assertion',
107
+ cost_tier: 'llm',
108
+ latency: 'slow',
109
+ requires: ['model'],
110
+ source: 'https://ai.pydantic.dev/evals/',
111
+ default_config: { threshold: 0.7 },
91
112
  };
92
- export const TRUTHFULQA_EVAL_SPEC_0_0_1 = {
93
- id: 'truthfulqa',
113
+ export const MAX_DURATION_EVAL_SPEC_0_0_1 = {
114
+ id: 'max-duration',
94
115
  version: '0.0.1',
95
- name: 'TruthfulQA',
96
- description: 'Benchmark measuring whether a language model generates truthful answers to questions spanning 38 categories including health, law, finance, and politics. Designed to test resilience against common human misconceptions and falsehoods that models may have learned from training data.',
97
- category: 'Safety',
98
- task_count: 817,
99
- metric: 'truthful_informative',
100
- source: 'https://github.com/sylinrl/TruthfulQA',
101
- difficulty: 'medium',
102
- languages: ['english'],
116
+ name: 'Max Duration',
117
+ description: 'Assert response latency remains below a configured duration threshold.',
118
+ category: 'Performance',
119
+ evaluator_type: 'case',
120
+ pydantic_class: 'MaxDurationEvaluator',
121
+ output_kind: 'boolean_with_reason',
122
+ cost_tier: 'free',
123
+ latency: 'instant',
124
+ requires: ['duration_ms'],
125
+ source: 'https://ai.pydantic.dev/evals/',
126
+ default_config: { max_duration_ms: 5000 },
127
+ };
128
+ export const PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1 = {
129
+ id: 'precision-recall-evaluator',
130
+ version: '0.0.1',
131
+ name: 'Precision Recall Evaluator',
132
+ description: 'Aggregate evaluator for precision, recall, and pass-rate style benchmark reporting.',
133
+ category: 'Report',
134
+ evaluator_type: 'report',
135
+ pydantic_class: 'PrecisionRecallEvaluator',
136
+ output_kind: 'report_curve',
137
+ cost_tier: 'free',
138
+ latency: 'fast',
139
+ requires: ['expected_output'],
140
+ source: 'https://ai.pydantic.dev/evals/',
141
+ default_config: {},
103
142
  };
104
143
  // ============================================================================
105
144
  // Eval Catalog
106
145
  // ============================================================================
107
146
  export const EVAL_CATALOG = {
108
- agentbench: AGENTBENCH_EVAL_SPEC_0_0_1,
109
- 'gpqa-diamond': GPQA_DIAMOND_EVAL_SPEC_0_0_1,
110
- humaneval: HUMANEVAL_EVAL_SPEC_0_0_1,
111
- mmlu: MMLU_EVAL_SPEC_0_0_1,
112
- 'swe-bench-verified': SWE_BENCH_VERIFIED_EVAL_SPEC_0_0_1,
113
- 'swe-bench': SWE_BENCH_EVAL_SPEC_0_0_1,
114
- toolbench: TOOLBENCH_EVAL_SPEC_0_0_1,
115
- truthfulqa: TRUTHFULQA_EVAL_SPEC_0_0_1,
147
+ 'confusion-matrix-evaluator': CONFUSION_MATRIX_EVALUATOR_EVAL_SPEC_0_0_1,
148
+ contains: CONTAINS_EVAL_SPEC_0_0_1,
149
+ 'equals-expected': EQUALS_EXPECTED_EVAL_SPEC_0_0_1,
150
+ equals: EQUALS_EVAL_SPEC_0_0_1,
151
+ 'has-matching-span': HAS_MATCHING_SPAN_EVAL_SPEC_0_0_1,
152
+ 'is-instance': IS_INSTANCE_EVAL_SPEC_0_0_1,
153
+ 'llm-judge': LLM_JUDGE_EVAL_SPEC_0_0_1,
154
+ 'max-duration': MAX_DURATION_EVAL_SPEC_0_0_1,
155
+ 'precision-recall-evaluator': PRECISION_RECALL_EVALUATOR_EVAL_SPEC_0_0_1,
116
156
  };
117
157
  export function getEvalSpecs() {
118
158
  return Object.values(EVAL_CATALOG);
@@ -13,7 +13,6 @@ export * from './memory';
13
13
  export * from './models';
14
14
  export * from './notifications';
15
15
  export * from './outputs';
16
- export * from './personas';
17
16
  export * from './skills';
18
17
  export * from './tools';
19
18
  export * from './triggers';
@@ -17,7 +17,6 @@ export * from './memory';
17
17
  export * from './models';
18
18
  export * from './notifications';
19
19
  export * from './outputs';
20
- export * from './personas';
21
20
  export * from './skills';
22
21
  export * from './tools';
23
22
  export * from './triggers';
@@ -17,7 +17,6 @@ export declare const AIModels: {
17
17
  readonly AZURE_OPENAI_GPT_4_1: "azure-openai:gpt-4.1";
18
18
  readonly AZURE_OPENAI_GPT_4O_MINI: "azure-openai:gpt-4o-mini";
19
19
  readonly AZURE_OPENAI_GPT_4O: "azure-openai:gpt-4o";
20
- readonly BEDROCK_US_ANTHROPIC_CLAUDE_3_5_HAIKU_20241022_V1_0: "bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0";
21
20
  readonly BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_6_V1: "bedrock:us.anthropic.claude-opus-4-6-v1";
22
21
  readonly BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_20250514_V1_0: "bedrock:us.anthropic.claude-opus-4-20250514-v1:0";
23
22
  readonly BEDROCK_US_ANTHROPIC_CLAUDE_SONNET_4_5_20250929_V1_0: "bedrock:us.anthropic.claude-sonnet-4-5-20250929-v1:0";
@@ -39,7 +38,6 @@ export declare const AZURE_OPENAI_GPT_4_1_NANO_0_0_1: AIModel;
39
38
  export declare const AZURE_OPENAI_GPT_4_1_0_0_1: AIModel;
40
39
  export declare const AZURE_OPENAI_GPT_4O_MINI_0_0_1: AIModel;
41
40
  export declare const AZURE_OPENAI_GPT_4O_0_0_1: AIModel;
42
- export declare const BEDROCK_US_ANTHROPIC_CLAUDE_3_5_HAIKU_20241022_V1_0_0_0_1: AIModel;
43
41
  export declare const BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_6_V1_0_0_1: AIModel;
44
42
  export declare const BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_20250514_V1_0_0_0_1: AIModel;
45
43
  export declare const BEDROCK_US_ANTHROPIC_CLAUDE_SONNET_4_5_20250929_V1_0_0_0_1: AIModel;
@@ -15,7 +15,6 @@ export const AIModels = {
15
15
  AZURE_OPENAI_GPT_4_1: 'azure-openai:gpt-4.1',
16
16
  AZURE_OPENAI_GPT_4O_MINI: 'azure-openai:gpt-4o-mini',
17
17
  AZURE_OPENAI_GPT_4O: 'azure-openai:gpt-4o',
18
- BEDROCK_US_ANTHROPIC_CLAUDE_3_5_HAIKU_20241022_V1_0: 'bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0',
19
18
  BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_6_V1: 'bedrock:us.anthropic.claude-opus-4-6-v1',
20
19
  BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_20250514_V1_0: 'bedrock:us.anthropic.claude-opus-4-20250514-v1:0',
21
20
  BEDROCK_US_ANTHROPIC_CLAUDE_SONNET_4_5_20250929_V1_0: 'bedrock:us.anthropic.claude-sonnet-4-5-20250929-v1:0',
@@ -111,19 +110,6 @@ export const AZURE_OPENAI_GPT_4O_0_0_1 = {
111
110
  default: false,
112
111
  requiredEnvVars: ['AZURE_OPENAI_API_KEY', 'AZURE_OPENAI_ENDPOINT'],
113
112
  };
114
- export const BEDROCK_US_ANTHROPIC_CLAUDE_3_5_HAIKU_20241022_V1_0_0_0_1 = {
115
- id: 'bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0',
116
- version: '0.0.1',
117
- name: 'Bedrock Claude Haiku 3.5',
118
- description: 'Claude Haiku 3.5 via AWS Bedrock - fast and efficient',
119
- provider: 'bedrock',
120
- default: false,
121
- requiredEnvVars: [
122
- 'AWS_ACCESS_KEY_ID',
123
- 'AWS_SECRET_ACCESS_KEY',
124
- 'AWS_DEFAULT_REGION',
125
- ],
126
- };
127
113
  export const BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_6_V1_0_0_1 = {
128
114
  id: 'bedrock:us.anthropic.claude-opus-4-6-v1',
129
115
  version: '0.0.1',
@@ -243,7 +229,6 @@ export const AI_MODEL_CATALOGUE = {
243
229
  'azure-openai:gpt-4.1': AZURE_OPENAI_GPT_4_1_0_0_1,
244
230
  'azure-openai:gpt-4o-mini': AZURE_OPENAI_GPT_4O_MINI_0_0_1,
245
231
  'azure-openai:gpt-4o': AZURE_OPENAI_GPT_4O_0_0_1,
246
- 'bedrock:us.anthropic.claude-3-5-haiku-20241022-v1:0': BEDROCK_US_ANTHROPIC_CLAUDE_3_5_HAIKU_20241022_V1_0_0_0_1,
247
232
  'bedrock:us.anthropic.claude-opus-4-6-v1': BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_6_V1_0_0_1,
248
233
  'bedrock:us.anthropic.claude-opus-4-20250514-v1:0': BEDROCK_US_ANTHROPIC_CLAUDE_OPUS_4_20250514_V1_0_0_0_1,
249
234
  'bedrock:us.anthropic.claude-sonnet-4-5-20250929-v1:0': BEDROCK_US_ANTHROPIC_CLAUDE_SONNET_4_5_20250929_V1_0_0_0_1,
@@ -8,7 +8,6 @@
8
8
  */
9
9
  import type { SkillSpec } from '../types';
10
10
  export declare const CRAWL_SKILL_SPEC_0_0_1: SkillSpec;
11
- export declare const DATALAYER_WHOAMI_SKILL_SPEC_0_0_1: SkillSpec;
12
11
  export declare const EVENTS_SKILL_SPEC_0_0_1: SkillSpec;
13
12
  export declare const GITHUB_SKILL_SPEC_0_0_1: SkillSpec;
14
13
  export declare const JOKES_SKILL_SPEC_0_0_1: SkillSpec;
@@ -22,23 +22,6 @@ export const CRAWL_SKILL_SPEC_0_0_1 = {
22
22
  emoji: '🌐',
23
23
  enabled: true,
24
24
  };
25
- export const DATALAYER_WHOAMI_SKILL_SPEC_0_0_1 = {
26
- id: 'datalayer-whoami',
27
- version: '0.0.1',
28
- name: 'Datalayer Whoami Skill',
29
- description: 'Retrieve the authenticated Datalayer user profile using the datalayer-skills package.',
30
- module: 'datalayer_skills.skills.whoami',
31
- package: undefined,
32
- method: undefined,
33
- path: undefined,
34
- requiredEnvVars: ['DATALAYER_API_KEY:0.0.1'],
35
- optionalEnvVars: [],
36
- dependencies: ['datalayer_skills>=0.1.0'],
37
- tags: ['datalayer', 'iam', 'identity', 'profile'],
38
- icon: 'person',
39
- emoji: '👤',
40
- enabled: true,
41
- };
42
25
  export const EVENTS_SKILL_SPEC_0_0_1 = {
43
26
  id: 'events',
44
27
  version: '0.0.1',
@@ -129,7 +112,6 @@ export const TEXT_SUMMARIZER_SKILL_SPEC_0_0_1 = {
129
112
  // ============================================================================
130
113
  export const SKILLS_CATALOG = {
131
114
  crawl: CRAWL_SKILL_SPEC_0_0_1,
132
- 'datalayer-whoami': DATALAYER_WHOAMI_SKILL_SPEC_0_0_1,
133
115
  events: EVENTS_SKILL_SPEC_0_0_1,
134
116
  github: GITHUB_SKILL_SPEC_0_0_1,
135
117
  jokes: JOKES_SKILL_SPEC_0_0_1,