kagent-ts 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +68 -21
- package/README.md +27 -371
- package/dist/compression/progressive-compressor.d.ts +66 -0
- package/dist/compression/progressive-compressor.d.ts.map +1 -0
- package/dist/compression/progressive-compressor.js +367 -0
- package/dist/compression/progressive-compressor.js.map +1 -0
- package/dist/compression/types.d.ts +1 -5
- package/dist/compression/types.d.ts.map +1 -1
- package/dist/context/context-manager.d.ts +34 -15
- package/dist/context/context-manager.d.ts.map +1 -1
- package/dist/context/context-manager.js +78 -28
- package/dist/context/context-manager.js.map +1 -1
- package/dist/context/types.d.ts +20 -4
- package/dist/context/types.d.ts.map +1 -1
- package/dist/core/agent.d.ts +354 -25
- package/dist/core/agent.d.ts.map +1 -1
- package/dist/core/agent.js +646 -64
- package/dist/core/agent.js.map +1 -1
- package/dist/core/fusion-agent.d.ts +207 -0
- package/dist/core/fusion-agent.d.ts.map +1 -0
- package/dist/core/fusion-agent.js +769 -0
- package/dist/core/fusion-agent.js.map +1 -0
- package/dist/core/hooks.d.ts +19 -7
- package/dist/core/hooks.d.ts.map +1 -1
- package/dist/core/plan-solve-agent.d.ts +1 -15
- package/dist/core/plan-solve-agent.d.ts.map +1 -1
- package/dist/core/plan-solve-agent.js +142 -117
- package/dist/core/plan-solve-agent.js.map +1 -1
- package/dist/core/react-agent.d.ts +0 -13
- package/dist/core/react-agent.d.ts.map +1 -1
- package/dist/core/react-agent.js +127 -102
- package/dist/core/react-agent.js.map +1 -1
- package/dist/core/response-schema.d.ts +65 -0
- package/dist/core/response-schema.d.ts.map +1 -1
- package/dist/core/response-schema.js +174 -1
- package/dist/core/response-schema.js.map +1 -1
- package/dist/core/system-prompts.d.ts +27 -0
- package/dist/core/system-prompts.d.ts.map +1 -0
- package/dist/core/system-prompts.js +112 -0
- package/dist/core/system-prompts.js.map +1 -0
- package/dist/eval/benchmark.d.ts +81 -0
- package/dist/eval/benchmark.d.ts.map +1 -0
- package/dist/eval/benchmark.js +292 -0
- package/dist/eval/benchmark.js.map +1 -0
- package/dist/eval/eval-runner.d.ts +79 -0
- package/dist/eval/eval-runner.d.ts.map +1 -0
- package/dist/eval/eval-runner.js +252 -0
- package/dist/eval/eval-runner.js.map +1 -0
- package/dist/eval/index.d.ts +7 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +13 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/tool-call-evaluator.d.ts +72 -0
- package/dist/eval/tool-call-evaluator.d.ts.map +1 -0
- package/dist/eval/tool-call-evaluator.js +265 -0
- package/dist/eval/tool-call-evaluator.js.map +1 -0
- package/dist/eval/types.d.ts +219 -0
- package/dist/eval/types.d.ts.map +1 -0
- package/dist/eval/types.js +3 -0
- package/dist/eval/types.js.map +1 -0
- package/dist/index.d.ts +58 -14
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +116 -8
- package/dist/index.js.map +1 -1
- package/dist/llm/anthropic-provider.d.ts +141 -0
- package/dist/llm/anthropic-provider.d.ts.map +1 -0
- package/dist/llm/anthropic-provider.js +486 -0
- package/dist/llm/anthropic-provider.js.map +1 -0
- package/dist/llm/errors.d.ts +26 -0
- package/dist/llm/errors.d.ts.map +1 -0
- package/dist/llm/errors.js +19 -0
- package/dist/llm/errors.js.map +1 -0
- package/dist/llm/factory.d.ts +73 -0
- package/dist/llm/factory.d.ts.map +1 -0
- package/dist/llm/factory.js +77 -0
- package/dist/llm/factory.js.map +1 -0
- package/dist/llm/fallback-provider.d.ts +47 -0
- package/dist/llm/fallback-provider.d.ts.map +1 -0
- package/dist/llm/fallback-provider.js +91 -0
- package/dist/llm/fallback-provider.js.map +1 -0
- package/dist/llm/interface.d.ts +54 -11
- package/dist/llm/interface.d.ts.map +1 -1
- package/dist/llm/interface.js +34 -0
- package/dist/llm/interface.js.map +1 -1
- package/dist/llm/model-router.d.ts +126 -0
- package/dist/llm/model-router.d.ts.map +1 -0
- package/dist/llm/model-router.js +178 -0
- package/dist/llm/model-router.js.map +1 -0
- package/dist/llm/openai-provider.d.ts +8 -32
- package/dist/llm/openai-provider.d.ts.map +1 -1
- package/dist/llm/openai-provider.js +27 -60
- package/dist/llm/openai-provider.js.map +1 -1
- package/dist/llm/rate-limiter.d.ts +41 -0
- package/dist/llm/rate-limiter.d.ts.map +1 -0
- package/dist/llm/rate-limiter.js +93 -0
- package/dist/llm/rate-limiter.js.map +1 -0
- package/dist/llm/retry.d.ts +26 -0
- package/dist/llm/retry.d.ts.map +1 -0
- package/dist/llm/retry.js +44 -0
- package/dist/llm/retry.js.map +1 -0
- package/dist/llm/token-budget.d.ts +97 -0
- package/dist/llm/token-budget.d.ts.map +1 -0
- package/dist/llm/token-budget.js +115 -0
- package/dist/llm/token-budget.js.map +1 -0
- package/dist/logging/index.d.ts +2 -0
- package/dist/logging/index.d.ts.map +1 -0
- package/dist/logging/index.js +7 -0
- package/dist/logging/index.js.map +1 -0
- package/dist/logging/logger.d.ts +38 -0
- package/dist/logging/logger.d.ts.map +1 -0
- package/dist/logging/logger.js +34 -0
- package/dist/logging/logger.js.map +1 -0
- package/dist/mcp/mcp-client-manager.d.ts +10 -2
- package/dist/mcp/mcp-client-manager.d.ts.map +1 -1
- package/dist/mcp/mcp-client-manager.js +20 -9
- package/dist/mcp/mcp-client-manager.js.map +1 -1
- package/dist/memory/index.d.ts +3 -0
- package/dist/memory/index.d.ts.map +1 -0
- package/dist/memory/index.js +6 -0
- package/dist/memory/index.js.map +1 -0
- package/dist/memory/memory-manager.d.ts +119 -0
- package/dist/memory/memory-manager.d.ts.map +1 -0
- package/dist/memory/memory-manager.js +334 -0
- package/dist/memory/memory-manager.js.map +1 -0
- package/dist/messages/types.d.ts +2 -0
- package/dist/messages/types.d.ts.map +1 -1
- package/dist/orchestrator/index.d.ts +5 -0
- package/dist/orchestrator/index.d.ts.map +1 -0
- package/dist/orchestrator/index.js +13 -0
- package/dist/orchestrator/index.js.map +1 -0
- package/dist/orchestrator/json-extractor.d.ts +18 -0
- package/dist/orchestrator/json-extractor.d.ts.map +1 -0
- package/dist/orchestrator/json-extractor.js +111 -0
- package/dist/orchestrator/json-extractor.js.map +1 -0
- package/dist/orchestrator/orchestrator-agent.d.ts +152 -0
- package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -0
- package/dist/orchestrator/orchestrator-agent.js +675 -0
- package/dist/orchestrator/orchestrator-agent.js.map +1 -0
- package/dist/orchestrator/orchestrator-response.d.ts +40 -0
- package/dist/orchestrator/orchestrator-response.d.ts.map +1 -0
- package/dist/orchestrator/orchestrator-response.js +275 -0
- package/dist/orchestrator/orchestrator-response.js.map +1 -0
- package/dist/orchestrator/orchestrator-types.d.ts +116 -0
- package/dist/orchestrator/orchestrator-types.d.ts.map +1 -0
- package/dist/orchestrator/orchestrator-types.js +3 -0
- package/dist/orchestrator/orchestrator-types.js.map +1 -0
- package/dist/preferences/preference-manager.d.ts +8 -3
- package/dist/preferences/preference-manager.d.ts.map +1 -1
- package/dist/preferences/preference-manager.js +17 -4
- package/dist/preferences/preference-manager.js.map +1 -1
- package/dist/rag/chroma-store.d.ts +52 -0
- package/dist/rag/chroma-store.d.ts.map +1 -0
- package/dist/rag/chroma-store.js +110 -0
- package/dist/rag/chroma-store.js.map +1 -0
- package/dist/rag/document-loader.d.ts +21 -0
- package/dist/rag/document-loader.d.ts.map +1 -0
- package/dist/rag/document-loader.js +129 -0
- package/dist/rag/document-loader.js.map +1 -0
- package/dist/rag/embedding-provider.d.ts +36 -0
- package/dist/rag/embedding-provider.d.ts.map +1 -0
- package/dist/rag/embedding-provider.js +74 -0
- package/dist/rag/embedding-provider.js.map +1 -0
- package/dist/rag/index.d.ts +17 -0
- package/dist/rag/index.d.ts.map +1 -0
- package/dist/rag/index.js +27 -0
- package/dist/rag/index.js.map +1 -0
- package/dist/rag/keyword-index.d.ts +53 -0
- package/dist/rag/keyword-index.d.ts.map +1 -0
- package/dist/rag/keyword-index.js +161 -0
- package/dist/rag/keyword-index.js.map +1 -0
- package/dist/rag/llm-reranker.d.ts +36 -0
- package/dist/rag/llm-reranker.d.ts.map +1 -0
- package/dist/rag/llm-reranker.js +95 -0
- package/dist/rag/llm-reranker.js.map +1 -0
- package/dist/rag/rag-manager.d.ts +54 -0
- package/dist/rag/rag-manager.d.ts.map +1 -0
- package/dist/rag/rag-manager.js +179 -0
- package/dist/rag/rag-manager.js.map +1 -0
- package/dist/rag/rag-types.d.ts +143 -0
- package/dist/rag/rag-types.d.ts.map +1 -0
- package/dist/rag/rag-types.js +9 -0
- package/dist/rag/rag-types.js.map +1 -0
- package/dist/rag/rrf.d.ts +47 -0
- package/dist/rag/rrf.d.ts.map +1 -0
- package/dist/rag/rrf.js +70 -0
- package/dist/rag/rrf.js.map +1 -0
- package/dist/rag/search-knowledge.d.ts +24 -0
- package/dist/rag/search-knowledge.d.ts.map +1 -0
- package/dist/rag/search-knowledge.js +86 -0
- package/dist/rag/search-knowledge.js.map +1 -0
- package/dist/rag/text-splitter.d.ts +25 -0
- package/dist/rag/text-splitter.d.ts.map +1 -0
- package/dist/rag/text-splitter.js +136 -0
- package/dist/rag/text-splitter.js.map +1 -0
- package/dist/rag/vector-store.d.ts +34 -0
- package/dist/rag/vector-store.d.ts.map +1 -0
- package/dist/rag/vector-store.js +73 -0
- package/dist/rag/vector-store.js.map +1 -0
- package/dist/reflection/error-notebook.d.ts +125 -0
- package/dist/reflection/error-notebook.d.ts.map +1 -0
- package/dist/reflection/error-notebook.js +368 -0
- package/dist/reflection/error-notebook.js.map +1 -0
- package/dist/reflection/index.d.ts +8 -0
- package/dist/reflection/index.d.ts.map +1 -0
- package/dist/reflection/index.js +12 -0
- package/dist/reflection/index.js.map +1 -0
- package/dist/reflection/memory-reflector.d.ts +97 -0
- package/dist/reflection/memory-reflector.d.ts.map +1 -0
- package/dist/reflection/memory-reflector.js +215 -0
- package/dist/reflection/memory-reflector.js.map +1 -0
- package/dist/reflection/reflection-agent.d.ts +105 -0
- package/dist/reflection/reflection-agent.d.ts.map +1 -0
- package/dist/reflection/reflection-agent.js +234 -0
- package/dist/reflection/reflection-agent.js.map +1 -0
- package/dist/reflection/reflection-hook.d.ts +50 -0
- package/dist/reflection/reflection-hook.d.ts.map +1 -0
- package/dist/reflection/reflection-hook.js +108 -0
- package/dist/reflection/reflection-hook.js.map +1 -0
- package/dist/rules/project-rules.d.ts +47 -0
- package/dist/rules/project-rules.d.ts.map +1 -0
- package/dist/rules/project-rules.js +166 -0
- package/dist/rules/project-rules.js.map +1 -0
- package/dist/security/boundaries.d.ts +81 -0
- package/dist/security/boundaries.d.ts.map +1 -0
- package/dist/security/boundaries.js +158 -0
- package/dist/security/boundaries.js.map +1 -0
- package/dist/security/index.d.ts +2 -0
- package/dist/security/index.d.ts.map +1 -0
- package/dist/security/index.js +11 -0
- package/dist/security/index.js.map +1 -0
- package/dist/session/session-types.d.ts +25 -4
- package/dist/session/session-types.d.ts.map +1 -1
- package/dist/skills/file-skill-loader.d.ts +4 -6
- package/dist/skills/file-skill-loader.d.ts.map +1 -1
- package/dist/skills/file-skill-loader.js +8 -19
- package/dist/skills/file-skill-loader.js.map +1 -1
- package/dist/skills/index.d.ts +1 -1
- package/dist/skills/index.d.ts.map +1 -1
- package/dist/skills/index.js +1 -2
- package/dist/skills/index.js.map +1 -1
- package/dist/skills/skill-manager.d.ts +18 -8
- package/dist/skills/skill-manager.d.ts.map +1 -1
- package/dist/skills/skill-manager.js +58 -36
- package/dist/skills/skill-manager.js.map +1 -1
- package/dist/skills/types.d.ts +3 -8
- package/dist/skills/types.d.ts.map +1 -1
- package/dist/subagent/index.d.ts +4 -0
- package/dist/subagent/index.d.ts.map +1 -0
- package/dist/subagent/index.js +8 -0
- package/dist/subagent/index.js.map +1 -0
- package/dist/subagent/subagent-loader.d.ts +53 -0
- package/dist/subagent/subagent-loader.d.ts.map +1 -0
- package/dist/subagent/subagent-loader.js +155 -0
- package/dist/subagent/subagent-loader.js.map +1 -0
- package/dist/subagent/subagent-manager.d.ts +161 -0
- package/dist/subagent/subagent-manager.d.ts.map +1 -0
- package/dist/subagent/subagent-manager.js +468 -0
- package/dist/subagent/subagent-manager.js.map +1 -0
- package/dist/subagent/subagent-types.d.ts +77 -0
- package/dist/subagent/subagent-types.d.ts.map +1 -0
- package/dist/subagent/subagent-types.js +3 -0
- package/dist/subagent/subagent-types.js.map +1 -0
- package/dist/tools/builtin/bash.d.ts +3 -0
- package/dist/tools/builtin/bash.d.ts.map +1 -0
- package/dist/tools/builtin/bash.js +87 -0
- package/dist/tools/builtin/bash.js.map +1 -0
- package/dist/tools/builtin/edit-file.d.ts.map +1 -1
- package/dist/tools/builtin/edit-file.js +1 -0
- package/dist/tools/builtin/edit-file.js.map +1 -1
- package/dist/tools/builtin/index.d.ts +14 -0
- package/dist/tools/builtin/index.d.ts.map +1 -1
- package/dist/tools/builtin/index.js +45 -1
- package/dist/tools/builtin/index.js.map +1 -1
- package/dist/tools/builtin/list-errors.d.ts +7 -0
- package/dist/tools/builtin/list-errors.d.ts.map +1 -0
- package/dist/tools/builtin/list-errors.js +64 -0
- package/dist/tools/builtin/list-errors.js.map +1 -0
- package/dist/tools/builtin/list-subagents.d.ts +7 -0
- package/dist/tools/builtin/list-subagents.d.ts.map +1 -0
- package/dist/tools/builtin/list-subagents.js +21 -0
- package/dist/tools/builtin/list-subagents.js.map +1 -0
- package/dist/tools/builtin/recall.d.ts +11 -0
- package/dist/tools/builtin/recall.d.ts.map +1 -0
- package/dist/tools/builtin/recall.js +60 -0
- package/dist/tools/builtin/recall.js.map +1 -0
- package/dist/tools/builtin/remember.d.ts +12 -0
- package/dist/tools/builtin/remember.d.ts.map +1 -0
- package/dist/tools/builtin/remember.js +72 -0
- package/dist/tools/builtin/remember.js.map +1 -0
- package/dist/tools/builtin/skill.d.ts +14 -0
- package/dist/tools/builtin/skill.d.ts.map +1 -0
- package/dist/tools/builtin/skill.js +71 -0
- package/dist/tools/builtin/skill.js.map +1 -0
- package/dist/tools/builtin/spawn-subagent.d.ts +7 -0
- package/dist/tools/builtin/spawn-subagent.d.ts.map +1 -0
- package/dist/tools/builtin/spawn-subagent.js +43 -0
- package/dist/tools/builtin/spawn-subagent.js.map +1 -0
- package/dist/tools/builtin/web-fetch.d.ts +3 -0
- package/dist/tools/builtin/web-fetch.d.ts.map +1 -0
- package/dist/tools/builtin/web-fetch.js +101 -0
- package/dist/tools/builtin/web-fetch.js.map +1 -0
- package/dist/tools/builtin/write-file.d.ts.map +1 -1
- package/dist/tools/builtin/write-file.js +1 -0
- package/dist/tools/builtin/write-file.js.map +1 -1
- package/dist/tools/circuit-breaker.d.ts +19 -10
- package/dist/tools/circuit-breaker.d.ts.map +1 -1
- package/dist/tools/circuit-breaker.js +22 -11
- package/dist/tools/circuit-breaker.js.map +1 -1
- package/dist/tools/error-tracker.d.ts +28 -44
- package/dist/tools/error-tracker.d.ts.map +1 -1
- package/dist/tools/error-tracker.js +39 -156
- package/dist/tools/error-tracker.js.map +1 -1
- package/dist/tools/tool-filter.d.ts +70 -0
- package/dist/tools/tool-filter.d.ts.map +1 -0
- package/dist/tools/tool-filter.js +92 -0
- package/dist/tools/tool-filter.js.map +1 -0
- package/dist/tools/tool-output-truncator.d.ts +36 -0
- package/dist/tools/tool-output-truncator.d.ts.map +1 -0
- package/dist/tools/tool-output-truncator.js +117 -0
- package/dist/tools/tool-output-truncator.js.map +1 -0
- package/dist/tools/tool-registry.d.ts +25 -9
- package/dist/tools/tool-registry.d.ts.map +1 -1
- package/dist/tools/tool-registry.js +77 -28
- package/dist/tools/tool-registry.js.map +1 -1
- package/dist/tools/tool-validator.d.ts +13 -0
- package/dist/tools/tool-validator.d.ts.map +1 -0
- package/dist/tools/tool-validator.js +116 -0
- package/dist/tools/tool-validator.js.map +1 -0
- package/dist/tools/types.d.ts +86 -3
- package/dist/tools/types.d.ts.map +1 -1
- package/dist/tools/types.js +51 -2
- package/dist/tools/types.js.map +1 -1
- package/dist/trace/trace-logger.d.ts +30 -4
- package/dist/trace/trace-logger.d.ts.map +1 -1
- package/dist/trace/trace-logger.js +82 -6
- package/dist/trace/trace-logger.js.map +1 -1
- package/package.json +13 -4
- package/dist/compression/sliding-window.d.ts +0 -21
- package/dist/compression/sliding-window.d.ts.map +0 -1
- package/dist/compression/sliding-window.js +0 -44
- package/dist/compression/sliding-window.js.map +0 -1
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.Benchmark = exports.EvalRunner = exports.ToolCallEvaluator = void 0;
|
|
4
|
+
// Tool Call Evaluator — per-tool metrics via AgentHooks
|
|
5
|
+
var tool_call_evaluator_1 = require("./tool-call-evaluator");
|
|
6
|
+
Object.defineProperty(exports, "ToolCallEvaluator", { enumerable: true, get: function () { return tool_call_evaluator_1.ToolCallEvaluator; } });
|
|
7
|
+
// Eval Runner — end-to-end test case execution
|
|
8
|
+
var eval_runner_1 = require("./eval-runner");
|
|
9
|
+
Object.defineProperty(exports, "EvalRunner", { enumerable: true, get: function () { return eval_runner_1.EvalRunner; } });
|
|
10
|
+
// Benchmark — regression testing & baseline comparison
|
|
11
|
+
var benchmark_1 = require("./benchmark");
|
|
12
|
+
Object.defineProperty(exports, "Benchmark", { enumerable: true, get: function () { return benchmark_1.Benchmark; } });
|
|
13
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":";;;AAAA,wDAAwD;AACxD,6DAA0D;AAAjD,wHAAA,iBAAiB,OAAA;AAE1B,+CAA+C;AAC/C,6CAA2C;AAAlC,yGAAA,UAAU,OAAA;AAGnB,uDAAuD;AACvD,yCAAwC;AAA/B,sGAAA,SAAS,OAAA"}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { AgentHooks } from "../core/hooks";
|
|
2
|
+
import type { LLMResponse } from "../llm/interface";
|
|
3
|
+
import type { LLMNetworkError } from "../llm/errors";
|
|
4
|
+
import type { MessageData } from "../messages/types";
|
|
5
|
+
import type { Tool } from "../tools/types";
|
|
6
|
+
import type { ToolCallRecord, ToolCallScorecard } from "./types";
|
|
7
|
+
/**
|
|
8
|
+
* ToolCallEvaluator — collects per-tool-call metrics via AgentHooks.
|
|
9
|
+
*
|
|
10
|
+
* Attach to any agent to track tool call performance:
|
|
11
|
+
* ```ts
|
|
12
|
+
* const evaluator = new ToolCallEvaluator();
|
|
13
|
+
* const agent = new ReActAgent({ llm, hooks: [evaluator, ...otherHooks] });
|
|
14
|
+
* await agent.run("...");
|
|
15
|
+
* console.log(evaluator.generateReport());
|
|
16
|
+
* ```
|
|
17
|
+
*/
|
|
18
|
+
export declare class ToolCallEvaluator implements AgentHooks {
    /** Tool call records, appended in the order the calls were started. */
    private records;
    /**
     * Per-tool-name failure counter: incremented on every tool error and
     * set back to zero when a call to that tool completes successfully.
     */
    private attemptCounters;
    /** Per-tool-name count of errors whose parsed code was CIRCUIT_OPEN. */
    private circuitBreakerTripCounts;

    // LLM-level hooks are declared for AgentHooks compatibility only; the
    // implementation leaves them undefined.
    onLLMStart?: ((messages: MessageData[], tools: Tool[]) => void) | undefined;
    onLLMEnd?: ((response: LLMResponse) => void) | undefined;
    onLLMError?: ((error: LLMNetworkError) => void) | undefined;

    /** Opens a new record for a tool call that is about to run. */
    onToolStart(toolName: string, args: Record<string, unknown>, toolCallId?: string): void;
    /** Closes the matching open record as a success and stores latency and result length. */
    onToolEnd(toolName: string, result: string, toolCallId?: string): void;
    /** Closes the matching open record as a failure and classifies the error message. */
    onToolError(toolName: string, error: string, toolCallId?: string): void;

    // Reasoning/planning hooks are likewise left undefined by the implementation.
    onThought?: ((thought: string) => void) | undefined;
    onPlanCreated?: ((plan: string[]) => void) | undefined;
    onPlanRevised?: ((plan: string[]) => void) | undefined;
    onFinish?: ((answer: string) => void) | undefined;

    /**
     * Returns a shallow copy of the recorded tool calls (the array is new,
     * the record objects are shared with the evaluator).
     */
    getRecords(): ToolCallRecord[];
    /**
     * Aggregates the per-tool statistics into overall totals, success rate,
     * average latency and a per-tool list sorted by call count (descending).
     */
    getScorecard(): ToolCallScorecard;
    /**
     * Renders the scorecard as Markdown: a summary table, a per-tool
     * breakdown table and an error-distribution section.
     */
    generateReport(): string;
    /** Discards all records and clears both counter maps. */
    reset(): void;

    /**
     * Locates the most recently started record that has not finished yet.
     *
     * With a `toolCallId` the lookup matches on that ID only, which keeps
     * parallel calls to the same tool apart. Without one it falls back to
     * matching on the tool name (for hooks that do not pass `toolCallId`).
     */
    private findRecord;
    /**
     * Parses a ToolErrorCode out of an error message of the form
     *   [SEVERITY:ERROR_CODE] Human-readable message...
     * and falls back to EXECUTION_FAILURE when no known code is found.
     */
    private extractErrorCode;
    /** Groups the raw records by tool name and derives statistics for each tool. */
    private computePerToolStats;
}
|
|
72
|
+
//# sourceMappingURL=tool-call-evaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tool-call-evaluator.d.ts","sourceRoot":"","sources":["../../src/eval/tool-call-evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3C,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,gBAAgB,CAAC;AAE3C,OAAO,KAAK,EACV,cAAc,EAEd,iBAAiB,EAClB,MAAM,SAAS,CAAC;AAEjB;;;;;;;;;;GAUG;AACH,qBAAa,iBAAkB,YAAW,UAAU;IAClD,sDAAsD;IACtD,OAAO,CAAC,OAAO,CAAwB;IAEvC,oDAAoD;IACpD,OAAO,CAAC,eAAe,CAAkC;IAEzD,8CAA8C;IAC9C,OAAO,CAAC,wBAAwB,CAAkC;IAIlE,UAAU,CAAC,EACP,CAAC,CAAC,QAAQ,EAAE,WAAW,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,IAAI,CAAC,GAClD,SAAS,CAAa;IAE1B,QAAQ,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,WAAW,KAAK,IAAI,CAAC,GAAG,SAAS,CAAa;IAErE,UAAU,CAAC,EAAE,CAAC,CAAC,KAAK,EAAE,eAAe,KAAK,IAAI,CAAC,GAAG,SAAS,CAAa;IAExE,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI;IAavF,SAAS,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI;IAetE,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI;IA2BvE,SAAS,CAAC,EAAE,CAAC,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC,GAAG,SAAS,CAAa;IAChE,aAAa,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,IAAI,CAAC,GAAG,SAAS,CAAa;IACnE,aAAa,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,IAAI,CAAC,GAAG,SAAS,CAAa;IACnE,QAAQ,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,MAAM,KAAK,IAAI,CAAC,GAAG,SAAS,CAAa;IAI9D;;OAEG;IACH,UAAU,IAAI,cAAc,EAAE;IAI9B;;OAEG;IACH,YAAY,IAAI,iBAAiB;IA6BjC;;OAEG;IACH,cAAc,IAAI,MAAM;IA+CxB;;OAEG;IACH,KAAK,IAAI,IAAI;IAQb;;;;;;;OAOG;IACH,OAAO,CAAC,UAAU;IAwBlB;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAaxB;;OAEG;IACH,OAAO,CAAC,mBAAmB;CAiD5B"}
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ToolCallEvaluator = void 0;
|
|
4
|
+
const types_1 = require("../tools/types");
|
|
5
|
+
/**
|
|
6
|
+
* ToolCallEvaluator — collects per-tool-call metrics via AgentHooks.
|
|
7
|
+
*
|
|
8
|
+
* Attach to any agent to track tool call performance:
|
|
9
|
+
* ```ts
|
|
10
|
+
* const evaluator = new ToolCallEvaluator();
|
|
11
|
+
* const agent = new ReActAgent({ llm, hooks: [evaluator, ...otherHooks] });
|
|
12
|
+
* await agent.run("...");
|
|
13
|
+
* console.log(evaluator.generateReport());
|
|
14
|
+
* ```
|
|
15
|
+
*/
|
|
16
|
+
class ToolCallEvaluator {
|
|
17
|
+
/** All recorded tool calls in chronological order. */
|
|
18
|
+
records = [];
|
|
19
|
+
/** Per-tool attempt counters (reset on success). */
|
|
20
|
+
attemptCounters = new Map();
|
|
21
|
+
/** Per-tool circuit breaker trip counters. */
|
|
22
|
+
circuitBreakerTripCounts = new Map();
|
|
23
|
+
// ─── AgentHooks Implementation ─────────────────────────────────────────
|
|
24
|
+
onLLMStart = undefined;
|
|
25
|
+
onLLMEnd = undefined;
|
|
26
|
+
onLLMError = undefined;
|
|
27
|
+
onToolStart(toolName, args, toolCallId) {
|
|
28
|
+
const attempt = this.attemptCounters.get(toolName) ?? 0;
|
|
29
|
+
this.records.push({
|
|
30
|
+
toolCallId,
|
|
31
|
+
toolName,
|
|
32
|
+
args,
|
|
33
|
+
startTime: new Date().toISOString(),
|
|
34
|
+
success: false, // set on end/error
|
|
35
|
+
attemptNumber: attempt + 1,
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
onToolEnd(toolName, result, toolCallId) {
|
|
39
|
+
this.attemptCounters.set(toolName, 0); // reset on success
|
|
40
|
+
const record = this.findRecord(toolName, toolCallId);
|
|
41
|
+
if (!record)
|
|
42
|
+
return;
|
|
43
|
+
record.endTime = new Date().toISOString();
|
|
44
|
+
record.latencyMs =
|
|
45
|
+
new Date(record.endTime).getTime() -
|
|
46
|
+
new Date(record.startTime).getTime();
|
|
47
|
+
record.success = true;
|
|
48
|
+
record.errorCode = types_1.ToolErrorCode.SUCCESS;
|
|
49
|
+
record.resultLength = result.length;
|
|
50
|
+
}
|
|
51
|
+
/**
 * AgentHooks callback: a tool call failed.
 *
 * Increments the tool's failure counter, then closes the matching open
 * record as a failure (end time, latency, raw error text, attempt number
 * and parsed error code). When the parsed code is CIRCUIT_OPEN the
 * per-tool circuit-breaker trip count is incremented as well.
 *
 * @param {string} toolName - Name of the tool that failed.
 * @param {string} error - Error message, expected to carry a `[SEVERITY:CODE]` tag.
 * @param {string} [toolCallId] - Call ID used for exact record matching.
 * @returns {void}
 */
onToolError(toolName, error, toolCallId) {
    // The failure counter advances even if no record is found below.
    const failureCount = (this.attemptCounters.get(toolName) ?? 0) + 1;
    this.attemptCounters.set(toolName, failureCount);
    const record = this.findRecord(toolName, toolCallId);
    if (!record) {
        return;
    }
    const failedAt = new Date();
    record.endTime = failedAt.toISOString();
    record.latencyMs = failedAt.getTime() - new Date(record.startTime).getTime();
    record.success = false;
    record.error = error;
    record.attemptNumber = failureCount;
    // Classify the failure from the structured tag in the message.
    record.errorCode = this.extractErrorCode(error);
    if (record.errorCode !== types_1.ToolErrorCode.CIRCUIT_OPEN) {
        return;
    }
    // Circuit-breaker trips are only tallied for calls that had a record.
    const tripsSoFar = this.circuitBreakerTripCounts.get(toolName) ?? 0;
    this.circuitBreakerTripCounts.set(toolName, tripsSoFar + 1);
}
|
|
71
|
+
onThought = undefined;
|
|
72
|
+
onPlanCreated = undefined;
|
|
73
|
+
onPlanRevised = undefined;
|
|
74
|
+
onFinish = undefined;
|
|
75
|
+
// ─── Public API ────────────────────────────────────────────────────────
|
|
76
|
+
/**
|
|
77
|
+
* Get all raw tool call records.
|
|
78
|
+
*/
|
|
79
|
+
getRecords() {
|
|
80
|
+
return [...this.records];
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Compute the aggregated scorecard from all recorded calls.
|
|
84
|
+
*/
|
|
85
|
+
getScorecard() {
|
|
86
|
+
const allStats = this.computePerToolStats();
|
|
87
|
+
const totalCalls = allStats.reduce((s, t) => s + t.totalCalls, 0);
|
|
88
|
+
const totalSuccesses = allStats.reduce((s, t) => s + t.successCount, 0);
|
|
89
|
+
const totalFailures = totalCalls - totalSuccesses;
|
|
90
|
+
const allLatencies = allStats.flatMap((t) => t.latencySamples);
|
|
91
|
+
const avgLatencyMs = allLatencies.length > 0
|
|
92
|
+
? Math.round(allLatencies.reduce((a, b) => a + b, 0) / allLatencies.length)
|
|
93
|
+
: 0;
|
|
94
|
+
const circuitBreakerTrips = allStats.reduce((s, t) => s + t.circuitBreakerTrips, 0);
|
|
95
|
+
return {
|
|
96
|
+
totalCalls,
|
|
97
|
+
totalSuccesses,
|
|
98
|
+
totalFailures,
|
|
99
|
+
overallSuccessRate: totalCalls > 0 ? totalSuccesses / totalCalls : 1,
|
|
100
|
+
avgLatencyMs,
|
|
101
|
+
uniqueToolsUsed: allStats.length,
|
|
102
|
+
circuitBreakerTrips,
|
|
103
|
+
perTool: allStats.sort((a, b) => b.totalCalls - a.totalCalls),
|
|
104
|
+
};
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Generate a Markdown report of tool call metrics.
|
|
108
|
+
*/
|
|
109
|
+
generateReport() {
|
|
110
|
+
const scorecard = this.getScorecard();
|
|
111
|
+
if (scorecard.totalCalls === 0) {
|
|
112
|
+
return "# Tool Call Evaluation Report\n\n*No tool calls recorded.*\n";
|
|
113
|
+
}
|
|
114
|
+
const ratePercent = (scorecard.overallSuccessRate * 100).toFixed(1);
|
|
115
|
+
let report = `# Tool Call Evaluation Report\n\n`;
|
|
116
|
+
report += `## Summary\n\n`;
|
|
117
|
+
report += `| Metric | Value |\n`;
|
|
118
|
+
report += `|--------|-------|\n`;
|
|
119
|
+
report += `| Total Calls | ${scorecard.totalCalls} |\n`;
|
|
120
|
+
report += `| Successes | ${scorecard.totalSuccesses} |\n`;
|
|
121
|
+
report += `| Failures | ${scorecard.totalFailures} |\n`;
|
|
122
|
+
report += `| Success Rate | ${ratePercent}% |\n`;
|
|
123
|
+
report += `| Avg Latency | ${scorecard.avgLatencyMs}ms |\n`;
|
|
124
|
+
report += `| Unique Tools Used | ${scorecard.uniqueToolsUsed} |\n`;
|
|
125
|
+
report += `| Circuit Breaker Trips | ${scorecard.circuitBreakerTrips} |\n\n`;
|
|
126
|
+
report += `## Per-Tool Breakdown\n\n`;
|
|
127
|
+
report += `| Tool | Calls | Success Rate | Avg Latency | P50 | P99 | Avg Retries | CB Trips |\n`;
|
|
128
|
+
report += `|------|-------|-------------|-------------|-----|-----|-------------|----------|\n`;
|
|
129
|
+
for (const stat of scorecard.perTool) {
|
|
130
|
+
const sr = (stat.successRate * 100).toFixed(1);
|
|
131
|
+
report += `| \`${stat.toolName}\` | ${stat.totalCalls} | ${sr}% | ${stat.avgLatencyMs}ms | ${stat.p50LatencyMs}ms | ${stat.p99LatencyMs}ms | ${stat.avgRetries.toFixed(1)} | ${stat.circuitBreakerTrips} |\n`;
|
|
132
|
+
}
|
|
133
|
+
report += `\n## Error Distribution\n\n`;
|
|
134
|
+
for (const stat of scorecard.perTool) {
|
|
135
|
+
if (Object.keys(stat.errorDistribution).length === 0)
|
|
136
|
+
continue;
|
|
137
|
+
report += `### ${stat.toolName}\n\n`;
|
|
138
|
+
report += `| Error Code | Count |\n`;
|
|
139
|
+
report += `|------------|-------|\n`;
|
|
140
|
+
for (const [code, count] of Object.entries(stat.errorDistribution)) {
|
|
141
|
+
report += `| \`${code}\` | ${count} |\n`;
|
|
142
|
+
}
|
|
143
|
+
report += `\n`;
|
|
144
|
+
}
|
|
145
|
+
report += `---\n*Generated at ${new Date().toISOString()}*\n`;
|
|
146
|
+
return report;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Reset all counters and records.
|
|
150
|
+
*/
|
|
151
|
+
reset() {
|
|
152
|
+
this.records = [];
|
|
153
|
+
this.attemptCounters.clear();
|
|
154
|
+
this.circuitBreakerTripCounts.clear();
|
|
155
|
+
}
|
|
156
|
+
// ─── Private Helpers ───────────────────────────────────────────────────
|
|
157
|
+
/**
|
|
158
|
+
* Find the matching uncompleted record for a tool call.
|
|
159
|
+
*
|
|
160
|
+
* When `toolCallId` is provided, performs an exact match (preferred:
|
|
161
|
+
* handles parallel calls to the same tool correctly). Falls back to
|
|
162
|
+
* reverse-scan by tool name when the ID is not available (legacy
|
|
163
|
+
* hooks that don't pass `toolCallId`).
|
|
164
|
+
*/
|
|
165
|
+
findRecord(toolName, toolCallId) {
|
|
166
|
+
// Exact ID match — correct even when the same tool is called
|
|
167
|
+
// multiple times within one LLM response batch.
|
|
168
|
+
if (toolCallId) {
|
|
169
|
+
for (let i = this.records.length - 1; i >= 0; i--) {
|
|
170
|
+
if (this.records[i].toolCallId === toolCallId && !this.records[i].endTime) {
|
|
171
|
+
return this.records[i];
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
return undefined;
|
|
175
|
+
}
|
|
176
|
+
// Legacy fallback — reverse scan by tool name only.
|
|
177
|
+
for (let i = this.records.length - 1; i >= 0; i--) {
|
|
178
|
+
if (this.records[i].toolName === toolName && !this.records[i].endTime) {
|
|
179
|
+
return this.records[i];
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
return undefined;
|
|
183
|
+
}
|
|
184
|
+
/**
|
|
185
|
+
* Extract a ToolErrorCode from the structured error message format.
|
|
186
|
+
*
|
|
187
|
+
* Error messages follow the pattern:
|
|
188
|
+
* [SEVERITY:ERROR_CODE] Human-readable message...
|
|
189
|
+
*/
|
|
190
|
+
extractErrorCode(error) {
    // Captures CODE from a "[RETRYABLE:CODE]" or "[FATAL:CODE]" tag
    // anywhere in the message.
    const tagPattern = /\[(?:RETRYABLE|FATAL):([A-Z_]+)\]/;
    const tagged = error.match(tagPattern);
    if (!tagged) {
        // No structured tag: treat as a generic execution failure.
        return types_1.ToolErrorCode.EXECUTION_FAILURE;
    }
    // Tags are upper-case (CIRCUIT_OPEN) while the enum values are
    // lower-case ("circuit_open"), so normalise before the lookup.
    const candidate = tagged[1].toLowerCase();
    const knownCodes = Object.values(types_1.ToolErrorCode);
    return knownCodes.includes(candidate)
        ? candidate
        : types_1.ToolErrorCode.EXECUTION_FAILURE; // tag present but code unknown
}
|
|
202
|
+
/**
|
|
203
|
+
* Compute per-tool statistics from raw records.
|
|
204
|
+
*/
|
|
205
|
+
computePerToolStats() {
|
|
206
|
+
const byTool = new Map();
|
|
207
|
+
for (const r of this.records) {
|
|
208
|
+
if (!byTool.has(r.toolName))
|
|
209
|
+
byTool.set(r.toolName, []);
|
|
210
|
+
byTool.get(r.toolName).push(r);
|
|
211
|
+
}
|
|
212
|
+
const stats = [];
|
|
213
|
+
for (const [toolName, records] of byTool) {
|
|
214
|
+
const completed = records.filter((r) => r.endTime);
|
|
215
|
+
const successes = completed.filter((r) => r.success);
|
|
216
|
+
const failures = completed.filter((r) => !r.success);
|
|
217
|
+
const latencies = completed
|
|
218
|
+
.filter((r) => r.latencyMs !== undefined)
|
|
219
|
+
.map((r) => r.latencyMs);
|
|
220
|
+
// Error distribution
|
|
221
|
+
const errorDist = {};
|
|
222
|
+
for (const f of failures) {
|
|
223
|
+
const code = f.errorCode ?? types_1.ToolErrorCode.EXECUTION_FAILURE;
|
|
224
|
+
errorDist[code] = (errorDist[code] ?? 0) + 1;
|
|
225
|
+
}
|
|
226
|
+
stats.push({
|
|
227
|
+
toolName,
|
|
228
|
+
totalCalls: completed.length,
|
|
229
|
+
successCount: successes.length,
|
|
230
|
+
failureCount: failures.length,
|
|
231
|
+
successRate: completed.length > 0 ? successes.length / completed.length : 1,
|
|
232
|
+
avgLatencyMs: latencies.length > 0
|
|
233
|
+
? Math.round(latencies.reduce((a, b) => a + b, 0) / latencies.length)
|
|
234
|
+
: 0,
|
|
235
|
+
p50LatencyMs: percentile(latencies, 50),
|
|
236
|
+
p99LatencyMs: percentile(latencies, 99),
|
|
237
|
+
avgRetries: successes.length > 0
|
|
238
|
+
? failures.length / successes.length
|
|
239
|
+
: failures.length,
|
|
240
|
+
circuitBreakerTrips: this.circuitBreakerTripCounts.get(toolName) ?? 0,
|
|
241
|
+
errorDistribution: errorDist,
|
|
242
|
+
latencySamples: latencies,
|
|
243
|
+
});
|
|
244
|
+
}
|
|
245
|
+
return stats;
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
// Publish the class on the CommonJS `exports` object.
exports.ToolCallEvaluator = ToolCallEvaluator;
|
|
249
|
+
/**
 * Compute a percentile from an array of numbers.
 * Uses linear interpolation between closest ranks and rounds the result
 * to the nearest integer.
 *
 * @param {number[]} sorted - Samples. Despite the name they do not need to
 *   be pre-sorted: a sorted copy is made and the input is left untouched.
 * @param {number} p - Percentile in the range 0–100. Values outside that
 *   range are clamped, so p < 0 yields the minimum and p > 100 the maximum.
 * @returns {number} The rounded percentile, or 0 for an empty array.
 */
function percentile(sorted, p) {
    if (sorted.length === 0) {
        return 0;
    }
    const arr = [...sorted].sort((a, b) => a - b);
    // Clamp to [0, 100]: an out-of-range p would otherwise index past the
    // ends of the array and produce NaN.
    const clamped = Math.min(100, Math.max(0, p));
    const index = (clamped / 100) * (arr.length - 1);
    const lower = Math.floor(index);
    const upper = Math.ceil(index);
    if (lower === upper) {
        return Math.round(arr[lower]);
    }
    // Fractional rank: blend the two neighbouring samples.
    const weight = index - lower;
    return Math.round(arr[lower] * (1 - weight) + arr[upper] * weight);
}
|
|
265
|
+
//# sourceMappingURL=tool-call-evaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tool-call-evaluator.js","sourceRoot":"","sources":["../../src/eval/tool-call-evaluator.ts"],"names":[],"mappings":";;;AAKA,0CAA+C;AAO/C;;;;;;;;;;GAUG;AACH,MAAa,iBAAiB;IAC5B,sDAAsD;IAC9C,OAAO,GAAqB,EAAE,CAAC;IAEvC,oDAAoD;IAC5C,eAAe,GAAwB,IAAI,GAAG,EAAE,CAAC;IAEzD,8CAA8C;IACtC,wBAAwB,GAAwB,IAAI,GAAG,EAAE,CAAC;IAElE,0EAA0E;IAE1E,UAAU,GAEM,SAAS,CAAC;IAE1B,QAAQ,GAAmD,SAAS,CAAC;IAErE,UAAU,GAAoD,SAAS,CAAC;IAExE,WAAW,CAAC,QAAgB,EAAE,IAA6B,EAAE,UAAmB;QAC9E,MAAM,OAAO,GAAG,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAExD,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;YAChB,UAAU;YACV,QAAQ;YACR,IAAI;YACJ,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,OAAO,EAAE,KAAK,EAAE,mBAAmB;YACnC,aAAa,EAAE,OAAO,GAAG,CAAC;SAC3B,CAAC,CAAC;IACL,CAAC;IAED,SAAS,CAAC,QAAgB,EAAE,MAAc,EAAE,UAAmB;QAC7D,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,mBAAmB;QAE1D,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM;YAAE,OAAO;QAEpB,MAAM,CAAC,OAAO,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC1C,MAAM,CAAC,SAAS;YACd,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE;gBAClC,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC;QACvC,MAAM,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,MAAM,CAAC,SAAS,GAAG,qBAAa,CAAC,OAAO,CAAC;QACzC,MAAM,CAAC,YAAY,GAAG,MAAM,CAAC,MAAM,CAAC;IACtC,CAAC;IAED,WAAW,CAAC,QAAgB,EAAE,KAAa,EAAE,UAAmB;QAC9D,MAAM,OAAO,GAAG,CAAC,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;QAC9D,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE5C,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;QACrD,IAAI,CAAC,MAAM;YAAE,OAAO;QAEpB,MAAM,CAAC,OAAO,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC1C,MAAM,CAAC,SAAS;YACd,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE;gBAClC,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC;QACvC,MAAM,CAAC,OAAO,GAAG,KAAK,CAAC;QACvB,MAAM,CAAC,KAAK,GAAG,KAAK,CAAC;QACrB,MAAM,CAAC,aAAa,GAAG,OAAO,CAAC;QAE/B,uDAAuD;QACvD,MAAM,CAAC,SAAS,GAAG,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC;QAEhD,8BAA8B;QAC9B,IAAI,M
AAM,CAAC,SAAS,KAAK,qBAAa,CAAC,YAAY,EAAE,CAAC;YACpD,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAC/B,QAAQ,EACR,CAAC,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CACvD,CAAC;QACJ,CAAC;IACH,CAAC;IAED,SAAS,GAA6C,SAAS,CAAC;IAChE,aAAa,GAA4C,SAAS,CAAC;IACnE,aAAa,GAA4C,SAAS,CAAC;IACnE,QAAQ,GAA4C,SAAS,CAAC;IAE9D,0EAA0E;IAE1E;;OAEG;IACH,UAAU;QACR,OAAO,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,YAAY;QACV,MAAM,QAAQ,GAAG,IAAI,CAAC,mBAAmB,EAAE,CAAC;QAC5C,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QAClE,MAAM,cAAc,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC;QACxE,MAAM,aAAa,GAAG,UAAU,GAAG,cAAc,CAAC;QAElD,MAAM,YAAY,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC;QAC/D,MAAM,YAAY,GAChB,YAAY,CAAC,MAAM,GAAG,CAAC;YACrB,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;YAC3E,CAAC,CAAC,CAAC,CAAC;QAER,MAAM,mBAAmB,GAAG,QAAQ,CAAC,MAAM,CACzC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,mBAAmB,EACnC,CAAC,CACF,CAAC;QAEF,OAAO;YACL,UAAU;YACV,cAAc;YACd,aAAa;YACb,kBAAkB,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YACpE,YAAY;YACZ,eAAe,EAAE,QAAQ,CAAC,MAAM;YAChC,mBAAmB;YACnB,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC;SAC9D,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,cAAc;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;QACtC,IAAI,SAAS,CAAC,UAAU,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO,8DAA8D,CAAC;QACxE,CAAC;QAED,MAAM,WAAW,GAAG,CAAC,SAAS,CAAC,kBAAkB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAEpE,IAAI,MAAM,GAAG,mCAAmC,CAAC;QACjD,MAAM,IAAI,gBAAgB,CAAC;QAC3B,MAAM,IAAI,sBAAsB,CAAC;QACjC,MAAM,IAAI,sBAAsB,CAAC;QACjC,MAAM,IAAI,mBAAmB,SAAS,CAAC,UAAU,MAAM,CAAC;QACxD,MAAM,IAAI,iBAAiB,SAAS,CAAC,cAAc,MAAM,CAAC;QAC1D,MAAM,IAAI,gBAAgB,SAAS,CAAC,aAAa,MAAM,CAAC;QACxD,MAAM,IAAI,
oBAAoB,WAAW,OAAO,CAAC;QACjD,MAAM,IAAI,mBAAmB,SAAS,CAAC,YAAY,QAAQ,CAAC;QAC5D,MAAM,IAAI,yBAAyB,SAAS,CAAC,eAAe,MAAM,CAAC;QACnE,MAAM,IAAI,6BAA6B,SAAS,CAAC,mBAAmB,QAAQ,CAAC;QAE7E,MAAM,IAAI,2BAA2B,CAAC;QACtC,MAAM,IAAI,sFAAsF,CAAC;QACjG,MAAM,IAAI,qFAAqF,CAAC;QAEhG,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;YACrC,MAAM,EAAE,GAAG,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAC/C,MAAM,IAAI,OAAO,IAAI,CAAC,QAAQ,QAAQ,IAAI,CAAC,UAAU,MAAM,EAAE,OAAO,IAAI,CAAC,YAAY,QAAQ,IAAI,CAAC,YAAY,QAAQ,IAAI,CAAC,YAAY,QAAQ,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,mBAAmB,MAAM,CAAC;QAChN,CAAC;QAED,MAAM,IAAI,6BAA6B,CAAC;QAExC,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;YACrC,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YAC/D,MAAM,IAAI,OAAO,IAAI,CAAC,QAAQ,MAAM,CAAC;YACrC,MAAM,IAAI,0BAA0B,CAAC;YACrC,MAAM,IAAI,0BAA0B,CAAC;YACrC,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,iBAAiB,CAAC,EAAE,CAAC;gBACnE,MAAM,IAAI,OAAO,IAAI,QAAQ,KAAK,MAAM,CAAC;YAC3C,CAAC;YACD,MAAM,IAAI,IAAI,CAAC;QACjB,CAAC;QAED,MAAM,IAAI,sBAAsB,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,KAAK,CAAC;QAE9D,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,OAAO,GAAG,EAAE,CAAC;QAClB,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,CAAC;QAC7B,IAAI,CAAC,wBAAwB,CAAC,KAAK,EAAE,CAAC;IACxC,CAAC;IAED,0EAA0E;IAE1E;;;;;;;OAOG;IACK,UAAU,CAChB,QAAgB,EAChB,UAAmB;QAEnB,6DAA6D;QAC7D,gDAAgD;QAChD,IAAI,UAAU,EAAE,CAAC;YACf,KAAK,IAAI,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBAClD,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,UAAU,KAAK,UAAU,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC;oBAC1E,OAAO,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBACzB,CAAC;YACH,CAAC;YACD,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,oDAAoD;QACpD,KAAK,IAAI,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAClD,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,KAAK,QAAQ,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC;gBACtE,OAAO,IAA
I,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YACzB,CAAC;QACH,CAAC;QACD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;;;;OAKG;IACK,gBAAgB,CAAC,KAAa;QACpC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,mCAAmC,CAAC,CAAC;QAC/D,IAAI,KAAK,EAAE,CAAC;YACV,kEAAkE;YAClE,uDAAuD;YACvD,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,WAAW,EAAmB,CAAC;YACrD,IAAI,MAAM,CAAC,MAAM,CAAC,qBAAa,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAChD,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QACD,OAAO,qBAAa,CAAC,iBAAiB,CAAC,CAAC,iCAAiC;IAC3E,CAAC;IAED;;OAEG;IACK,mBAAmB;QACzB,MAAM,MAAM,GAAG,IAAI,GAAG,EAA4B,CAAC;QACnD,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAC7B,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC;gBAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;YACxD,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAE,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC;QAED,MAAM,KAAK,GAAoB,EAAE,CAAC;QAClC,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,MAAM,EAAE,CAAC;YACzC,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;YACnD,MAAM,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;YACrD,MAAM,QAAQ,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;YACrD,MAAM,SAAS,GAAG,SAAS;iBACxB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,SAAS,CAAC;iBACxC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAU,CAAC,CAAC;YAE5B,qBAAqB;YACrB,MAAM,SAAS,GAA2B,EAAE,CAAC;YAC7C,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;gBACzB,MAAM,IAAI,GAAG,CAAC,CAAC,SAAS,IAAI,qBAAa,CAAC,iBAAiB,CAAC;gBAC5D,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YAC/C,CAAC;YAED,KAAK,CAAC,IAAI,CAAC;gBACT,QAAQ;gBACR,UAAU,EAAE,SAAS,CAAC,MAAM;gBAC5B,YAAY,EAAE,SAAS,CAAC,MAAM;gBAC9B,YAAY,EAAE,QAAQ,CAAC,MAAM;gBAC7B,WAAW,EACT,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;gBAChE,YAAY,EACV,SAAS,CAAC,MAAM,GAAG,CAAC;oBAClB,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM,CAA
C;oBACrE,CAAC,CAAC,CAAC;gBACP,YAAY,EAAE,UAAU,CAAC,SAAS,EAAE,EAAE,CAAC;gBACvC,YAAY,EAAE,UAAU,CAAC,SAAS,EAAE,EAAE,CAAC;gBACvC,UAAU,EACR,SAAS,CAAC,MAAM,GAAG,CAAC;oBAClB,CAAC,CAAC,QAAQ,CAAC,MAAM,GAAG,SAAS,CAAC,MAAM;oBACpC,CAAC,CAAC,QAAQ,CAAC,MAAM;gBACrB,mBAAmB,EACjB,IAAI,CAAC,wBAAwB,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC;gBAClD,iBAAiB,EAAE,SAAS;gBAC5B,cAAc,EAAE,SAAS;aAC1B,CAAC,CAAC;QACL,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;CACF;AA7RD,8CA6RC;AAED;;;GAGG;AACH,SAAS,UAAU,CAAC,MAAgB,EAAE,CAAS;IAC7C,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAClC,MAAM,GAAG,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC3C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC/B,IAAI,KAAK,KAAK,KAAK;QAAE,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC;IACnD,MAAM,MAAM,GAAG,KAAK,GAAG,KAAK,CAAC;IAC7B,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC,CAAC;AACrE,CAAC"}
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import type { ToolErrorCode } from "../tools/types";
|
|
2
|
+
/**
|
|
3
|
+
* Types for the agent evaluation framework.
|
|
4
|
+
*
|
|
5
|
+
* Three layers of evaluation:
|
|
6
|
+
* 1. ToolCallEvaluator — per-tool-call metrics (observed via AgentHooks)
|
|
7
|
+
* 2. EvalRunner — end-to-end test cases with pass/fail criteria
|
|
8
|
+
* 3. Benchmark — regression testing across runs
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* A single tool call record captured by the ToolCallEvaluator hook. A record with no `endTime` is treated as still in flight.
|
|
12
|
+
*/
|
|
13
|
+
export interface ToolCallRecord {
|
|
14
|
+
/** The tool being called. */
|
|
15
|
+
toolName: string;
|
|
16
|
+
/** The unique ID assigned by the LLM to this tool call
|
|
17
|
+
* (from `response.tool_calls[].id`). Enables exact matching
|
|
18
|
+
* when the same tool is called multiple times in one batch. When absent, the evaluator matches the newest open record by tool name instead. */
|
|
19
|
+
toolCallId?: string;
|
|
20
|
+
/** Arguments passed to the tool. */
|
|
21
|
+
args: Record<string, unknown>;
|
|
22
|
+
/** ISO-8601 timestamp when execution started. */
|
|
23
|
+
startTime: string;
|
|
24
|
+
/** ISO-8601 timestamp when execution ended (set on success or error); absent while the call is still open. */
|
|
25
|
+
endTime?: string;
|
|
26
|
+
/** Wall-clock duration in milliseconds. Records where this is undefined contribute no latency sample to the per-tool statistics. */
|
|
27
|
+
latencyMs?: number;
|
|
28
|
+
/** Whether the tool executed successfully. */
|
|
29
|
+
success: boolean;
|
|
30
|
+
/** Error message if the tool failed. */
|
|
31
|
+
error?: string;
|
|
32
|
+
/** Machine-readable error code (SUCCESS if no error). A failed record without a code is tallied as EXECUTION_FAILURE in the error distribution. */
|
|
33
|
+
errorCode?: ToolErrorCode;
|
|
34
|
+
/** Attempt number within the current circuit-breaker cycle (1-based). */
|
|
35
|
+
attemptNumber: number;
|
|
36
|
+
/** Length of the result content in characters. */
|
|
37
|
+
resultLength?: number;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Aggregated per-tool statistics computed from the recorded calls of one tool; only completed calls (those with an `endTime`) are counted.
|
|
41
|
+
*/
|
|
42
|
+
export interface ToolCallStats {
|
|
43
|
+
/** Tool name. */
|
|
44
|
+
toolName: string;
|
|
45
|
+
/** Total number of completed call attempts (successes + failures). */
|
|
46
|
+
totalCalls: number;
|
|
47
|
+
/** Number of successful calls. */
|
|
48
|
+
successCount: number;
|
|
49
|
+
/** Number of failed calls. */
|
|
50
|
+
failureCount: number;
|
|
51
|
+
/** Success rate (0–1); reported as 1 when the tool has no completed calls. */
|
|
52
|
+
successRate: number;
|
|
53
|
+
/** Average latency in milliseconds, rounded to the nearest integer; 0 when there are no latency samples. */
|
|
54
|
+
avgLatencyMs: number;
|
|
55
|
+
/** Median (P50) latency in milliseconds (interpolated between closest ranks, then rounded; 0 with no samples). */
|
|
56
|
+
p50LatencyMs: number;
|
|
57
|
+
/** P99 latency in milliseconds (interpolated between closest ranks, then rounded; 0 with no samples). */
|
|
58
|
+
p99LatencyMs: number;
|
|
59
|
+
/** Retry estimate: failed calls per successful call, or the raw failure count when no call succeeded. */
|
|
60
|
+
avgRetries: number;
|
|
61
|
+
/** How many times the circuit breaker opened for this tool (0 when none were recorded). */
|
|
62
|
+
circuitBreakerTrips: number;
|
|
63
|
+
/** Distribution of error codes for failed calls: error code → number of failures. */
|
|
64
|
+
errorDistribution: Record<string, number>;
|
|
65
|
+
/** Raw latency samples (for percentile computation), in recording order — not sorted. */
|
|
66
|
+
latencySamples: number[];
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Overall scorecard produced at the end of a session: session-wide totals alongside the per-tool breakdown in `perTool`.
|
|
70
|
+
*/
|
|
71
|
+
export interface ToolCallScorecard {
|
|
72
|
+
/** Total tool calls across all tools. */
|
|
73
|
+
totalCalls: number;
|
|
74
|
+
/** Total successful calls. */
|
|
75
|
+
totalSuccesses: number;
|
|
76
|
+
/** Total failed calls. */
|
|
77
|
+
totalFailures: number;
|
|
78
|
+
/** Overall success rate (0–1). */
|
|
79
|
+
overallSuccessRate: number;
|
|
80
|
+
/** Overall average latency in milliseconds. */
|
|
81
|
+
avgLatencyMs: number;
|
|
82
|
+
/** How many distinct tools were called. */
|
|
83
|
+
uniqueToolsUsed: number;
|
|
84
|
+
/** How many tools had their circuit breaker open. NOTE(review): ToolCallStats.circuitBreakerTrips counts trips, not tools — confirm which this total represents. */
|
|
85
|
+
circuitBreakerTrips: number;
|
|
86
|
+
/** Per-tool statistics, sorted by call count descending. */
|
|
87
|
+
perTool: ToolCallStats[];
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* A single evaluation test case. Only `name` and `input` are required; every check and limit below is optional.
|
|
91
|
+
*/
|
|
92
|
+
export interface EvalCase {
|
|
93
|
+
/** Human-readable name for this test case. */
|
|
94
|
+
name: string;
|
|
95
|
+
/** The user input / prompt to send to the agent. */
|
|
96
|
+
input: string;
|
|
97
|
+
/**
|
|
98
|
+
* Tools that SHOULD be called during execution.
|
|
99
|
+
* The test passes if ALL expected tools are called at least once.
|
|
100
|
+
* Leave empty to skip this check.
|
|
101
|
+
*/
|
|
102
|
+
expectedTools?: string[];
|
|
103
|
+
/**
|
|
104
|
+
* Tools that should NOT be called during execution.
|
|
105
|
+
* The test fails if ANY forbidden tool is called.
|
|
106
|
+
*/
|
|
107
|
+
forbiddenTools?: string[];
|
|
108
|
+
/**
|
|
109
|
+
* Pattern that the final answer should match: either a string (checked as a substring) or a RegExp.
|
|
110
|
+
* The test passes if the answer contains this pattern.
|
|
111
|
+
* Leave undefined to skip output validation.
|
|
112
|
+
*/
|
|
113
|
+
expectedOutput?: string | RegExp;
|
|
114
|
+
/**
|
|
115
|
+
* Maximum iterations for this case (overrides agent default).
|
|
116
|
+
*/
|
|
117
|
+
maxIterations?: number;
|
|
118
|
+
/**
|
|
119
|
+
* Timeout in milliseconds (default: 120_000).
|
|
120
|
+
*/
|
|
121
|
+
timeoutMs?: number;
|
|
122
|
+
}
|
|
123
|
+
/**
|
|
124
|
+
* LLM-based quality judgment of an agent's final answer; carried on EvalResult as the optional `llmJudgment` field.
|
|
125
|
+
*/
|
|
126
|
+
export interface LLMEvalJudgment {
|
|
127
|
+
/** Whether the LLM judge considers the answer satisfactory. */
|
|
128
|
+
passed: boolean;
|
|
129
|
+
/** 0–100 quality score. */
|
|
130
|
+
score: number;
|
|
131
|
+
/** Brief explanation of the judgment. */
|
|
132
|
+
reasoning: string;
|
|
133
|
+
/** Specific issues identified (empty if none). */
|
|
134
|
+
issues: string[];
|
|
135
|
+
}
|
|
136
|
+
/**
|
|
137
|
+
* Result of running a single evaluation case. Every field is required except `llmJudgment`.
|
|
138
|
+
*/
|
|
139
|
+
export interface EvalResult {
|
|
140
|
+
/** The case name (from EvalCase). */
|
|
141
|
+
caseName: string;
|
|
142
|
+
/** Whether the case passed all checks. */
|
|
143
|
+
passed: boolean;
|
|
144
|
+
/** The agent's final answer. */
|
|
145
|
+
answer: string;
|
|
146
|
+
/** Tool names called during execution (in order). */
|
|
147
|
+
toolCalls: string[];
|
|
148
|
+
/** Number of ReAct/PlanSolve iterations consumed. */
|
|
149
|
+
iterations: number;
|
|
150
|
+
/** Wall-clock duration in milliseconds. */
|
|
151
|
+
durationMs: number;
|
|
152
|
+
/** Tool call scorecard for this run. */
|
|
153
|
+
scorecard: ToolCallScorecard;
|
|
154
|
+
/** Optional LLM-based quality judgment. */
|
|
155
|
+
llmJudgment?: LLMEvalJudgment;
|
|
156
|
+
/** Failure reasons (empty if passed). */
|
|
157
|
+
failures: string[];
|
|
158
|
+
}
|
|
159
|
+
/**
|
|
160
|
+
* A regression — something that got worse compared to the baseline. The baseline and current values may each be a number or a string.
|
|
161
|
+
*/
|
|
162
|
+
export interface Regression {
|
|
163
|
+
/** What regressed (tool name, metric name, or case name). */
|
|
164
|
+
target: string;
|
|
165
|
+
/** The metric that changed. */
|
|
166
|
+
metric: string;
|
|
167
|
+
/** Baseline value. */
|
|
168
|
+
baseline: number | string;
|
|
169
|
+
/** Current value. */
|
|
170
|
+
current: number | string;
|
|
171
|
+
/** Human-readable description. */
|
|
172
|
+
description: string;
|
|
173
|
+
}
|
|
174
|
+
/**
|
|
175
|
+
* An improvement — something that got better compared to the baseline. Same fields as Regression, minus `description`.
|
|
176
|
+
*/
|
|
177
|
+
export interface Improvement {
|
|
178
|
+
/** What improved. */
|
|
179
|
+
target: string;
|
|
180
|
+
/** The metric that changed. */
|
|
181
|
+
metric: string;
|
|
182
|
+
/** Baseline value. */
|
|
183
|
+
baseline: number | string;
|
|
184
|
+
/** Current value. */
|
|
185
|
+
current: number | string;
|
|
186
|
+
}
|
|
187
|
+
/**
|
|
188
|
+
* Summary of a benchmark run: pass counts, per-case averages, and the regressions and improvements found against a baseline.
|
|
189
|
+
*/
|
|
190
|
+
export interface BenchmarkSummary {
|
|
191
|
+
/** Benchmark name. */
|
|
192
|
+
name: string;
|
|
193
|
+
/** ISO-8601 timestamp of the run. */
|
|
194
|
+
timestamp: string;
|
|
195
|
+
/** Number of cases that passed. */
|
|
196
|
+
passed: number;
|
|
197
|
+
/** Total number of cases. */
|
|
198
|
+
total: number;
|
|
199
|
+
/** Pass rate (0–1); presumably `passed / total` — confirm against the benchmark runner. */
|
|
200
|
+
passRate: number;
|
|
201
|
+
/** Average tool calls per case. */
|
|
202
|
+
avgToolCallsPerCase: number;
|
|
203
|
+
/** Average latency per case in milliseconds. */
|
|
204
|
+
avgLatencyMs: number;
|
|
205
|
+
/** Regressions vs baseline (empty if no baseline). */
|
|
206
|
+
regressions: Regression[];
|
|
207
|
+
/** Improvements vs baseline (empty if no baseline). */
|
|
208
|
+
improvements: Improvement[];
|
|
209
|
+
}
|
|
210
|
+
/**
|
|
211
|
+
* Full benchmark result: the aggregate summary plus the individual EvalResult entries in `cases`.
|
|
212
|
+
*/
|
|
213
|
+
export interface BenchmarkResult {
|
|
214
|
+
/** Summary stats. */
|
|
215
|
+
summary: BenchmarkSummary;
|
|
216
|
+
/** Per-case results. */
|
|
217
|
+
cases: EvalResult[];
|
|
218
|
+
}
|
|
219
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/eval/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAEpD;;;;;;;GAOG;AAIH;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,6BAA6B;IAC7B,QAAQ,EAAE,MAAM,CAAC;IACjB;;oEAEgE;IAChE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,oCAAoC;IACpC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC9B,iDAAiD;IACjD,SAAS,EAAE,MAAM,CAAC;IAClB,yEAAyE;IACzE,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2CAA2C;IAC3C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,8CAA8C;IAC9C,OAAO,EAAE,OAAO,CAAC;IACjB,wCAAwC;IACxC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,yDAAyD;IACzD,SAAS,CAAC,EAAE,aAAa,CAAC;IAC1B,yEAAyE;IACzE,aAAa,EAAE,MAAM,CAAC;IACtB,kDAAkD;IAClD,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,iBAAiB;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,4DAA4D;IAC5D,UAAU,EAAE,MAAM,CAAC;IACnB,kCAAkC;IAClC,YAAY,EAAE,MAAM,CAAC;IACrB,8BAA8B;IAC9B,YAAY,EAAE,MAAM,CAAC;IACrB,0BAA0B;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB,uCAAuC;IACvC,YAAY,EAAE,MAAM,CAAC;IACrB,4CAA4C;IAC5C,YAAY,EAAE,MAAM,CAAC;IACrB,mCAAmC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,UAAU,EAAE,MAAM,CAAC;IACnB,+DAA+D;IAC/D,mBAAmB,EAAE,MAAM,CAAC;IAC5B,oDAAoD;IACpD,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,wDAAwD;IACxD,cAAc,EAAE,MAAM,EAAE,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,yCAAyC;IACzC,UAAU,EAAE,MAAM,CAAC;IACnB,8BAA8B;IAC9B,cAAc,EAAE,MAAM,CAAC;IACvB,0BAA0B;IAC1B,aAAa,EAAE,MAAM,CAAC;IACtB,kCAAkC;IAClC,kBAAkB,EAAE,MAAM,CAAC;IAC3B,+CAA+C;IAC/C,YAAY,EAAE,MAAM,CAAC;IACrB,2CAA2C;IAC3C,eAAe,EAAE,MAAM,CAAC;IACxB,qDAAqD;IACrD,mBAAmB,EAAE,MAAM,CAAC;IAC5B,4DAA4D;IAC5D,OAAO,EAAE,aAAa,EAAE,CAAC;CAC1B;AAID;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,8CAA8C;IAC9C,IAAI,EAAE,MAAM,CAAC;IACb,oDAAoD;IACpD,KAAK,EAAE,MAAM,CAAC;IACd;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B;;;;OAIG;IACH,cAAc,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC;IACjC;;OAEG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,+DAA+D;IAC/D,MAAM,EAA
E,OAAO,CAAC;IAChB,2BAA2B;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,yCAAyC;IACzC,SAAS,EAAE,MAAM,CAAC;IAClB,kDAAkD;IAClD,MAAM,EAAE,MAAM,EAAE,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,qCAAqC;IACrC,QAAQ,EAAE,MAAM,CAAC;IACjB,0CAA0C;IAC1C,MAAM,EAAE,OAAO,CAAC;IAChB,gCAAgC;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,qDAAqD;IACrD,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,qDAAqD;IACrD,UAAU,EAAE,MAAM,CAAC;IACnB,2CAA2C;IAC3C,UAAU,EAAE,MAAM,CAAC;IACnB,wCAAwC;IACxC,SAAS,EAAE,iBAAiB,CAAC;IAC7B,2CAA2C;IAC3C,WAAW,CAAC,EAAE,eAAe,CAAC;IAC9B,yCAAyC;IACzC,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAID;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,6DAA6D;IAC7D,MAAM,EAAE,MAAM,CAAC;IACf,+BAA+B;IAC/B,MAAM,EAAE,MAAM,CAAC;IACf,sBAAsB;IACtB,QAAQ,EAAE,MAAM,GAAG,MAAM,CAAC;IAC1B,qBAAqB;IACrB,OAAO,EAAE,MAAM,GAAG,MAAM,CAAC;IACzB,kCAAkC;IAClC,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,MAAM,EAAE,MAAM,CAAC;IACf,+BAA+B;IAC/B,MAAM,EAAE,MAAM,CAAC;IACf,sBAAsB;IACtB,QAAQ,EAAE,MAAM,GAAG,MAAM,CAAC;IAC1B,qBAAqB;IACrB,OAAO,EAAE,MAAM,GAAG,MAAM,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,sBAAsB;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,SAAS,EAAE,MAAM,CAAC;IAClB,mCAAmC;IACnC,MAAM,EAAE,MAAM,CAAC;IACf,6BAA6B;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,uBAAuB;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,mCAAmC;IACnC,mBAAmB,EAAE,MAAM,CAAC;IAC5B,gDAAgD;IAChD,YAAY,EAAE,MAAM,CAAC;IACrB,sDAAsD;IACtD,WAAW,EAAE,UAAU,EAAE,CAAC;IAC1B,uDAAuD;IACvD,YAAY,EAAE,WAAW,EAAE,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,qBAAqB;IACrB,OAAO,EAAE,gBAAgB,CAAC;IAC1B,wBAAwB;IACxB,KAAK,EAAE,UAAU,EAAE,CAAC;CACrB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/eval/types.ts"],"names":[],"mappings":""}
|