kagent-ts 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +68 -21
- package/README.md +27 -371
- package/dist/compression/progressive-compressor.d.ts +66 -0
- package/dist/compression/progressive-compressor.d.ts.map +1 -0
- package/dist/compression/progressive-compressor.js +367 -0
- package/dist/compression/progressive-compressor.js.map +1 -0
- package/dist/compression/types.d.ts +1 -5
- package/dist/compression/types.d.ts.map +1 -1
- package/dist/context/context-manager.d.ts +34 -15
- package/dist/context/context-manager.d.ts.map +1 -1
- package/dist/context/context-manager.js +78 -28
- package/dist/context/context-manager.js.map +1 -1
- package/dist/context/types.d.ts +20 -4
- package/dist/context/types.d.ts.map +1 -1
- package/dist/core/agent.d.ts +354 -25
- package/dist/core/agent.d.ts.map +1 -1
- package/dist/core/agent.js +646 -64
- package/dist/core/agent.js.map +1 -1
- package/dist/core/fusion-agent.d.ts +207 -0
- package/dist/core/fusion-agent.d.ts.map +1 -0
- package/dist/core/fusion-agent.js +769 -0
- package/dist/core/fusion-agent.js.map +1 -0
- package/dist/core/hooks.d.ts +19 -7
- package/dist/core/hooks.d.ts.map +1 -1
- package/dist/core/plan-solve-agent.d.ts +1 -15
- package/dist/core/plan-solve-agent.d.ts.map +1 -1
- package/dist/core/plan-solve-agent.js +142 -117
- package/dist/core/plan-solve-agent.js.map +1 -1
- package/dist/core/react-agent.d.ts +0 -13
- package/dist/core/react-agent.d.ts.map +1 -1
- package/dist/core/react-agent.js +127 -102
- package/dist/core/react-agent.js.map +1 -1
- package/dist/core/response-schema.d.ts +65 -0
- package/dist/core/response-schema.d.ts.map +1 -1
- package/dist/core/response-schema.js +174 -1
- package/dist/core/response-schema.js.map +1 -1
- package/dist/core/system-prompts.d.ts +27 -0
- package/dist/core/system-prompts.d.ts.map +1 -0
- package/dist/core/system-prompts.js +112 -0
- package/dist/core/system-prompts.js.map +1 -0
- package/dist/eval/benchmark.d.ts +81 -0
- package/dist/eval/benchmark.d.ts.map +1 -0
- package/dist/eval/benchmark.js +292 -0
- package/dist/eval/benchmark.js.map +1 -0
- package/dist/eval/eval-runner.d.ts +79 -0
- package/dist/eval/eval-runner.d.ts.map +1 -0
- package/dist/eval/eval-runner.js +252 -0
- package/dist/eval/eval-runner.js.map +1 -0
- package/dist/eval/index.d.ts +7 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +13 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/tool-call-evaluator.d.ts +72 -0
- package/dist/eval/tool-call-evaluator.d.ts.map +1 -0
- package/dist/eval/tool-call-evaluator.js +265 -0
- package/dist/eval/tool-call-evaluator.js.map +1 -0
- package/dist/eval/types.d.ts +219 -0
- package/dist/eval/types.d.ts.map +1 -0
- package/dist/eval/types.js +3 -0
- package/dist/eval/types.js.map +1 -0
- package/dist/index.d.ts +58 -14
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +116 -8
- package/dist/index.js.map +1 -1
- package/dist/llm/anthropic-provider.d.ts +141 -0
- package/dist/llm/anthropic-provider.d.ts.map +1 -0
- package/dist/llm/anthropic-provider.js +486 -0
- package/dist/llm/anthropic-provider.js.map +1 -0
- package/dist/llm/errors.d.ts +26 -0
- package/dist/llm/errors.d.ts.map +1 -0
- package/dist/llm/errors.js +19 -0
- package/dist/llm/errors.js.map +1 -0
- package/dist/llm/factory.d.ts +73 -0
- package/dist/llm/factory.d.ts.map +1 -0
- package/dist/llm/factory.js +77 -0
- package/dist/llm/factory.js.map +1 -0
- package/dist/llm/fallback-provider.d.ts +47 -0
- package/dist/llm/fallback-provider.d.ts.map +1 -0
- package/dist/llm/fallback-provider.js +91 -0
- package/dist/llm/fallback-provider.js.map +1 -0
- package/dist/llm/interface.d.ts +54 -11
- package/dist/llm/interface.d.ts.map +1 -1
- package/dist/llm/interface.js +34 -0
- package/dist/llm/interface.js.map +1 -1
- package/dist/llm/model-router.d.ts +126 -0
- package/dist/llm/model-router.d.ts.map +1 -0
- package/dist/llm/model-router.js +178 -0
- package/dist/llm/model-router.js.map +1 -0
- package/dist/llm/openai-provider.d.ts +8 -32
- package/dist/llm/openai-provider.d.ts.map +1 -1
- package/dist/llm/openai-provider.js +27 -60
- package/dist/llm/openai-provider.js.map +1 -1
- package/dist/llm/rate-limiter.d.ts +41 -0
- package/dist/llm/rate-limiter.d.ts.map +1 -0
- package/dist/llm/rate-limiter.js +93 -0
- package/dist/llm/rate-limiter.js.map +1 -0
- package/dist/llm/retry.d.ts +26 -0
- package/dist/llm/retry.d.ts.map +1 -0
- package/dist/llm/retry.js +44 -0
- package/dist/llm/retry.js.map +1 -0
- package/dist/llm/token-budget.d.ts +97 -0
- package/dist/llm/token-budget.d.ts.map +1 -0
- package/dist/llm/token-budget.js +115 -0
- package/dist/llm/token-budget.js.map +1 -0
- package/dist/logging/index.d.ts +2 -0
- package/dist/logging/index.d.ts.map +1 -0
- package/dist/logging/index.js +7 -0
- package/dist/logging/index.js.map +1 -0
- package/dist/logging/logger.d.ts +38 -0
- package/dist/logging/logger.d.ts.map +1 -0
- package/dist/logging/logger.js +34 -0
- package/dist/logging/logger.js.map +1 -0
- package/dist/mcp/mcp-client-manager.d.ts +10 -2
- package/dist/mcp/mcp-client-manager.d.ts.map +1 -1
- package/dist/mcp/mcp-client-manager.js +20 -9
- package/dist/mcp/mcp-client-manager.js.map +1 -1
- package/dist/memory/index.d.ts +3 -0
- package/dist/memory/index.d.ts.map +1 -0
- package/dist/memory/index.js +6 -0
- package/dist/memory/index.js.map +1 -0
- package/dist/memory/memory-manager.d.ts +119 -0
- package/dist/memory/memory-manager.d.ts.map +1 -0
- package/dist/memory/memory-manager.js +334 -0
- package/dist/memory/memory-manager.js.map +1 -0
- package/dist/messages/types.d.ts +2 -0
- package/dist/messages/types.d.ts.map +1 -1
- package/dist/orchestrator/index.d.ts +5 -0
- package/dist/orchestrator/index.d.ts.map +1 -0
- package/dist/orchestrator/index.js +13 -0
- package/dist/orchestrator/index.js.map +1 -0
- package/dist/orchestrator/json-extractor.d.ts +18 -0
- package/dist/orchestrator/json-extractor.d.ts.map +1 -0
- package/dist/orchestrator/json-extractor.js +111 -0
- package/dist/orchestrator/json-extractor.js.map +1 -0
- package/dist/orchestrator/orchestrator-agent.d.ts +152 -0
- package/dist/orchestrator/orchestrator-agent.d.ts.map +1 -0
- package/dist/orchestrator/orchestrator-agent.js +675 -0
- package/dist/orchestrator/orchestrator-agent.js.map +1 -0
- package/dist/orchestrator/orchestrator-response.d.ts +40 -0
- package/dist/orchestrator/orchestrator-response.d.ts.map +1 -0
- package/dist/orchestrator/orchestrator-response.js +275 -0
- package/dist/orchestrator/orchestrator-response.js.map +1 -0
- package/dist/orchestrator/orchestrator-types.d.ts +116 -0
- package/dist/orchestrator/orchestrator-types.d.ts.map +1 -0
- package/dist/orchestrator/orchestrator-types.js +3 -0
- package/dist/orchestrator/orchestrator-types.js.map +1 -0
- package/dist/preferences/preference-manager.d.ts +8 -3
- package/dist/preferences/preference-manager.d.ts.map +1 -1
- package/dist/preferences/preference-manager.js +17 -4
- package/dist/preferences/preference-manager.js.map +1 -1
- package/dist/rag/chroma-store.d.ts +52 -0
- package/dist/rag/chroma-store.d.ts.map +1 -0
- package/dist/rag/chroma-store.js +110 -0
- package/dist/rag/chroma-store.js.map +1 -0
- package/dist/rag/document-loader.d.ts +21 -0
- package/dist/rag/document-loader.d.ts.map +1 -0
- package/dist/rag/document-loader.js +129 -0
- package/dist/rag/document-loader.js.map +1 -0
- package/dist/rag/embedding-provider.d.ts +36 -0
- package/dist/rag/embedding-provider.d.ts.map +1 -0
- package/dist/rag/embedding-provider.js +74 -0
- package/dist/rag/embedding-provider.js.map +1 -0
- package/dist/rag/index.d.ts +17 -0
- package/dist/rag/index.d.ts.map +1 -0
- package/dist/rag/index.js +27 -0
- package/dist/rag/index.js.map +1 -0
- package/dist/rag/keyword-index.d.ts +53 -0
- package/dist/rag/keyword-index.d.ts.map +1 -0
- package/dist/rag/keyword-index.js +161 -0
- package/dist/rag/keyword-index.js.map +1 -0
- package/dist/rag/llm-reranker.d.ts +36 -0
- package/dist/rag/llm-reranker.d.ts.map +1 -0
- package/dist/rag/llm-reranker.js +95 -0
- package/dist/rag/llm-reranker.js.map +1 -0
- package/dist/rag/rag-manager.d.ts +54 -0
- package/dist/rag/rag-manager.d.ts.map +1 -0
- package/dist/rag/rag-manager.js +179 -0
- package/dist/rag/rag-manager.js.map +1 -0
- package/dist/rag/rag-types.d.ts +143 -0
- package/dist/rag/rag-types.d.ts.map +1 -0
- package/dist/rag/rag-types.js +9 -0
- package/dist/rag/rag-types.js.map +1 -0
- package/dist/rag/rrf.d.ts +47 -0
- package/dist/rag/rrf.d.ts.map +1 -0
- package/dist/rag/rrf.js +70 -0
- package/dist/rag/rrf.js.map +1 -0
- package/dist/rag/search-knowledge.d.ts +24 -0
- package/dist/rag/search-knowledge.d.ts.map +1 -0
- package/dist/rag/search-knowledge.js +86 -0
- package/dist/rag/search-knowledge.js.map +1 -0
- package/dist/rag/text-splitter.d.ts +25 -0
- package/dist/rag/text-splitter.d.ts.map +1 -0
- package/dist/rag/text-splitter.js +136 -0
- package/dist/rag/text-splitter.js.map +1 -0
- package/dist/rag/vector-store.d.ts +34 -0
- package/dist/rag/vector-store.d.ts.map +1 -0
- package/dist/rag/vector-store.js +73 -0
- package/dist/rag/vector-store.js.map +1 -0
- package/dist/reflection/error-notebook.d.ts +125 -0
- package/dist/reflection/error-notebook.d.ts.map +1 -0
- package/dist/reflection/error-notebook.js +368 -0
- package/dist/reflection/error-notebook.js.map +1 -0
- package/dist/reflection/index.d.ts +8 -0
- package/dist/reflection/index.d.ts.map +1 -0
- package/dist/reflection/index.js +12 -0
- package/dist/reflection/index.js.map +1 -0
- package/dist/reflection/memory-reflector.d.ts +97 -0
- package/dist/reflection/memory-reflector.d.ts.map +1 -0
- package/dist/reflection/memory-reflector.js +215 -0
- package/dist/reflection/memory-reflector.js.map +1 -0
- package/dist/reflection/reflection-agent.d.ts +105 -0
- package/dist/reflection/reflection-agent.d.ts.map +1 -0
- package/dist/reflection/reflection-agent.js +234 -0
- package/dist/reflection/reflection-agent.js.map +1 -0
- package/dist/reflection/reflection-hook.d.ts +50 -0
- package/dist/reflection/reflection-hook.d.ts.map +1 -0
- package/dist/reflection/reflection-hook.js +108 -0
- package/dist/reflection/reflection-hook.js.map +1 -0
- package/dist/rules/project-rules.d.ts +47 -0
- package/dist/rules/project-rules.d.ts.map +1 -0
- package/dist/rules/project-rules.js +166 -0
- package/dist/rules/project-rules.js.map +1 -0
- package/dist/security/boundaries.d.ts +81 -0
- package/dist/security/boundaries.d.ts.map +1 -0
- package/dist/security/boundaries.js +158 -0
- package/dist/security/boundaries.js.map +1 -0
- package/dist/security/index.d.ts +2 -0
- package/dist/security/index.d.ts.map +1 -0
- package/dist/security/index.js +11 -0
- package/dist/security/index.js.map +1 -0
- package/dist/session/session-types.d.ts +25 -4
- package/dist/session/session-types.d.ts.map +1 -1
- package/dist/skills/file-skill-loader.d.ts +4 -6
- package/dist/skills/file-skill-loader.d.ts.map +1 -1
- package/dist/skills/file-skill-loader.js +8 -19
- package/dist/skills/file-skill-loader.js.map +1 -1
- package/dist/skills/index.d.ts +1 -1
- package/dist/skills/index.d.ts.map +1 -1
- package/dist/skills/index.js +1 -2
- package/dist/skills/index.js.map +1 -1
- package/dist/skills/skill-manager.d.ts +18 -8
- package/dist/skills/skill-manager.d.ts.map +1 -1
- package/dist/skills/skill-manager.js +58 -36
- package/dist/skills/skill-manager.js.map +1 -1
- package/dist/skills/types.d.ts +3 -8
- package/dist/skills/types.d.ts.map +1 -1
- package/dist/subagent/index.d.ts +4 -0
- package/dist/subagent/index.d.ts.map +1 -0
- package/dist/subagent/index.js +8 -0
- package/dist/subagent/index.js.map +1 -0
- package/dist/subagent/subagent-loader.d.ts +53 -0
- package/dist/subagent/subagent-loader.d.ts.map +1 -0
- package/dist/subagent/subagent-loader.js +155 -0
- package/dist/subagent/subagent-loader.js.map +1 -0
- package/dist/subagent/subagent-manager.d.ts +161 -0
- package/dist/subagent/subagent-manager.d.ts.map +1 -0
- package/dist/subagent/subagent-manager.js +468 -0
- package/dist/subagent/subagent-manager.js.map +1 -0
- package/dist/subagent/subagent-types.d.ts +77 -0
- package/dist/subagent/subagent-types.d.ts.map +1 -0
- package/dist/subagent/subagent-types.js +3 -0
- package/dist/subagent/subagent-types.js.map +1 -0
- package/dist/tools/builtin/bash.d.ts +3 -0
- package/dist/tools/builtin/bash.d.ts.map +1 -0
- package/dist/tools/builtin/bash.js +87 -0
- package/dist/tools/builtin/bash.js.map +1 -0
- package/dist/tools/builtin/edit-file.d.ts.map +1 -1
- package/dist/tools/builtin/edit-file.js +1 -0
- package/dist/tools/builtin/edit-file.js.map +1 -1
- package/dist/tools/builtin/index.d.ts +14 -0
- package/dist/tools/builtin/index.d.ts.map +1 -1
- package/dist/tools/builtin/index.js +45 -1
- package/dist/tools/builtin/index.js.map +1 -1
- package/dist/tools/builtin/list-errors.d.ts +7 -0
- package/dist/tools/builtin/list-errors.d.ts.map +1 -0
- package/dist/tools/builtin/list-errors.js +64 -0
- package/dist/tools/builtin/list-errors.js.map +1 -0
- package/dist/tools/builtin/list-subagents.d.ts +7 -0
- package/dist/tools/builtin/list-subagents.d.ts.map +1 -0
- package/dist/tools/builtin/list-subagents.js +21 -0
- package/dist/tools/builtin/list-subagents.js.map +1 -0
- package/dist/tools/builtin/recall.d.ts +11 -0
- package/dist/tools/builtin/recall.d.ts.map +1 -0
- package/dist/tools/builtin/recall.js +60 -0
- package/dist/tools/builtin/recall.js.map +1 -0
- package/dist/tools/builtin/remember.d.ts +12 -0
- package/dist/tools/builtin/remember.d.ts.map +1 -0
- package/dist/tools/builtin/remember.js +72 -0
- package/dist/tools/builtin/remember.js.map +1 -0
- package/dist/tools/builtin/skill.d.ts +14 -0
- package/dist/tools/builtin/skill.d.ts.map +1 -0
- package/dist/tools/builtin/skill.js +71 -0
- package/dist/tools/builtin/skill.js.map +1 -0
- package/dist/tools/builtin/spawn-subagent.d.ts +7 -0
- package/dist/tools/builtin/spawn-subagent.d.ts.map +1 -0
- package/dist/tools/builtin/spawn-subagent.js +43 -0
- package/dist/tools/builtin/spawn-subagent.js.map +1 -0
- package/dist/tools/builtin/web-fetch.d.ts +3 -0
- package/dist/tools/builtin/web-fetch.d.ts.map +1 -0
- package/dist/tools/builtin/web-fetch.js +101 -0
- package/dist/tools/builtin/web-fetch.js.map +1 -0
- package/dist/tools/builtin/write-file.d.ts.map +1 -1
- package/dist/tools/builtin/write-file.js +1 -0
- package/dist/tools/builtin/write-file.js.map +1 -1
- package/dist/tools/circuit-breaker.d.ts +19 -10
- package/dist/tools/circuit-breaker.d.ts.map +1 -1
- package/dist/tools/circuit-breaker.js +22 -11
- package/dist/tools/circuit-breaker.js.map +1 -1
- package/dist/tools/error-tracker.d.ts +28 -44
- package/dist/tools/error-tracker.d.ts.map +1 -1
- package/dist/tools/error-tracker.js +39 -156
- package/dist/tools/error-tracker.js.map +1 -1
- package/dist/tools/tool-filter.d.ts +70 -0
- package/dist/tools/tool-filter.d.ts.map +1 -0
- package/dist/tools/tool-filter.js +92 -0
- package/dist/tools/tool-filter.js.map +1 -0
- package/dist/tools/tool-output-truncator.d.ts +36 -0
- package/dist/tools/tool-output-truncator.d.ts.map +1 -0
- package/dist/tools/tool-output-truncator.js +117 -0
- package/dist/tools/tool-output-truncator.js.map +1 -0
- package/dist/tools/tool-registry.d.ts +25 -9
- package/dist/tools/tool-registry.d.ts.map +1 -1
- package/dist/tools/tool-registry.js +77 -28
- package/dist/tools/tool-registry.js.map +1 -1
- package/dist/tools/tool-validator.d.ts +13 -0
- package/dist/tools/tool-validator.d.ts.map +1 -0
- package/dist/tools/tool-validator.js +116 -0
- package/dist/tools/tool-validator.js.map +1 -0
- package/dist/tools/types.d.ts +86 -3
- package/dist/tools/types.d.ts.map +1 -1
- package/dist/tools/types.js +51 -2
- package/dist/tools/types.js.map +1 -1
- package/dist/trace/trace-logger.d.ts +30 -4
- package/dist/trace/trace-logger.d.ts.map +1 -1
- package/dist/trace/trace-logger.js +82 -6
- package/dist/trace/trace-logger.js.map +1 -1
- package/package.json +13 -4
- package/dist/compression/sliding-window.d.ts +0 -21
- package/dist/compression/sliding-window.d.ts.map +0 -1
- package/dist/compression/sliding-window.js +0 -44
- package/dist/compression/sliding-window.js.map +0 -1
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Import-interop helper: re-exposes property `k` of module object `m` on `o` under the name `k2`
// (`k2` defaults to `k`). An already-defined `this.__createBinding` is reused when present.
// When Object.create exists, the property is defined from m's own descriptor, except that an
// enumerable getter returning m[k] is substituted when the descriptor is missing, when it is an
// accessor and `m` is not flagged __esModule, or when it is a writable/configurable data property.
// Otherwise the value is copied with a plain assignment.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
// Import-interop helper: stores `v` as the "default" property of `o`. An already-defined
// `this.__setModuleDefault` is reused when present. When Object.create exists the property is
// defined as enumerable via Object.defineProperty; otherwise it is set by plain assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
// Import-interop helper: wraps the result of require() in a namespace-style object.
// An already-defined `this.__importStar` is reused when present. A module flagged __esModule is
// returned unchanged. Otherwise a fresh object is built: every own property name of `mod` except
// "default" is bound onto it via __createBinding, and `mod` itself becomes its "default".
// `ownKeys` replaces itself on first call with Object.getOwnPropertyNames, or with a for-in +
// hasOwnProperty fallback when that function is unavailable.
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.Benchmark = void 0;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const eval_runner_1 = require("./eval-runner");
|
|
40
|
+
// ─── Regression Detection Thresholds ─────────────────────────────────────
|
|
41
|
+
/** Minimum drop in overall pass rate to flag as a regression, expressed as a fraction (0.05 = 5 percentage points). The same value is used as the minimum gain to report an improvement. */
|
|
42
|
+
const SUCCESS_RATE_REGRESSION_THRESHOLD = 0.05;
|
|
43
|
+
/** Minimum increase in a single case's failure count, relative to its baseline, to flag as a regression (absolute count). */
|
|
44
|
+
const FAILURE_COUNT_REGRESSION_THRESHOLD = 2;
|
|
45
|
+
/** Factor by which average latency must exceed the baseline to flag as a regression (checked together with the absolute minimum below). */
|
|
46
|
+
const LATENCY_REGRESSION_FACTOR = 1.5;
|
|
47
|
+
/** Minimum absolute increase in average latency to flag (ms); both latency conditions must hold. */
|
|
48
|
+
const LATENCY_MIN_ABSOLUTE_INCREASE_MS = 1000;
|
|
49
|
+
/**
|
|
50
|
+
* Benchmark — runs evaluation cases against an agent, compares with
|
|
51
|
+
* baseline results, and flags regressions & improvements.
|
|
52
|
+
*
|
|
53
|
+
* Usage:
|
|
54
|
+
* ```ts
|
|
55
|
+
* const benchmark = new Benchmark({
|
|
56
|
+
* name: "tool-calling-v2",
|
|
57
|
+
* agentFactory: (evaluator) => new ReActAgent({ llm, hooks: [evaluator] }),
|
|
58
|
+
* cases: myEvalCases,
|
|
59
|
+
* baselinePath: ".kagent-benchmarks/tool-calling-v1.json",
|
|
60
|
+
* });
|
|
61
|
+
*
|
|
62
|
+
* const result = await benchmark.run();
|
|
63
|
+
* console.log(benchmark.generateReport(result));
|
|
64
|
+
* ```
|
|
65
|
+
*/
|
|
66
|
+
class Benchmark {
    /** Benchmark configuration exactly as passed to the constructor. */
    config;
    /** Absolute path of the directory where run results are written. */
    outputDir;
    /**
     * @param {object} config - Benchmark configuration. Fields read by this class:
     *   `name`, `cases`, `agentFactory`, and the optional `runner`,
     *   `baselinePath` and `outputDir` (defaults to ".kagent-benchmarks").
     */
    constructor(config) {
        this.config = config;
        this.outputDir = path.resolve(config.outputDir ?? ".kagent-benchmarks");
    }
    /**
     * Run the benchmark and return results.
     *
     * Results are automatically persisted to disk so they can serve
     * as the baseline for future runs.
     *
     * @returns {Promise<{summary: object, cases: Array}>} The summary plus the
     *   per-case results produced by the runner.
     */
    async run() {
        const runner = this.config.runner ?? new eval_runner_1.EvalRunner();
        const cases = this.config.cases;
        const timestamp = new Date().toISOString();
        // Run all cases
        const results = await runner.run(this.config.agentFactory, cases);
        // Build summary
        const summary = this.buildSummary(results, timestamp);
        // Compare against baseline if one is configured and usable
        // (loadBaseline returns null when no path is set).
        const baseline = this.loadBaseline();
        if (baseline) {
            this.compareWithBaseline(summary, baseline, results);
        }
        const result = { summary, cases: results };
        // Persist for future baseline use
        this.persistResult(result);
        return result;
    }
    /**
     * Generate a Markdown comparison report.
     *
     * Values placed in table cells are escaped so that a `|` or a line break
     * inside a case name or failure description cannot split the row.
     *
     * @param {{summary: object, cases: Array}} result - A result as returned by `run()`.
     * @returns {string} The Markdown report.
     */
    generateReport(result) {
        const cell = Benchmark.escapeTableCell;
        const s = result.summary;
        const passRate = (s.passRate * 100).toFixed(1);
        let report = `# Benchmark: ${s.name}\n\n`;
        report += `**Run at:** ${s.timestamp}\n\n`;
        report += `## Summary\n\n`;
        report += `| Metric | Value |\n`;
        report += `|--------|-------|\n`;
        report += `| Pass Rate | ${passRate}% (${s.passed}/${s.total}) |\n`;
        report += `| Avg Tool Calls / Case | ${s.avgToolCallsPerCase.toFixed(1)} |\n`;
        report += `| Avg Duration | ${s.avgLatencyMs}ms |\n\n`;
        // Regressions (most important — show first)
        if (s.regressions.length > 0) {
            report += `## ⚠️ Regressions\n\n`;
            report += `| Target | Metric | Baseline | Current | Details |\n`;
            report += `|--------|--------|----------|---------|----------|\n`;
            for (const r of s.regressions) {
                report += `| ${cell(r.target)} | ${cell(r.metric)} | ${cell(r.baseline)} | ${cell(r.current)} | ${cell(r.description)} |\n`;
            }
            report += `\n`;
        }
        else {
            report += `## ✅ No Regressions\n\n`;
        }
        // Improvements
        if (s.improvements.length > 0) {
            report += `## 📈 Improvements\n\n`;
            report += `| Target | Metric | Baseline | Current |\n`;
            report += `|--------|--------|----------|--------|\n`;
            for (const imp of s.improvements) {
                report += `| ${cell(imp.target)} | ${cell(imp.metric)} | ${cell(imp.baseline)} | ${cell(imp.current)} |\n`;
            }
            report += `\n`;
        }
        // Per-case results
        report += `## Per-Case Results\n\n`;
        report += `| Case | Status | Duration | Tool Calls | Success Rate |\n`;
        report += `|------|--------|----------|------------|-------------|\n`;
        for (const r of result.cases) {
            const icon = r.passed ? "✅" : "❌";
            const sr = (r.scorecard.overallSuccessRate * 100).toFixed(0);
            report += `| ${icon} ${cell(r.caseName)} | ${r.passed ? "PASS" : "FAIL"} | ${r.durationMs}ms | ${r.toolCalls.length} | ${sr}% |\n`;
        }
        report += `\n`;
        // Failure details
        const failures = result.cases.filter((r) => !r.passed);
        if (failures.length > 0) {
            report += `## Failure Details\n\n`;
            for (const f of failures) {
                report += `### ❌ ${f.caseName}\n\n`;
                for (const reason of f.failures) {
                    report += `- ${reason}\n`;
                }
                report += `\n`;
            }
        }
        report += `---\n*Generated at ${new Date().toISOString()}*\n`;
        return report;
    }
    // ─── Private ────────────────────────────────────────────────────────────
    /**
     * Aggregate per-case results into a summary with empty
     * regression / improvement lists.
     *
     * @param {Array} results - Per-case results; each entry's `passed`,
     *   `toolCalls.length` and `durationMs` are read.
     * @param {string} timestamp - Timestamp stored on the summary.
     * @returns {object} The summary. With zero results the averages are 0 and
     *   `passRate` is 1.
     */
    buildSummary(results, timestamp) {
        const total = results.length;
        const passed = results.filter((r) => r.passed).length;
        const avgToolCalls = total > 0
            ? results.reduce((s, r) => s + r.toolCalls.length, 0) / total
            : 0;
        const avgLatencyMs = total > 0
            ? Math.round(results.reduce((s, r) => s + r.durationMs, 0) / total)
            : 0;
        return {
            name: this.config.name,
            timestamp,
            passed,
            total,
            passRate: total > 0 ? passed / total : 1,
            avgToolCallsPerCase: avgToolCalls,
            avgLatencyMs,
            regressions: [],
            improvements: [],
        };
    }
    /**
     * Compare current results against a baseline and populate
     * regressions / improvements in the summary (mutates `summary`).
     *
     * @param {object} summary - Summary of the current run.
     * @param {{summary: object, cases: Array}} baseline - Previously persisted result.
     * @param {Array} currentResults - Per-case results of the current run.
     */
    compareWithBaseline(summary, baseline, currentResults) {
        const bl = baseline.summary;
        // Pass rates are quotients, so a change of exactly the threshold can land
        // just under it in binary floating point (0.85 - 0.8 = 0.04999999999999993).
        // A small tolerance keeps the ">= threshold" rule honest.
        const RATE_EPSILON = 1e-9;
        const rateChange = summary.passRate - bl.passRate;
        // ── Pass rate regression ──────────────────────────────────────────
        if (-rateChange >= SUCCESS_RATE_REGRESSION_THRESHOLD - RATE_EPSILON) {
            const drop = (-rateChange * 100).toFixed(1);
            summary.regressions.push({
                target: "overall",
                metric: "passRate",
                baseline: `${(bl.passRate * 100).toFixed(1)}%`,
                current: `${(summary.passRate * 100).toFixed(1)}%`,
                description: `Pass rate dropped by ${drop} percentage points.`,
            });
        }
        else if (rateChange >= SUCCESS_RATE_REGRESSION_THRESHOLD - RATE_EPSILON) {
            summary.improvements.push({
                target: "overall",
                metric: "passRate",
                baseline: `${(bl.passRate * 100).toFixed(1)}%`,
                current: `${(summary.passRate * 100).toFixed(1)}%`,
            });
        }
        // ── Latency regression ────────────────────────────────────────────
        // Both the relative factor and the absolute minimum must be exceeded.
        if (summary.avgLatencyMs >
            bl.avgLatencyMs * LATENCY_REGRESSION_FACTOR &&
            summary.avgLatencyMs - bl.avgLatencyMs > LATENCY_MIN_ABSOLUTE_INCREASE_MS) {
            summary.regressions.push({
                target: "overall",
                metric: "avgLatencyMs",
                baseline: `${bl.avgLatencyMs}ms`,
                current: `${summary.avgLatencyMs}ms`,
                description: `Average latency increased by ${summary.avgLatencyMs - bl.avgLatencyMs}ms.`,
            });
        }
        // ── Per-case comparison ───────────────────────────────────────────
        // Baseline entries that are not objects are ignored rather than crashing.
        const baselineCases = new Map(baseline.cases
            .filter((c) => c !== null && typeof c === "object")
            .map((c) => [c.caseName, c]));
        for (const current of currentResults) {
            const prev = baselineCases.get(current.caseName);
            if (!prev)
                continue; // New case — no baseline to compare
            // Case flipped from pass to fail
            if (prev.passed && !current.passed) {
                summary.regressions.push({
                    target: current.caseName,
                    metric: "passed",
                    baseline: "true",
                    current: "false",
                    description: `"${current.caseName}" went from PASS to FAIL. Failures: ${current.failures.join("; ")}`,
                });
            }
            // Case flipped from fail to pass
            if (!prev.passed && current.passed) {
                summary.improvements.push({
                    target: current.caseName,
                    metric: "passed",
                    baseline: "false",
                    current: "true",
                });
            }
            // Failure count increased significantly. A baseline entry without a
            // `failures` array is treated as having recorded zero failures.
            const prevFailCount = Array.isArray(prev.failures) ? prev.failures.length : 0;
            const currFailCount = current.failures.length;
            if (currFailCount - prevFailCount >= FAILURE_COUNT_REGRESSION_THRESHOLD) {
                summary.regressions.push({
                    target: current.caseName,
                    metric: "failureCount",
                    baseline: String(prevFailCount),
                    current: String(currFailCount),
                    description: `Failure count increased from ${prevFailCount} to ${currFailCount}.`,
                });
            }
        }
    }
    /**
     * Load baseline results from disk.
     *
     * @returns {object|null} The parsed baseline, or null when no
     *   `baselinePath` is configured, the file cannot be read or parsed, or
     *   the parsed value lacks a `summary` object and a `cases` array.
     */
    loadBaseline() {
        if (!this.config.baselinePath)
            return null;
        let parsed;
        try {
            parsed = JSON.parse(fs.readFileSync(this.config.baselinePath, "utf-8"));
        }
        catch {
            // A missing or unparsable baseline simply means there is nothing to compare with.
            return null;
        }
        // Reject valid JSON of the wrong shape so the comparison never
        // dereferences a missing summary or case list.
        const usable = parsed !== null &&
            typeof parsed === "object" &&
            parsed.summary !== null &&
            typeof parsed.summary === "object" &&
            Array.isArray(parsed.cases);
        return usable ? parsed : null;
    }
    /**
     * Persist results to disk for future baseline use.
     *
     * The file is written to `outputDir` as `<name>_<timestamp>.json`, where
     * characters of the name outside [a-zA-Z0-9_-] become "_" and the ":" and
     * "." of the timestamp become "-".
     *
     * @param {{summary: object, cases: Array}} result - The result to write.
     */
    persistResult(result) {
        try {
            fs.mkdirSync(this.outputDir, { recursive: true });
            const safeName = this.config.name.replace(/[^a-zA-Z0-9_-]/g, "_");
            const timestamp = result.summary.timestamp.replace(/[:.]/g, "-");
            const filename = `${safeName}_${timestamp}.json`;
            const filePath = path.join(this.outputDir, filename);
            fs.writeFileSync(filePath, JSON.stringify(result, null, 2), "utf-8");
        }
        catch {
            // Best-effort persistence — never throw from persist
        }
    }
    /**
     * Make a value safe to embed in a Markdown table cell: pipes are
     * backslash-escaped and line breaks collapse to a single space.
     *
     * @param {unknown} value - Value to render (stringified with String()).
     * @returns {string} The escaped cell text.
     */
    static escapeTableCell(value) {
        return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
    }
}
|
|
291
|
+
exports.Benchmark = Benchmark;
|
|
292
|
+
//# sourceMappingURL=benchmark.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,uCAAyB;AACzB,2CAA6B;AAC7B,+CAAyD;AAwCzD,4EAA4E;AAE5E,gFAAgF;AAChF,MAAM,iCAAiC,GAAG,IAAI,CAAC;AAE/C,6EAA6E;AAC7E,MAAM,kCAAkC,GAAG,CAAC,CAAC;AAE7C,qEAAqE;AACrE,MAAM,yBAAyB,GAAG,GAAG,CAAC;AAEtC,sDAAsD;AACtD,MAAM,gCAAgC,GAAG,IAAI,CAAC;AAE9C;;;;;;;;;;;;;;;;GAgBG;AACH,MAAa,SAAS;IACZ,MAAM,CAAkB;IACxB,SAAS,CAAS;IAE1B,YAAY,MAAuB;QACjC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,SAAS,IAAI,oBAAoB,CAAC,CAAC;IAC1E,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,GAAG;QACP,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,IAAI,wBAAU,EAAE,CAAC;QACtD,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC;QAChC,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE3C,gBAAgB;QAChB,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;QAElE,gBAAgB;QAChB,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;QAEtD,wCAAwC;QACxC,IAAI,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC;YAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;YACrC,IAAI,QAAQ,EAAE,CAAC;gBACb,IAAI,CAAC,mBAAmB,CAAC,OAAO,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;YACvD,CAAC;QACH,CAAC;QAED,MAAM,MAAM,GAAoB,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC;QAE5D,kCAAkC;QAClC,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAE3B,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,MAAuB;QACpC,MAAM,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC;QACzB,MAAM,QAAQ,GAAG,CAAC,CAAC,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAE/C,IAAI,MAAM,GAAG,gBAAgB,CAAC,CAAC,IAAI,MAAM,CAAC;QAC1C,MAAM,IAAI,eAAe,CAAC,CAAC,SAAS,MAAM,CAAC;QAE3C,MAAM,IAAI,gBAAgB,CAAC;QAC3B,MAAM,IAAI,sBAAsB,CAAC;QACjC,MAAM,IAAI,sBAAsB,CAAC;QACjC,MAAM,IAAI,iBAAiB,QAAQ,MAAM,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,KAAK,OAAO,CAAC;QACpE,MAAM,IAAI,6BAA6B,CAAC,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9E,MAAM,IAAI,oBAAoB,CAAC,CAAC,YAAY,UAAU,CAAC;QAEvD,4CAA4C;QAC5C,IAAI,CAAC,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,uBAAuB
,CAAC;YAClC,MAAM,IAAI,sDAAsD,CAAC;YACjE,MAAM,IAAI,uDAAuD,CAAC;YAClE,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;gBAC9B,MAAM,IAAI,KAAK,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,QAAQ,MAAM,CAAC,CAAC,OAAO,MAAM,CAAC,CAAC,WAAW,MAAM,CAAC;YAChG,CAAC;YACD,MAAM,IAAI,IAAI,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,yBAAyB,CAAC;QACtC,CAAC;QAED,eAAe;QACf,IAAI,CAAC,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9B,MAAM,IAAI,wBAAwB,CAAC;YACnC,MAAM,IAAI,4CAA4C,CAAC;YACvD,MAAM,IAAI,2CAA2C,CAAC;YACtD,KAAK,MAAM,GAAG,IAAI,CAAC,CAAC,YAAY,EAAE,CAAC;gBACjC,MAAM,IAAI,KAAK,GAAG,CAAC,MAAM,MAAM,GAAG,CAAC,MAAM,MAAM,GAAG,CAAC,QAAQ,MAAM,GAAG,CAAC,OAAO,MAAM,CAAC;YACrF,CAAC;YACD,MAAM,IAAI,IAAI,CAAC;QACjB,CAAC;QAED,mBAAmB;QACnB,MAAM,IAAI,yBAAyB,CAAC;QACpC,MAAM,IAAI,4DAA4D,CAAC;QACvE,MAAM,IAAI,2DAA2D,CAAC;QACtE,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YAC7B,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;YAClC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,kBAAkB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAC7D,MAAM,IAAI,KAAK,IAAI,IAAI,CAAC,CAAC,QAAQ,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,UAAU,QAAQ,CAAC,CAAC,SAAS,CAAC,MAAM,MAAM,EAAE,OAAO,CAAC;QAC/H,CAAC;QACD,MAAM,IAAI,IAAI,CAAC;QAEf,kBAAkB;QAClB,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACvD,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,MAAM,IAAI,wBAAwB,CAAC;YACnC,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;gBACzB,MAAM,IAAI,SAAS,CAAC,CAAC,QAAQ,MAAM,CAAC;gBACpC,KAAK,MAAM,MAAM,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;oBAChC,MAAM,IAAI,KAAK,MAAM,IAAI,CAAC;gBAC5B,CAAC;gBACD,MAAM,IAAI,IAAI,CAAC;YACjB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,sBAAsB,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,KAAK,CAAC;QAC9D,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,2EAA2E;IAEnE,YAAY,CAClB,OAAqB,EACrB,SAAiB;QAEjB,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QACtD,MAAM,YAAY,GAChB,KAAK,GAAG,CAAC;YACP,CAAC,CAAC,OAAO,CAAC,
MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,KAAK;YAC7D,CAAC,CAAC,CAAC,CAAC;QACR,MAAM,YAAY,GAChB,KAAK,GAAG,CAAC;YACP,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC;YACnE,CAAC,CAAC,CAAC,CAAC;QAER,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI;YACtB,SAAS;YACT,MAAM;YACN,KAAK;YACL,QAAQ,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACxC,mBAAmB,EAAE,YAAY;YACjC,YAAY;YACZ,WAAW,EAAE,EAAE;YACf,YAAY,EAAE,EAAE;SACjB,CAAC;IACJ,CAAC;IAED;;;OAGG;IACK,mBAAmB,CACzB,OAAyB,EACzB,QAAyB,EACzB,cAA4B;QAE5B,MAAM,EAAE,GAAG,QAAQ,CAAC,OAAO,CAAC;QAE5B,qEAAqE;QACrE,IAAI,EAAE,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,iCAAiC,EAAE,CAAC;YACxE,MAAM,IAAI,GAAG,CAAC,CAAC,EAAE,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YACjE,OAAO,CAAC,WAAW,CAAC,IAAI,CAAC;gBACvB,MAAM,EAAE,SAAS;gBACjB,MAAM,EAAE,UAAU;gBAClB,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBAC9C,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBAClD,WAAW,EAAE,wBAAwB,IAAI,qBAAqB;aAC/D,CAAC,CAAC;QACL,CAAC;aAAM,IACL,OAAO,CAAC,QAAQ,GAAG,EAAE,CAAC,QAAQ;YAC9B,iCAAiC,EACjC,CAAC;YACD,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC;gBACxB,MAAM,EAAE,SAAS;gBACjB,MAAM,EAAE,UAAU;gBAClB,QAAQ,EAAE,GAAG,CAAC,EAAE,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBAC9C,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;aACnD,CAAC,CAAC;QACL,CAAC;QAED,qEAAqE;QACrE,IACE,OAAO,CAAC,YAAY;YAClB,EAAE,CAAC,YAAY,GAAG,yBAAyB;YAC7C,OAAO,CAAC,YAAY,GAAG,EAAE,CAAC,YAAY,GAAG,gCAAgC,EACzE,CAAC;YACD,OAAO,CAAC,WAAW,CAAC,IAAI,CAAC;gBACvB,MAAM,EAAE,SAAS;gBACjB,MAAM,EAAE,cAAc;gBACtB,QAAQ,EAAE,GAAG,EAAE,CAAC,YAAY,IAAI;gBAChC,OAAO,EAAE,GAAG,OAAO,CAAC,YAAY,IAAI;gBACpC,WAAW,EAAE,gCACX,OAAO,CAAC,YAAY,GAAG,EAAE,CAAC,YAC5B,KAAK;aACN,CAAC,CAAC;QACL,CAAC;QAED,qEAAqE;QACrE,MAAM,aAAa,GAAG,IAAI,GAAG,CAC3B,QAAQ,CAA
C,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAC3C,CAAC;QAEF,KAAK,MAAM,OAAO,IAAI,cAAc,EAAE,CAAC;YACrC,MAAM,IAAI,GAAG,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;YACjD,IAAI,CAAC,IAAI;gBAAE,SAAS,CAAC,oCAAoC;YAEzD,iCAAiC;YACjC,IAAI,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;gBACnC,OAAO,CAAC,WAAW,CAAC,IAAI,CAAC;oBACvB,MAAM,EAAE,OAAO,CAAC,QAAQ;oBACxB,MAAM,EAAE,QAAQ;oBAChB,QAAQ,EAAE,MAAM;oBAChB,OAAO,EAAE,OAAO;oBAChB,WAAW,EAAE,IAAI,OAAO,CAAC,QAAQ,uCAAuC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;iBACtG,CAAC,CAAC;YACL,CAAC;YAED,iCAAiC;YACjC,IAAI,CAAC,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;gBACnC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC;oBACxB,MAAM,EAAE,OAAO,CAAC,QAAQ;oBACxB,MAAM,EAAE,QAAQ;oBAChB,QAAQ,EAAE,OAAO;oBACjB,OAAO,EAAE,MAAM;iBAChB,CAAC,CAAC;YACL,CAAC;YAED,wCAAwC;YACxC,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;YAC3C,MAAM,aAAa,GAAG,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;YAC9C,IAAI,aAAa,GAAG,aAAa,IAAI,kCAAkC,EAAE,CAAC;gBACxE,OAAO,CAAC,WAAW,CAAC,IAAI,CAAC;oBACvB,MAAM,EAAE,OAAO,CAAC,QAAQ;oBACxB,MAAM,EAAE,cAAc;oBACtB,QAAQ,EAAE,MAAM,CAAC,aAAa,CAAC;oBAC/B,OAAO,EAAE,MAAM,CAAC,aAAa,CAAC;oBAC9B,WAAW,EAAE,gCAAgC,aAAa,OAAO,aAAa,GAAG;iBAClF,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACK,YAAY;QAClB,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY;YAAE,OAAO,IAAI,CAAC;QAE3C,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;YAC/D,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAoB,CAAC;QAC5C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,MAAuB;QAC3C,IAAI,CAAC;YACH,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAElD,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,iBAAiB,EAAE,GAAG,CAAC,CAAC;YAClE,MAAM,SAAS,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YACjE,MAAM,QAAQ,GAAG,GAAG,QAAQ,IAAI,SAAS,OAAO,CAAC;YACjD,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;YAErD,EAAE,CAAC,aAAa,CACd,QAAQ,EACR,IAAI,CAAC,SAA
S,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAC/B,OAAO,CACR,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,qDAAqD;QACvD,CAAC;IACH,CAAC;CACF;AA/QD,8BA+QC"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import type { LLMProvider } from "../llm/interface";
|
|
2
|
+
import { ToolCallEvaluator } from "./tool-call-evaluator";
|
|
3
|
+
import type { EvalCase, EvalResult } from "./types";
|
|
4
|
+
export type { EvalCase, EvalResult, LLMEvalJudgment } from "./types";
|
|
5
|
+
/**
|
|
6
|
+
* An agent factory — called once per eval case to create a fresh agent
|
|
7
|
+
* with a ToolCallEvaluator hook attached.
|
|
8
|
+
*/
|
|
9
|
+
export type AgentFactory = (evaluator: ToolCallEvaluator) => {
    // Takes the case input string and resolves with a string.
    run(input: string): Promise<string>;
    // NOTE(review): presumably aborts an in-progress run; the declaration only
    // shows that it takes no arguments and returns nothing. Confirm against
    // the agent implementations.
    cancel(): void;
};
|
|
13
|
+
/**
|
|
14
|
+
* Configuration for the EvalRunner.
|
|
15
|
+
*/
|
|
16
|
+
export interface EvalRunnerConfig {
    /**
     * Default timeout per case in milliseconds.
     * Cases can override with their own `timeoutMs`.
     * Default: 120_000 (2 minutes).
     */
    defaultTimeoutMs?: number;
    /**
     * Optional LLM provider for answer quality judging.
     * When set, the final answer of each case that recorded no other
     * failure is sent to this provider for an independent verdict.
     * When omitted, no LLM judgment is attached to the results.
     *
     * Using a different model than the agent's own LLM provides an
     * unbiased quality assessment. Pass `router.forReflection()` from
     * a ModelRouter for this purpose.
     */
    judgeLLM?: LLMProvider;
}
|
|
33
|
+
/**
|
|
34
|
+
* EvalRunner — runs test cases against an agent and produces pass/fail results.
|
|
35
|
+
*
|
|
36
|
+
* Uses an agent factory so each case starts fresh (no context pollution between
|
|
37
|
+
* cases). The factory receives a ToolCallEvaluator hook to collect metrics.
|
|
38
|
+
*
|
|
39
|
+
* Usage:
|
|
40
|
+
* ```ts
|
|
41
|
+
* const runner = new EvalRunner({ judgeLLM: router.forReflection() });
|
|
42
|
+
*
|
|
43
|
+
* const results = await runner.run(
|
|
44
|
+
* (evaluator) => new ReActAgent({ llm, hooks: [evaluator] }),
|
|
45
|
+
* [
|
|
46
|
+
* { name: "basic math", input: "2+2=?", expectedTools: ["calculator"] },
|
|
47
|
+
* { name: "file read", input: "read README.md", expectedTools: ["read_file"] },
|
|
48
|
+
* ],
|
|
49
|
+
* );
|
|
50
|
+
*
|
|
51
|
+
* console.log(runner.generateReport(results));
|
|
52
|
+
* ```
|
|
53
|
+
*/
|
|
54
|
+
export declare class EvalRunner {
    // Timeout in milliseconds applied to a case that sets no `timeoutMs`.
    private defaultTimeoutMs;
    // Optional judge provider taken from the config.
    private judgeLLM?;
    /**
     * @param config  Optional runner settings; see EvalRunnerConfig.
     */
    constructor(config?: EvalRunnerConfig);
    /**
     * Run a batch of evaluation cases.
     *
     * @param factory  Creates a fresh agent for each case. Receives a
     *                 ToolCallEvaluator hook that MUST be attached to the
     *                 agent's hooks array.
     * @param cases    The test cases to run.
     * @returns        One EvalResult per case.
     */
    run(factory: AgentFactory, cases: EvalCase[]): Promise<EvalResult[]>;
    /**
     * Run a single case and return the result (convenience method).
     *
     * @param factory  Creates the agent for this case.
     * @param c        The case to run.
     */
    runCase(factory: AgentFactory, c: EvalCase): Promise<EvalResult>;
    /**
     * Generate a Markdown report from evaluation results.
     *
     * @param results  Results as returned by `run`.
     * @returns        The report text.
     */
    generateReport(results: EvalResult[]): string;
    // Private members: declared without signatures in this declaration file.
    private judgeAnswer;
    private parseJudgment;
}
|
|
79
|
+
//# sourceMappingURL=eval-runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-runner.d.ts","sourceRoot":"","sources":["../../src/eval/eval-runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAe,MAAM,kBAAkB,CAAC;AAGjE,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAC;AAC1D,OAAO,KAAK,EAAE,QAAQ,EAAE,UAAU,EAAmB,MAAM,SAAS,CAAC;AAIrE,YAAY,EAAE,QAAQ,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAErE;;;GAGG;AACH,MAAM,MAAM,YAAY,GAAG,CAAC,SAAS,EAAE,iBAAiB,KAAK;IAC3D,GAAG,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACpC,MAAM,IAAI,IAAI,CAAC;CAChB,CAAC;AAEF;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;OAIG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAE1B;;;;;;;OAOG;IACH,QAAQ,CAAC,EAAE,WAAW,CAAC;CACxB;AAqBD;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,gBAAgB,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,CAAc;gBAEnB,MAAM,CAAC,EAAE,gBAAgB;IAKrC;;;;;;;;OAQG;IACG,GAAG,CACP,OAAO,EAAE,YAAY,EACrB,KAAK,EAAE,QAAQ,EAAE,GAChB,OAAO,CAAC,UAAU,EAAE,CAAC;IAiGxB;;OAEG;IACG,OAAO,CACX,OAAO,EAAE,YAAY,EACrB,CAAC,EAAE,QAAQ,GACV,OAAO,CAAC,UAAU,CAAC;IAKtB;;OAEG;IACH,cAAc,CAAC,OAAO,EAAE,UAAU,EAAE,GAAG,MAAM;YAwD/B,WAAW;IA0BzB,OAAO,CAAC,aAAa;CAyBtB"}
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.EvalRunner = void 0;
|
|
4
|
+
const types_1 = require("../messages/types");
|
|
5
|
+
const tool_call_evaluator_1 = require("./tool-call-evaluator");
|
|
6
|
+
// ─── LLM Judge System Prompt: system message sent to the judge model; asks for a JSON verdict with passed, score, reasoning and issues ───
|
|
7
|
+
const JUDGE_SYSTEM_PROMPT = `You are an impartial evaluation judge. Your job is to assess the quality
|
|
8
|
+
of an AI agent's answer to a user query.
|
|
9
|
+
|
|
10
|
+
Evaluate the answer across these dimensions:
|
|
11
|
+
- **Correctness**: Is the answer factually correct?
|
|
12
|
+
- **Completeness**: Does it fully address the user's query?
|
|
13
|
+
- **Clarity**: Is the answer well-structured and easy to understand?
|
|
14
|
+
- **Efficiency**: Did the agent use a reasonable approach? Any obvious wasted effort?
|
|
15
|
+
|
|
16
|
+
Output a JSON object:
|
|
17
|
+
{
|
|
18
|
+
"passed": true/false,
|
|
19
|
+
"score": 0-100,
|
|
20
|
+
"reasoning": "brief explanation (1-3 sentences)",
|
|
21
|
+
"issues": ["issue 1", "issue 2"] // empty array if no issues
|
|
22
|
+
}`;
|
|
23
|
+
/**
|
|
24
|
+
* EvalRunner — runs test cases against an agent and produces pass/fail results.
|
|
25
|
+
*
|
|
26
|
+
* Uses an agent factory so each case starts fresh (no context pollution between
|
|
27
|
+
* cases). The factory receives a ToolCallEvaluator hook to collect metrics.
|
|
28
|
+
*
|
|
29
|
+
* Usage:
|
|
30
|
+
* ```ts
|
|
31
|
+
* const runner = new EvalRunner({ judgeLLM: router.forReflection() });
|
|
32
|
+
*
|
|
33
|
+
* const results = await runner.run(
|
|
34
|
+
* (evaluator) => new ReActAgent({ llm, hooks: [evaluator] }),
|
|
35
|
+
* [
|
|
36
|
+
* { name: "basic math", input: "2+2=?", expectedTools: ["calculator"] },
|
|
37
|
+
* { name: "file read", input: "read README.md", expectedTools: ["read_file"] },
|
|
38
|
+
* ],
|
|
39
|
+
* );
|
|
40
|
+
*
|
|
41
|
+
* console.log(runner.generateReport(results));
|
|
42
|
+
* ```
|
|
43
|
+
*/
|
|
44
|
+
class EvalRunner {
    /** Timeout in milliseconds for cases that do not set their own `timeoutMs`. */
    defaultTimeoutMs;
    /** Optional provider used to judge answers; judging is skipped when unset. */
    judgeLLM;
    /**
     * @param config Optional settings. `defaultTimeoutMs` falls back to
     *               120_000; `judgeLLM` stays undefined when not supplied.
     */
    constructor(config) {
        this.defaultTimeoutMs = config?.defaultTimeoutMs ?? 120_000;
        this.judgeLLM = config?.judgeLLM;
    }
    /**
     * Run a batch of evaluation cases, one after another.
     *
     * For each case a fresh ToolCallEvaluator and agent are created, the agent
     * is run under a timeout, and the tool/output expectations are checked.
     * A run that is still pending when the timeout fires is cancelled through
     * `agent.cancel()` so it cannot keep working while later cases execute.
     *
     * @param factory  Creates a fresh agent for each case. Receives a
     *                 ToolCallEvaluator hook that MUST be attached to the
     *                 agent's hooks array.
     * @param cases    The test cases to run.
     * @returns        One EvalResult per case, in input order.
     */
    async run(factory, cases) {
        const results = [];
        for (const c of cases) {
            const evaluator = new tool_call_evaluator_1.ToolCallEvaluator();
            const agent = factory(evaluator);
            const caseTimeout = c.timeoutMs ?? this.defaultTimeoutMs;
            const startedAt = Date.now();
            let answer;
            let iterations = 0;
            const toolCalls = [];
            const failures = [];
            // True only while the agent's run promise is still pending.
            let inFlight = false;
            try {
                const pending = Promise.resolve(agent.run(c.input));
                inFlight = true;
                const markSettled = () => {
                    inFlight = false;
                };
                // Registered before the race so the flag is already cleared
                // when a rejection of the run itself reaches the catch below.
                pending.then(markSettled, markSettled);
                try {
                    answer = await withTimeout(pending, caseTimeout);
                }
                finally {
                    // Keep the completed tool calls even when the run threw or
                    // timed out, so the result agrees with the scorecard.
                    for (const r of evaluator.getRecords()) {
                        if (r.endTime) {
                            toolCalls.push(r.toolName);
                        }
                    }
                    iterations = toolCalls.length;
                }
                // ── Checks ──────────────────────────────────────────────────
                if (c.expectedTools && c.expectedTools.length > 0) {
                    for (const expected of c.expectedTools) {
                        if (!toolCalls.includes(expected)) {
                            failures.push(`Expected tool "${expected}" was not called.`);
                        }
                    }
                }
                if (c.forbiddenTools && c.forbiddenTools.length > 0) {
                    for (const forbidden of c.forbiddenTools) {
                        if (toolCalls.includes(forbidden)) {
                            failures.push(`Forbidden tool "${forbidden}" was called.`);
                        }
                    }
                }
                if (c.expectedOutput) {
                    const pattern = c.expectedOutput;
                    let matches;
                    if (typeof pattern === "string") {
                        matches = answer.includes(pattern);
                    }
                    else {
                        // A global/sticky RegExp remembers lastIndex between
                        // test() calls; reset it so a pattern shared by several
                        // cases always scans the answer from the start.
                        pattern.lastIndex = 0;
                        matches = pattern.test(answer);
                    }
                    if (!matches) {
                        failures.push(`Answer does not match expected pattern: "${pattern}".`);
                    }
                }
            }
            catch (err) {
                answer = err instanceof Error ? err.message : String(err);
                failures.push(`Execution error: ${answer}`);
                // The run is still pending only when the timeout won the race.
                if (inFlight && typeof agent.cancel === "function") {
                    try {
                        agent.cancel();
                    }
                    catch (cancelErr) {
                        const reason = cancelErr instanceof Error ? cancelErr.message : String(cancelErr);
                        failures.push(`Agent cancel() failed after timeout: ${reason}`);
                    }
                }
            }
            const durationMs = Date.now() - startedAt;
            const scorecard = evaluator.getScorecard();
            // ── LLM Judging ───────────────────────────────────────────────
            let llmJudgment;
            if (this.judgeLLM && failures.length === 0) {
                try {
                    llmJudgment = await this.judgeAnswer(c.input, answer);
                }
                catch {
                    // Deliberate best-effort: a judge failure must not fail the
                    // case, so the judgment is simply left undefined.
                }
            }
            if (llmJudgment && !llmJudgment.passed) {
                failures.push(`LLM judge (score ${llmJudgment.score}/100): ${llmJudgment.reasoning}`);
            }
            results.push({
                caseName: c.name,
                passed: failures.length === 0,
                answer,
                toolCalls,
                iterations,
                durationMs,
                scorecard,
                llmJudgment,
                failures,
            });
        }
        return results;
    }
    /**
     * Run a single case and return the result (convenience method).
     *
     * @param factory  Creates the agent for this case.
     * @param c        The case to run.
     * @returns        The EvalResult for `c`.
     */
    async runCase(factory, c) {
        const results = await this.run(factory, [c]);
        return results[0];
    }
    /**
     * Generate a Markdown report from evaluation results.
     *
     * The report has a summary table (counts, pass rate, average duration)
     * followed by one section per result and a generation timestamp.
     *
     * @param results  Results as returned by `run`.
     * @returns        The Markdown text.
     */
    generateReport(results) {
        const passed = results.filter((r) => r.passed).length;
        const total = results.length;
        // Guard the divisions so an empty result list reports zeros.
        const passRate = total > 0 ? ((passed / total) * 100).toFixed(1) : "0.0";
        const avgLatency = total > 0
            ? Math.round(results.reduce((s, r) => s + r.durationMs, 0) / total)
            : 0;
        let report = `# Evaluation Report\n\n`;
        report += `## Summary\n\n`;
        report += `| Metric | Value |\n`;
        report += `|--------|-------|\n`;
        report += `| Cases | ${total} |\n`;
        report += `| Passed | ${passed} |\n`;
        report += `| Failed | ${total - passed} |\n`;
        report += `| Pass Rate | ${passRate}% |\n`;
        report += `| Avg Duration | ${avgLatency}ms |\n\n`;
        report += `## Results\n\n`;
        for (const r of results) {
            const icon = r.passed ? "✅" : "❌";
            report += `### ${icon} ${r.caseName}\n\n`;
            report += `- **Duration:** ${r.durationMs}ms\n`;
            report += `- **Tool calls:** ${r.toolCalls.join(", ") || "(none)"}\n`;
            report += `- **Tool success rate:** ${(r.scorecard.overallSuccessRate * 100).toFixed(1)}%\n`;
            if (r.llmJudgment) {
                report += `- **Judge score:** ${r.llmJudgment.score}/100\n`;
                if (r.llmJudgment.issues.length > 0) {
                    report += `- **Issues:**\n`;
                    for (const issue of r.llmJudgment.issues) {
                        report += `  - ${issue}\n`;
                    }
                }
            }
            if (r.failures.length > 0) {
                report += `- **Failures:**\n`;
                for (const f of r.failures) {
                    report += `  - ${f}\n`;
                }
            }
            report += `\n<details>\n<summary>Answer</summary>\n\n${r.answer}\n\n</details>\n\n`;
        }
        report += `---\n*Generated at ${new Date().toISOString()}*\n`;
        return report;
    }
    // ─── Private ────────────────────────────────────────────────────────────
    /**
     * Ask the judge LLM to assess `answer` for `query`.
     *
     * @param query   The case input given to the agent.
     * @param answer  The agent's final answer.
     * @returns       The parsed judgment; a passing placeholder when no judge
     *                is configured.
     */
    async judgeAnswer(query, answer) {
        if (!this.judgeLLM) {
            return { passed: true, score: 100, reasoning: "", issues: [] };
        }
        const messages = [
            { role: types_1.Role.System, content: JUDGE_SYSTEM_PROMPT },
            {
                role: types_1.Role.User,
                content: [
                    `User query: ${query}`,
                    ``,
                    `Agent answer: ${answer}`,
                    ``,
                    `Please evaluate the answer quality. Output JSON only.`,
                ].join("\n"),
            },
        ];
        const response = await this.judgeLLM.chat(messages);
        return this.parseJudgment(response.content);
    }
    /**
     * Turn the judge's raw reply into a judgment object.
     *
     * A fenced code block is unwrapped before parsing. The score is clamped
     * to 0-100 and `issues` is coerced to an array of strings. When the reply
     * cannot be parsed, a lenient default (passed, score 50) is returned.
     *
     * @param raw  The judge's reply text.
     * @returns    `{ passed, score, reasoning, issues }`.
     */
    parseJudgment(raw) {
        try {
            let json = raw.trim();
            const fenceMatch = json.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
            if (fenceMatch)
                json = fenceMatch[1];
            const parsed = JSON.parse(json);
            return {
                passed: Boolean(parsed.passed),
                score: Math.max(0, Math.min(100, Number(parsed.score) || 0)),
                reasoning: String(parsed.reasoning ?? ""),
                issues: Array.isArray(parsed.issues)
                    ? parsed.issues.map(String)
                    : [],
            };
        }
        catch {
            return {
                passed: true,
                score: 50,
                reasoning: "Could not parse judge response.",
                issues: [],
            };
        }
    }
}
|
|
237
|
+
exports.EvalRunner = EvalRunner;
|
|
238
|
+
// ─── Helpers ───────────────────────────────────────────────────────────────
|
|
239
|
+
/**
 * Race `promise` against a timer.
 *
 * @param promise    The value or promise to wait for.
 * @param timeoutMs  Milliseconds to wait before giving up.
 * @returns          A promise that settles like `promise` when it wins the
 *                   race, or rejects with `Timed out after <seconds>s.`
 *                   when the timer fires first. The timer is cleared once
 *                   the race has settled.
 */
async function withTimeout(promise, timeoutMs) {
    let handle;
    const deadline = new Promise((_resolve, reject) => {
        const expire = () => {
            reject(new Error(`Timed out after ${timeoutMs / 1000}s.`));
        };
        handle = setTimeout(expire, timeoutMs);
    });
    const contenders = [promise, deadline];
    return Promise.race(contenders).finally(() => {
        if (handle) {
            clearTimeout(handle);
        }
    });
}
|
|
252
|
+
//# sourceMappingURL=eval-runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-runner.js","sourceRoot":"","sources":["../../src/eval/eval-runner.ts"],"names":[],"mappings":";;;AAEA,6CAAyC;AACzC,+DAA0D;AAsC1D,6EAA6E;AAE7E,MAAM,mBAAmB,GAAG;;;;;;;;;;;;;;;EAe1B,CAAC;AAEH;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAa,UAAU;IACb,gBAAgB,CAAS;IACzB,QAAQ,CAAe;IAE/B,YAAY,MAAyB;QACnC,IAAI,CAAC,gBAAgB,GAAG,MAAM,EAAE,gBAAgB,IAAI,OAAO,CAAC;QAC5D,IAAI,CAAC,QAAQ,GAAG,MAAM,EAAE,QAAQ,CAAC;IACnC,CAAC;IAED;;;;;;;;OAQG;IACH,KAAK,CAAC,GAAG,CACP,OAAqB,EACrB,KAAiB;QAEjB,MAAM,OAAO,GAAiB,EAAE,CAAC;QAEjC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACtB,MAAM,SAAS,GAAG,IAAI,uCAAiB,EAAE,CAAC;YAC1C,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;YACjC,MAAM,WAAW,GAAG,CAAC,CAAC,SAAS,IAAI,IAAI,CAAC,gBAAgB,CAAC;YACzD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAE7B,IAAI,MAAc,CAAC;YACnB,IAAI,UAAU,GAAG,CAAC,CAAC;YACnB,MAAM,SAAS,GAAa,EAAE,CAAC;YAC/B,MAAM,QAAQ,GAAa,EAAE,CAAC;YAE9B,IAAI,CAAC;gBACH,MAAM,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,EAAE,WAAW,CAAC,CAAC;gBAE5D,wCAAwC;gBACxC,KAAK,MAAM,CAAC,IAAI,SAAS,CAAC,UAAU,EAAE,EAAE,CAAC;oBACvC,IAAI,CAAC,CAAC,OAAO;wBAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;gBAC5C,CAAC;gBACD,UAAU,GAAG,SAAS,CAAC,MAAM,CAAC;gBAE9B,+DAA+D;gBAE/D,IAAI,CAAC,CAAC,aAAa,IAAI,CAAC,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAClD,KAAK,MAAM,QAAQ,IAAI,CAAC,CAAC,aAAa,EAAE,CAAC;wBACvC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;4BAClC,QAAQ,CAAC,IAAI,CACX,kBAAkB,QAAQ,mBAAmB,CAC9C,CAAC;wBACJ,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC,CAAC,cAAc,IAAI,CAAC,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACpD,KAAK,MAAM,SAAS,IAAI,CAAC,CAAC,cAAc,EAAE,CAAC;wBACzC,IAAI,SAAS,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;4BAClC,QAAQ,CAAC,IAAI,CACX,mBAAmB,SAAS,eAAe,CAC5C,CAAC;wBACJ,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC,CAAC,cAAc,EAAE,CAAC;oBACrB,MAAM,OAAO,GAAG,CAAC,CAAC,cAAc,CAAC;oBACjC,MAAM,OAAO,GACX,OAAO,OAAO,KAAK,QAAQ;wBACzB,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC;wBAC1B,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;oBAC3B,IAAI,CAAC,OAAO,EAAE,CAAC;wBACb,QAAQ,CAAC,IAAI
,CACX,4CAA4C,OAAO,IAAI,CACxD,CAAC;oBACJ,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAY,EAAE,CAAC;gBACtB,MAAM,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;gBAC1D,QAAQ,CAAC,IAAI,CAAC,oBAAoB,MAAM,EAAE,CAAC,CAAC;YAC9C,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YAC1C,MAAM,SAAS,GAAG,SAAS,CAAC,YAAY,EAAE,CAAC;YAE3C,iEAAiE;YACjE,IAAI,WAAwC,CAAC;YAC7C,IAAI,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC3C,IAAI,CAAC;oBACH,WAAW,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;gBACxD,CAAC;gBAAC,MAAM,CAAC;oBACP,0CAA0C;gBAC5C,CAAC;YACH,CAAC;YAED,IAAI,WAAW,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC;gBACvC,QAAQ,CAAC,IAAI,CACX,oBAAoB,WAAW,CAAC,KAAK,UAAU,WAAW,CAAC,SAAS,EAAE,CACvE,CAAC;YACJ,CAAC;YAED,OAAO,CAAC,IAAI,CAAC;gBACX,QAAQ,EAAE,CAAC,CAAC,IAAI;gBAChB,MAAM,EAAE,QAAQ,CAAC,MAAM,KAAK,CAAC;gBAC7B,MAAM;gBACN,SAAS;gBACT,UAAU;gBACV,UAAU;gBACV,SAAS;gBACT,WAAW;gBACX,QAAQ;aACT,CAAC,CAAC;QACL,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,OAAqB,EACrB,CAAW;QAEX,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7C,OAAO,OAAO,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,OAAqB;QAClC,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QACtD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,MAAM,QAAQ,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;QACzE,MAAM,UAAU,GACd,KAAK,GAAG,CAAC;YACP,CAAC,CAAC,IAAI,CAAC,KAAK,CACR,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,KAAK,CACtD;YACH,CAAC,CAAC,CAAC,CAAC;QAER,IAAI,MAAM,GAAG,yBAAyB,CAAC;QACvC,MAAM,IAAI,gBAAgB,CAAC;QAC3B,MAAM,IAAI,sBAAsB,CAAC;QACjC,MAAM,IAAI,sBAAsB,CAAC;QACjC,MAAM,IAAI,aAAa,KAAK,MAAM,CAAC;QACnC,MAAM,IAAI,cAAc,MAAM,MAAM,CAAC;QACrC,MAAM,IAAI,cAAc,KAAK,GAAG,MAAM,MAAM,CAAC;QAC7C,MAAM,IAAI,iBAAiB,QAAQ,OAAO,CAAC;QAC3C,MAAM,IAAI,oBAAoB,UAAU,
UAAU,CAAC;QAEnD,MAAM,IAAI,gBAAgB,CAAC;QAE3B,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACxB,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;YAClC,MAAM,IAAI,OAAO,IAAI,IAAI,CAAC,CAAC,QAAQ,MAAM,CAAC;YAC1C,MAAM,IAAI,mBAAmB,CAAC,CAAC,UAAU,MAAM,CAAC;YAChD,MAAM,IAAI,qBAAqB,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,QAAQ,IAAI,CAAC;YACtE,MAAM,IAAI,4BAA4B,CAAC,CAAC,CAAC,SAAS,CAAC,kBAAkB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC;YAE7F,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;gBAClB,MAAM,IAAI,sBAAsB,CAAC,CAAC,WAAW,CAAC,KAAK,QAAQ,CAAC;gBAC5D,IAAI,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACpC,MAAM,IAAI,iBAAiB,CAAC;oBAC5B,KAAK,MAAM,KAAK,IAAI,CAAC,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC;wBACzC,MAAM,IAAI,OAAO,KAAK,IAAI,CAAC;oBAC7B,CAAC;gBACH,CAAC;YACH,CAAC;YAED,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,MAAM,IAAI,mBAAmB,CAAC;gBAC9B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;oBAC3B,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;gBACzB,CAAC;YACH,CAAC;YAED,MAAM,IAAI,6CAA6C,CAAC,CAAC,MAAM,oBAAoB,CAAC;QACtF,CAAC;QAED,MAAM,IAAI,sBAAsB,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,KAAK,CAAC;QAC9D,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,2EAA2E;IAEnE,KAAK,CAAC,WAAW,CACvB,KAAa,EACb,MAAc;QAEd,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACnB,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,SAAS,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAC;QACjE,CAAC;QAED,MAAM,QAAQ,GAAkB;YAC9B,EAAE,IAAI,EAAE,YAAI,CAAC,MAAM,EAAE,OAAO,EAAE,mBAAmB,EAAE;YACnD;gBACE,IAAI,EAAE,YAAI,CAAC,IAAI;gBACf,OAAO,EAAE;oBACP,eAAe,KAAK,EAAE;oBACtB,EAAE;oBACF,iBAAiB,MAAM,EAAE;oBACzB,EAAE;oBACF,uDAAuD;iBACxD,CAAC,IAAI,CAAC,IAAI,CAAC;aACb;SACF,CAAC;QAEF,MAAM,QAAQ,GAAgB,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACjE,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IAC9C,CAAC;IAEO,aAAa,CAAC,GAAW;QAC/B,IAAI,CAAC;YACH,IAAI,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;YACtB,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;YACjE,IAAI,UAAU;gBAAE,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAErC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAEhC,OAAO;gBAC
L,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC;gBAC9B,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC5D,SAAS,EAAE,MAAM,CAAC,MAAM,CAAC,SAAS,IAAI,EAAE,CAAC;gBACzC,MAAM,EAAE,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC;oBAClC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;oBAC3B,CAAC,CAAC,EAAE;aACP,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,OAAO;gBACL,MAAM,EAAE,IAAI;gBACZ,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,iCAAiC;gBAC5C,MAAM,EAAE,EAAE;aACX,CAAC;QACJ,CAAC;IACH,CAAC;CACF;AA/OD,gCA+OC;AAED,8EAA8E;AAE9E,KAAK,UAAU,WAAW,CACxB,OAAmB,EACnB,SAAiB;IAEjB,IAAI,KAAgD,CAAC;IAErD,MAAM,cAAc,GAAG,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;QACtD,KAAK,GAAG,UAAU,CAChB,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,mBAAmB,SAAS,GAAG,IAAI,IAAI,CAAC,CAAC,EAChE,SAAS,CACV,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,OAAO,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC,CAAC;IACvD,CAAC;YAAS,CAAC;QACT,IAAI,KAAK;YAAE,YAAY,CAAC,KAAK,CAAC,CAAC;IACjC,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export { ToolCallEvaluator } from "./tool-call-evaluator";
|
|
2
|
+
export { EvalRunner } from "./eval-runner";
|
|
3
|
+
export type { EvalRunnerConfig, AgentFactory, EvalCase, EvalResult, LLMEvalJudgment } from "./eval-runner";
|
|
4
|
+
export { Benchmark } from "./benchmark";
|
|
5
|
+
export type { BenchmarkConfig } from "./benchmark";
|
|
6
|
+
export type { ToolCallRecord, ToolCallStats, ToolCallScorecard, Regression, Improvement, BenchmarkSummary, BenchmarkResult, } from "./types";
|
|
7
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAC;AAG1D,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3C,YAAY,EAAE,gBAAgB,EAAE,YAAY,EAAE,QAAQ,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAG3G,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,YAAY,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAGnD,YAAY,EACV,cAAc,EACd,aAAa,EACb,iBAAiB,EACjB,UAAU,EACV,WAAW,EACX,gBAAgB,EAChB,eAAe,GAChB,MAAM,SAAS,CAAC"}
|