ctxpkg 0.0.1 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +1 -1
- package/bin/daemon.js +1 -1
- package/dist/agent/agent.d.ts +65 -0
- package/dist/agent/agent.d.ts.map +1 -0
- package/dist/agent/agent.js +291 -0
- package/dist/agent/agent.js.map +1 -0
- package/dist/agent/agent.prompts.d.ts +13 -0
- package/dist/agent/agent.prompts.d.ts.map +1 -0
- package/{src/agent/agent.prompts.ts → dist/agent/agent.prompts.js} +11 -12
- package/dist/agent/agent.prompts.js.map +1 -0
- package/dist/agent/agent.test-runner.d.ts +73 -0
- package/dist/agent/agent.test-runner.d.ts.map +1 -0
- package/dist/agent/agent.test-runner.js +316 -0
- package/dist/agent/agent.test-runner.js.map +1 -0
- package/dist/agent/agent.test-runner.schemas.d.ts +382 -0
- package/dist/agent/agent.test-runner.schemas.d.ts.map +1 -0
- package/dist/agent/agent.test-runner.schemas.js +110 -0
- package/dist/agent/agent.test-runner.schemas.js.map +1 -0
- package/dist/agent/agent.types.d.ts +122 -0
- package/dist/agent/agent.types.d.ts.map +1 -0
- package/dist/agent/agent.types.js +19 -0
- package/dist/agent/agent.types.js.map +1 -0
- package/dist/backend/backend.d.ts +16 -0
- package/dist/backend/backend.d.ts.map +1 -0
- package/dist/backend/backend.js +79 -0
- package/dist/backend/backend.js.map +1 -0
- package/dist/backend/backend.protocol.d.ts +74 -0
- package/dist/backend/backend.protocol.d.ts.map +1 -0
- package/dist/backend/backend.protocol.js +46 -0
- package/dist/backend/backend.protocol.js.map +1 -0
- package/dist/backend/backend.schemas.d.ts +141 -0
- package/dist/backend/backend.schemas.d.ts.map +1 -0
- package/dist/backend/backend.schemas.js +59 -0
- package/dist/backend/backend.schemas.js.map +1 -0
- package/dist/backend/backend.services.d.ts +290 -0
- package/dist/backend/backend.services.d.ts.map +1 -0
- package/dist/backend/backend.services.js +103 -0
- package/dist/backend/backend.services.js.map +1 -0
- package/dist/backend/backend.types.d.ts +25 -0
- package/dist/backend/backend.types.d.ts.map +1 -0
- package/dist/backend/backend.types.js +6 -0
- package/dist/backend/backend.types.js.map +1 -0
- package/dist/cli/cli.agent.d.ts +4 -0
- package/dist/cli/cli.agent.d.ts.map +1 -0
- package/dist/cli/cli.agent.js +158 -0
- package/dist/cli/cli.agent.js.map +1 -0
- package/dist/cli/cli.chat.d.ts +4 -0
- package/dist/cli/cli.chat.d.ts.map +1 -0
- package/dist/cli/cli.chat.js +311 -0
- package/dist/cli/cli.chat.js.map +1 -0
- package/dist/cli/cli.client.d.ts +11 -0
- package/dist/cli/cli.client.d.ts.map +1 -0
- package/dist/cli/cli.client.js +40 -0
- package/dist/cli/cli.client.js.map +1 -0
- package/dist/cli/cli.collections.d.ts +4 -0
- package/dist/cli/cli.collections.d.ts.map +1 -0
- package/dist/cli/cli.collections.js +411 -0
- package/dist/cli/cli.collections.js.map +1 -0
- package/dist/cli/cli.config.d.ts +4 -0
- package/dist/cli/cli.config.d.ts.map +1 -0
- package/dist/cli/cli.config.js +192 -0
- package/dist/cli/cli.config.js.map +1 -0
- package/dist/cli/cli.d.ts +4 -0
- package/dist/cli/cli.d.ts.map +1 -0
- package/dist/cli/cli.daemon.d.ts +4 -0
- package/dist/cli/cli.daemon.d.ts.map +1 -0
- package/dist/cli/cli.daemon.js +116 -0
- package/dist/cli/cli.daemon.js.map +1 -0
- package/dist/cli/cli.documents.d.ts +4 -0
- package/dist/cli/cli.documents.d.ts.map +1 -0
- package/dist/cli/cli.documents.js +332 -0
- package/dist/cli/cli.documents.js.map +1 -0
- package/dist/cli/cli.js +23 -0
- package/dist/cli/cli.js.map +1 -0
- package/dist/cli/cli.mcp.d.ts +4 -0
- package/dist/cli/cli.mcp.d.ts.map +1 -0
- package/dist/cli/cli.mcp.js +146 -0
- package/dist/cli/cli.mcp.js.map +1 -0
- package/dist/cli/cli.utils.d.ts +51 -0
- package/dist/cli/cli.utils.d.ts.map +1 -0
- package/dist/cli/cli.utils.js +95 -0
- package/dist/cli/cli.utils.js.map +1 -0
- package/dist/client/client.adapters.d.ts +38 -0
- package/dist/client/client.adapters.d.ts.map +1 -0
- package/dist/client/client.adapters.js +233 -0
- package/dist/client/client.adapters.js.map +1 -0
- package/dist/client/client.d.ts +16 -0
- package/dist/client/client.d.ts.map +1 -0
- package/dist/client/client.js +74 -0
- package/dist/client/client.js.map +1 -0
- package/dist/client/client.types.d.ts +10 -0
- package/dist/client/client.types.d.ts.map +1 -0
- package/dist/client/client.types.js +2 -0
- package/dist/client/client.types.js.map +1 -0
- package/dist/collections/collections.d.ts +219 -0
- package/dist/collections/collections.d.ts.map +1 -0
- package/dist/collections/collections.js +933 -0
- package/dist/collections/collections.js.map +1 -0
- package/dist/collections/collections.schemas.d.ts +298 -0
- package/dist/collections/collections.schemas.d.ts.map +1 -0
- package/dist/collections/collections.schemas.js +117 -0
- package/dist/collections/collections.schemas.js.map +1 -0
- package/dist/config/config.d.ts +29 -0
- package/dist/config/config.d.ts.map +1 -0
- package/dist/config/config.js +112 -0
- package/dist/config/config.js.map +1 -0
- package/dist/daemon/daemon.config.d.ts +6 -0
- package/dist/daemon/daemon.config.d.ts.map +1 -0
- package/dist/daemon/daemon.config.js +19 -0
- package/dist/daemon/daemon.config.js.map +1 -0
- package/dist/daemon/daemon.d.ts +10 -0
- package/dist/daemon/daemon.d.ts.map +1 -0
- package/dist/daemon/daemon.js +173 -0
- package/dist/daemon/daemon.js.map +1 -0
- package/dist/daemon/daemon.manager.d.ts +20 -0
- package/dist/daemon/daemon.manager.d.ts.map +1 -0
- package/dist/daemon/daemon.manager.js +176 -0
- package/dist/daemon/daemon.manager.js.map +1 -0
- package/dist/daemon/daemon.schemas.d.ts +38 -0
- package/dist/daemon/daemon.schemas.d.ts.map +1 -0
- package/dist/daemon/daemon.schemas.js +15 -0
- package/dist/daemon/daemon.schemas.js.map +1 -0
- package/dist/database/database.d.ts +10 -0
- package/dist/database/database.d.ts.map +1 -0
- package/dist/database/database.js +52 -0
- package/dist/database/database.js.map +1 -0
- package/dist/database/migrations/migrations.001-init.d.ts +9 -0
- package/dist/database/migrations/migrations.001-init.d.ts.map +1 -0
- package/dist/database/migrations/migrations.001-init.js +46 -0
- package/dist/database/migrations/migrations.001-init.js.map +1 -0
- package/dist/database/migrations/migrations.002-fts5.d.ts +11 -0
- package/dist/database/migrations/migrations.002-fts5.d.ts.map +1 -0
- package/dist/database/migrations/migrations.002-fts5.js +29 -0
- package/dist/database/migrations/migrations.002-fts5.js.map +1 -0
- package/dist/database/migrations/migrations.d.ts +11 -0
- package/dist/database/migrations/migrations.d.ts.map +1 -0
- package/dist/database/migrations/migrations.js +14 -0
- package/dist/database/migrations/migrations.js.map +1 -0
- package/dist/database/migrations/migrations.types.d.ts +8 -0
- package/dist/database/migrations/migrations.types.d.ts.map +1 -0
- package/dist/database/migrations/migrations.types.js +2 -0
- package/dist/database/migrations/migrations.types.js.map +1 -0
- package/dist/documents/documents.d.ts +58 -0
- package/dist/documents/documents.d.ts.map +1 -0
- package/dist/documents/documents.js +597 -0
- package/dist/documents/documents.js.map +1 -0
- package/dist/documents/documents.schemas.d.ts +418 -0
- package/dist/documents/documents.schemas.d.ts.map +1 -0
- package/dist/documents/documents.schemas.js +111 -0
- package/dist/documents/documents.schemas.js.map +1 -0
- package/dist/embedder/embedder.d.ts +22 -0
- package/dist/embedder/embedder.d.ts.map +1 -0
- package/dist/embedder/embedder.js +46 -0
- package/dist/embedder/embedder.js.map +1 -0
- package/dist/exports.d.ts +2 -0
- package/dist/exports.d.ts.map +1 -0
- package/dist/exports.js +2 -0
- package/dist/exports.js.map +1 -0
- package/dist/mcp/mcp.d.ts +44 -0
- package/dist/mcp/mcp.d.ts.map +1 -0
- package/dist/mcp/mcp.js +62 -0
- package/dist/mcp/mcp.js.map +1 -0
- package/dist/tools/agent/agent.d.ts +14 -0
- package/dist/tools/agent/agent.d.ts.map +1 -0
- package/dist/tools/agent/agent.js +31 -0
- package/dist/tools/agent/agent.js.map +1 -0
- package/dist/tools/documents/documents.d.ts +28 -0
- package/dist/tools/documents/documents.d.ts.map +1 -0
- package/dist/tools/documents/documents.js +336 -0
- package/dist/tools/documents/documents.js.map +1 -0
- package/dist/tools/tools.langchain.d.ts +11 -0
- package/dist/tools/tools.langchain.d.ts.map +1 -0
- package/dist/tools/tools.langchain.js +30 -0
- package/dist/tools/tools.langchain.js.map +1 -0
- package/dist/tools/tools.mcp.d.ts +12 -0
- package/dist/tools/tools.mcp.d.ts.map +1 -0
- package/dist/tools/tools.mcp.js +38 -0
- package/dist/tools/tools.mcp.js.map +1 -0
- package/{src/tools/tools.types.ts → dist/tools/tools.types.d.ts} +10 -16
- package/dist/tools/tools.types.d.ts.map +1 -0
- package/dist/tools/tools.types.js +7 -0
- package/dist/tools/tools.types.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/dist/utils/utils.services.d.ts +14 -0
- package/dist/utils/utils.services.d.ts.map +1 -0
- package/dist/utils/utils.services.js +33 -0
- package/dist/utils/utils.services.js.map +1 -0
- package/package.json +5 -2
- package/src/agent/AGENTS.md +0 -249
- package/src/agent/agent.test-runner.schemas.ts +0 -158
- package/src/agent/agent.test-runner.ts +0 -436
- package/src/agent/agent.ts +0 -371
- package/src/agent/agent.types.ts +0 -94
- package/src/backend/AGENTS.md +0 -112
- package/src/backend/backend.protocol.ts +0 -95
- package/src/backend/backend.schemas.ts +0 -123
- package/src/backend/backend.services.ts +0 -151
- package/src/backend/backend.ts +0 -111
- package/src/backend/backend.types.ts +0 -34
- package/src/cli/AGENTS.md +0 -213
- package/src/cli/cli.agent.ts +0 -197
- package/src/cli/cli.chat.ts +0 -369
- package/src/cli/cli.client.ts +0 -55
- package/src/cli/cli.collections.ts +0 -491
- package/src/cli/cli.config.ts +0 -252
- package/src/cli/cli.daemon.ts +0 -160
- package/src/cli/cli.documents.ts +0 -413
- package/src/cli/cli.mcp.ts +0 -177
- package/src/cli/cli.ts +0 -28
- package/src/cli/cli.utils.ts +0 -122
- package/src/client/AGENTS.md +0 -135
- package/src/client/client.adapters.ts +0 -279
- package/src/client/client.ts +0 -86
- package/src/client/client.types.ts +0 -17
- package/src/collections/AGENTS.md +0 -185
- package/src/collections/collections.schemas.ts +0 -195
- package/src/collections/collections.ts +0 -1160
- package/src/config/config.ts +0 -118
- package/src/daemon/AGENTS.md +0 -168
- package/src/daemon/daemon.config.ts +0 -23
- package/src/daemon/daemon.manager.ts +0 -215
- package/src/daemon/daemon.schemas.ts +0 -22
- package/src/daemon/daemon.ts +0 -205
- package/src/database/AGENTS.md +0 -211
- package/src/database/database.ts +0 -64
- package/src/database/migrations/migrations.001-init.ts +0 -56
- package/src/database/migrations/migrations.002-fts5.ts +0 -32
- package/src/database/migrations/migrations.ts +0 -20
- package/src/database/migrations/migrations.types.ts +0 -9
- package/src/documents/AGENTS.md +0 -301
- package/src/documents/documents.schemas.ts +0 -190
- package/src/documents/documents.ts +0 -734
- package/src/embedder/embedder.ts +0 -53
- package/src/exports.ts +0 -0
- package/src/mcp/AGENTS.md +0 -264
- package/src/mcp/mcp.ts +0 -105
- package/src/tools/AGENTS.md +0 -228
- package/src/tools/agent/agent.ts +0 -45
- package/src/tools/documents/documents.ts +0 -401
- package/src/tools/tools.langchain.ts +0 -37
- package/src/tools/tools.mcp.ts +0 -46
- package/src/utils/utils.services.ts +0 -46
|
@@ -1,436 +0,0 @@
|
|
|
1
|
-
import { readFile } from 'node:fs/promises';
|
|
2
|
-
import { dirname, resolve } from 'node:path';
|
|
3
|
-
|
|
4
|
-
import { parse as parseYaml } from 'yaml';
|
|
5
|
-
|
|
6
|
-
import { createDocumentAgent, getLLMConfigFromAppConfig } from './agent.ts';
|
|
7
|
-
import type { LLMConfig } from './agent.types.ts';
|
|
8
|
-
import {
|
|
9
|
-
testSuiteSchema,
|
|
10
|
-
type TestCase,
|
|
11
|
-
type TestResult,
|
|
12
|
-
type TestRunResult,
|
|
13
|
-
type TestSuite,
|
|
14
|
-
type ValidationMode,
|
|
15
|
-
} from './agent.test-runner.schemas.ts';
|
|
16
|
-
|
|
17
|
-
import type { BackendClient } from '#root/client/client.ts';
|
|
18
|
-
import { createClient } from '#root/client/client.ts';
|
|
19
|
-
import { EmbedderService } from '#root/embedder/embedder.ts';
|
|
20
|
-
import { Services, destroy } from '#root/utils/utils.services.ts';
|
|
21
|
-
|
|
22
|
-
/**
 * Callback for test progress updates.
 *
 * Invoked synchronously by the runner with one event per lifecycle step; its
 * return value is ignored.
 */
type TestProgressCallback = (event: TestProgressEvent) => void;
|
|
26
|
-
|
|
27
|
-
/**
 * Progress events emitted while a suite runs, discriminated by `type`.
 *
 * `runTestSuite` emits them in this order: `suite_start`, `sync_start`,
 * `sync_complete`, then a `test_start` / `test_complete` pair for every test
 * (`index` is the zero-based position in `suite.tests`), and finally
 * `suite_complete` carrying the aggregated run result.
 */
type TestProgressEvent =
  | { type: 'suite_start'; suiteName: string; totalTests: number }
  | { type: 'sync_start' }
  | { type: 'sync_complete' }
  | { type: 'test_start'; testId: string; index: number }
  | { type: 'test_complete'; testId: string; result: TestResult }
  | { type: 'suite_complete'; result: TestRunResult };
|
|
34
|
-
|
|
35
|
-
/**
 * Options for running a test suite.
 *
 * Values given here take precedence over per-test settings, which in turn take
 * precedence over the suite-level `options`.
 */
type TestRunnerOptions = {
  /** LLM configuration (defaults to app config) */
  llmConfig?: LLMConfig;
  /** Progress callback */
  onProgress?: TestProgressCallback;
  /** Override validation mode for all tests */
  validationMode?: ValidationMode;
  /** Override pass threshold for all tests */
  passThreshold?: number;
  /** Model to use for LLM validation (defaults to llmConfig.model) */
  validationModel?: string;
  /**
   * Working directory passed to collection sync when the suite's collections are synced.
   * Defaults to `process.cwd()`; pass the `baseDir` returned by `loadTestSuite`
   * to use the test file's own directory instead.
   */
  baseDir?: string;
};
|
|
52
|
-
|
|
53
|
-
/**
 * Prompt template for the LLM-as-judge validation mode.
 *
 * Placeholders filled in before the prompt is sent:
 * - `{expected}`     the expected answer / criteria of the test case
 * - `{actual}`       the answer produced by the agent
 * - `{instructions}` validation instructions (test, suite or default)
 * - `{threshold}`    the pass threshold the judge should apply to `passed`
 *
 * Each placeholder appears exactly once. `{threshold}` was previously missing,
 * so the judge was asked to compare the score against a threshold it was never
 * given.
 */
const LLM_VALIDATION_PROMPT = `You are evaluating an AI agent's answer against expected criteria.

## Expected Answer / Criteria
{expected}

## Actual Answer
{actual}

## Validation Instructions
{instructions}

## Task
Evaluate how well the actual answer meets the expected criteria. Consider:
- Does it address the key points?
- Is the information accurate (based on what was expected)?
- Is it appropriately detailed?

Respond with a JSON object:
\`\`\`json
{
  "score": <0.0 to 1.0>,
  "passed": <true if score >= {threshold}>,
  "reasoning": "<brief explanation of your evaluation>"
}
\`\`\``;
|
|
81
|
-
|
|
82
|
-
/**
 * Instructions given to the LLM judge when neither the test case nor the
 * suite options supply their own.
 */
const DEFAULT_VALIDATION_INSTRUCTIONS = [
  'Evaluate whether the actual answer adequately addresses the expected criteria.',
  'Focus on factual correctness and completeness rather than exact wording.',
].join('\n');
|
|
84
|
-
|
|
85
|
-
/**
 * Test runner service for validating agent performance.
 *
 * Loads a YAML test suite, syncs the suite's collections through a direct-mode
 * backend client, asks the document agent every test query and scores each
 * answer with one of three validation modes:
 * - `semantic` (default): cosine similarity between embeddings of the expected
 *   and actual answers
 * - `keywords`: fraction of the configured keywords found in the answer
 *   (case-insensitive substring match)
 * - `llm`: an LLM judges the answer and returns a score
 */
class AgentTestRunner {
  #services: Services;
  #embedder: EmbedderService;

  constructor() {
    this.#services = new Services();
    this.#embedder = this.#services.get(EmbedderService);
  }

  /**
   * Compute collection ID from spec URL (mirrors CollectionsService.computeCollectionId).
   * Trailing slashes are stripped before the `pkg:` prefix is applied.
   */
  #computeCollectionId(url: string): string {
    const normalizedUrl = url.replace(/\/+$/, '');
    return `pkg:${normalizedUrl}`;
  }

  /**
   * Load and parse a test suite from a YAML file.
   *
   * @param filePath Path to the YAML test file.
   * @returns The schema-validated suite and the absolute directory containing
   *   the file (suitable as `TestRunnerOptions.baseDir`).
   * @throws If the file cannot be read or `testSuiteSchema.parse` rejects its
   *   contents.
   */
  async loadTestSuite(filePath: string): Promise<{ suite: TestSuite; baseDir: string }> {
    const content = await readFile(filePath, 'utf-8');
    const parsed = parseYaml(content);
    const suite = testSuiteSchema.parse(parsed);
    const baseDir = dirname(resolve(filePath));
    return { suite, baseDir };
  }

  /**
   * Run a complete test suite.
   *
   * Syncs every collection declared by the suite, creates an agent restricted
   * to those collections, runs the tests sequentially and aggregates the
   * results. The backend client is always disconnected, even when syncing or
   * a test throws.
   *
   * @param suite Parsed test suite.
   * @param options Runner overrides and progress callback.
   * @returns Timing information, a pass/fail/skip summary and per-test results.
   */
  async runTestSuite(suite: TestSuite, options: TestRunnerOptions = {}): Promise<TestRunResult> {
    const { onProgress, llmConfig: providedLlmConfig, baseDir = process.cwd() } = options;
    const startedAt = new Date().toISOString();
    const startTime = Date.now();

    // Get LLM config
    const llmConfig = providedLlmConfig ?? (await getLLMConfigFromAppConfig());

    onProgress?.({ type: 'suite_start', suiteName: suite.name, totalTests: suite.tests.length });

    const results: TestResult[] = [];

    // Create client using direct mode (uses existing database)
    const client = await createClient({ mode: 'direct' });

    try {
      // Sync collections from test suite
      onProgress?.({ type: 'sync_start' });

      // Build alias map for test suite collections only
      const aliasMap = new Map<string, string>();

      for (const [alias, spec] of Object.entries(suite.collections)) {
        // Compute collection ID (same as CollectionsService.computeCollectionId)
        const collectionId = this.#computeCollectionId(spec.url);
        aliasMap.set(alias, collectionId);

        // Sync the collection
        await client.collections.sync({
          name: alias,
          spec,
          cwd: baseDir,
        });
      }

      onProgress?.({ type: 'sync_complete' });

      // Create agent with only the test suite's collections
      const agent = createDocumentAgent({
        client,
        llmConfig,
        aliasMap,
        // Restrict searches to only the test suite's collections
        collections: Array.from(aliasMap.values()),
      });

      // Run each test sequentially so progress events stay ordered
      for (const [index, testCase] of suite.tests.entries()) {
        onProgress?.({ type: 'test_start', testId: testCase.id, index });

        const result = await this.#runSingleTest(testCase, agent, client, llmConfig, suite.options, options);
        results.push(result);

        onProgress?.({ type: 'test_complete', testId: testCase.id, result });
      }
    } finally {
      await client.disconnect();
    }

    const completedAt = new Date().toISOString();
    const durationMs = Date.now() - startTime;

    // Skipped tests carry `passed: false`, so they are excluded from both
    // the passed and the failed counts.
    const summary = {
      total: results.length,
      passed: results.filter((r) => r.passed && !r.skipped).length,
      failed: results.filter((r) => !r.passed && !r.skipped).length,
      skipped: results.filter((r) => r.skipped).length,
    };

    const runResult: TestRunResult = {
      suiteName: suite.name,
      startedAt,
      completedAt,
      durationMs,
      summary,
      results,
    };

    onProgress?.({ type: 'suite_complete', result: runResult });

    return runResult;
  }

  /**
   * Run a single test case.
   *
   * Never throws: any error raised while asking the agent or validating the
   * answer is converted into a failed result whose `error` holds the message.
   * Runner options override test-case settings, which override suite options.
   */
  async #runSingleTest(
    testCase: TestCase,
    agent: ReturnType<typeof createDocumentAgent>,
    client: BackendClient,
    llmConfig: LLMConfig,
    suiteOptions: TestSuite['options'],
    runnerOptions: TestRunnerOptions,
  ): Promise<TestResult> {
    const startTime = Date.now();

    // Check if skipped
    if (testCase.skip) {
      return {
        id: testCase.id,
        passed: false,
        skipped: true,
        actualAnswer: '',
        durationMs: 0,
      };
    }

    // Declared outside the try so that an answer already obtained from the
    // agent is still reported when validation itself fails.
    let actualAnswer = '';

    try {
      // Get the agent's answer
      const response = await agent.ask(testCase.query, testCase.useCase);
      actualAnswer = response.answer;

      // Determine validation mode
      const validationMode =
        runnerOptions.validationMode ?? testCase.validationMode ?? suiteOptions?.validationMode ?? 'semantic';

      // Determine pass threshold
      const passThreshold =
        runnerOptions.passThreshold ?? testCase.passThreshold ?? suiteOptions?.passThreshold ?? 0.75;

      // Validate based on mode
      let result: TestResult;

      switch (validationMode) {
        case 'keywords':
          result = await this.#validateKeywords(testCase, actualAnswer, passThreshold);
          break;
        case 'llm':
          result = await this.#validateWithLLM(
            testCase,
            actualAnswer,
            passThreshold,
            llmConfig,
            suiteOptions,
            runnerOptions.validationModel,
          );
          break;
        case 'semantic':
        default:
          result = await this.#validateSemantic(testCase, actualAnswer, passThreshold);
          break;
      }

      // Validators return a placeholder duration; the real one covers the
      // agent call plus validation.
      result.durationMs = Date.now() - startTime;
      return result;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        id: testCase.id,
        passed: false,
        actualAnswer,
        error: message,
        durationMs: Date.now() - startTime,
      };
    }
  }

  /**
   * Validate using semantic similarity.
   *
   * @throws If the embedder does not return an embedding for both texts, or
   *   the embeddings have different dimensions.
   */
  async #validateSemantic(testCase: TestCase, actualAnswer: string, passThreshold: number): Promise<TestResult> {
    // Embed both expected and actual as documents (not queries)
    const embeddings = await this.#embedder.createDocumentEmbeddings([testCase.expected, actualAnswer]);
    const [expectedEmbedding, actualEmbedding] = embeddings;

    if (expectedEmbedding === undefined || actualEmbedding === undefined) {
      throw new Error('Embedder returned fewer embeddings than requested');
    }

    // Compute cosine similarity
    const similarity = this.#cosineSimilarity(expectedEmbedding, actualEmbedding);

    return {
      id: testCase.id,
      passed: similarity >= passThreshold,
      score: similarity,
      actualAnswer,
      reasoning: `Semantic similarity: ${(similarity * 100).toFixed(1)}% (threshold: ${(passThreshold * 100).toFixed(1)}%)`,
      durationMs: 0,
    };
  }

  /**
   * Validate using keyword matching.
   *
   * The score is the fraction of `testCase.keywords` that occur in the answer
   * as case-insensitive substrings. A test without keywords fails with an
   * explanatory error instead of dividing by zero.
   */
  async #validateKeywords(testCase: TestCase, actualAnswer: string, passThreshold: number): Promise<TestResult> {
    const keywords = testCase.keywords ?? [];

    if (keywords.length === 0) {
      return {
        id: testCase.id,
        passed: false,
        actualAnswer,
        error: 'No keywords specified for keywords validation mode',
        durationMs: 0,
      };
    }

    const lowerAnswer = actualAnswer.toLowerCase();
    const found: string[] = [];
    const missing: string[] = [];

    for (const keyword of keywords) {
      if (lowerAnswer.includes(keyword.toLowerCase())) {
        found.push(keyword);
      } else {
        missing.push(keyword);
      }
    }

    const score = found.length / keywords.length;

    return {
      id: testCase.id,
      passed: score >= passThreshold,
      score,
      actualAnswer,
      keywordsFound: found,
      keywordsMissing: missing,
      reasoning: `Found ${found.length}/${keywords.length} keywords (${(score * 100).toFixed(1)}%)`,
      durationMs: 0,
    };
  }

  /**
   * Validate using LLM as judge.
   *
   * Fills the validation prompt, asks the model (temperature 0) and parses the
   * JSON object from its reply. `passed` is decided locally from the returned
   * score and `passThreshold`; the model's own `passed` field is ignored. An
   * unparseable reply yields a failed result with the raw reply as reasoning.
   */
  async #validateWithLLM(
    testCase: TestCase,
    actualAnswer: string,
    passThreshold: number,
    llmConfig: LLMConfig,
    suiteOptions: TestSuite['options'],
    validationModel?: string,
  ): Promise<TestResult> {
    const { ChatOpenAI } = await import('@langchain/openai');
    const { HumanMessage } = await import('@langchain/core/messages');

    const llm = new ChatOpenAI({
      configuration: { baseURL: llmConfig.provider },
      modelName: validationModel ?? llmConfig.model,
      apiKey: llmConfig.apiKey,
      temperature: 0,
    });

    const instructions =
      testCase.validationInstructions ?? suiteOptions?.validationInstructions ?? DEFAULT_VALIDATION_INSTRUCTIONS;

    // Substitute all placeholders in one pass with a replacer function:
    // - inserted text is taken literally (a string replacement would interpret
    //   `$&`, `$'`, `$1`, ... inside answers), and
    // - placeholder-looking text inside an inserted value is never
    //   substituted again.
    const substitutions: Record<string, string> = {
      expected: testCase.expected,
      actual: actualAnswer,
      instructions,
      threshold: passThreshold.toString(),
    };
    const prompt = LLM_VALIDATION_PROMPT.replace(
      /\{(expected|actual|instructions|threshold)\}/g,
      (placeholder: string, key: string) => substitutions[key] ?? placeholder,
    );

    const response = await llm.invoke([new HumanMessage(prompt)]);
    const content = typeof response.content === 'string' ? response.content : JSON.stringify(response.content);

    // Parse JSON response: prefer a fenced ```json block, otherwise take the
    // outermost brace-delimited span.
    const jsonMatch = content.match(/```json\s*([\s\S]*?)\s*```/) ?? content.match(/\{[\s\S]*\}/);

    if (jsonMatch) {
      try {
        const parsed = JSON.parse(jsonMatch[1] ?? jsonMatch[0]);
        const score = Number(parsed.score) || 0;

        return {
          id: testCase.id,
          passed: score >= passThreshold,
          score,
          actualAnswer,
          reasoning: parsed.reasoning ?? 'No reasoning provided',
          durationMs: 0,
        };
      } catch {
        // Malformed JSON: fall through to the parse-failure result below
      }
    }

    return {
      id: testCase.id,
      passed: false,
      actualAnswer,
      error: 'Failed to parse LLM validation response',
      reasoning: content,
      durationMs: 0,
    };
  }

  /**
   * Compute cosine similarity between two vectors.
   *
   * @returns `dot(a, b) / (|a| * |b|)`, or `0` when either vector has zero
   *   magnitude (including empty vectors) so callers never receive `NaN`.
   * @throws If the vectors have different lengths.
   */
  #cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      throw new Error(`Cannot compare embeddings of different dimensions (${a.length} vs ${b.length})`);
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Clean up resources held by the runner's service container.
   */
  async [destroy](): Promise<void> {
    await this.#services.destroy();
  }
}
|
|
427
|
-
|
|
428
|
-
/**
 * Factory for a fresh, independent `AgentTestRunner`.
 */
const createTestRunner = (): AgentTestRunner => new AgentTestRunner();
|
|
434
|
-
|
|
435
|
-
export { AgentTestRunner, createTestRunner };
|
|
436
|
-
export type { TestProgressCallback, TestProgressEvent, TestRunnerOptions };
|