cognitive-core 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +363 -2
- package/SKILL.md +193 -0
- package/dist/agents/index.d.ts +3 -0
- package/dist/agents/index.d.ts.map +1 -0
- package/dist/agents/index.js +5 -0
- package/dist/agents/index.js.map +1 -0
- package/dist/agents/mock-provider.d.ts +23 -0
- package/dist/agents/mock-provider.d.ts.map +1 -0
- package/dist/agents/mock-provider.js +71 -0
- package/dist/agents/mock-provider.js.map +1 -0
- package/dist/agents/types.d.ts +98 -0
- package/dist/agents/types.d.ts.map +1 -0
- package/dist/agents/types.js +44 -0
- package/dist/agents/types.js.map +1 -0
- package/dist/atlas.d.ts +196 -0
- package/dist/atlas.d.ts.map +1 -0
- package/dist/atlas.js +373 -0
- package/dist/atlas.js.map +1 -0
- package/dist/bin/cognitive-core.d.ts +18 -0
- package/dist/bin/cognitive-core.d.ts.map +1 -0
- package/dist/bin/cognitive-core.js +419 -0
- package/dist/bin/cognitive-core.js.map +1 -0
- package/dist/embeddings/bm25.d.ts +104 -0
- package/dist/embeddings/bm25.d.ts.map +1 -0
- package/dist/embeddings/bm25.js +264 -0
- package/dist/embeddings/bm25.js.map +1 -0
- package/dist/embeddings/index.d.ts +12 -0
- package/dist/embeddings/index.d.ts.map +1 -0
- package/dist/embeddings/index.js +16 -0
- package/dist/embeddings/index.js.map +1 -0
- package/dist/embeddings/manager.d.ts +112 -0
- package/dist/embeddings/manager.d.ts.map +1 -0
- package/dist/embeddings/manager.js +215 -0
- package/dist/embeddings/manager.js.map +1 -0
- package/dist/embeddings/provider.d.ts +101 -0
- package/dist/embeddings/provider.d.ts.map +1 -0
- package/dist/embeddings/provider.js +232 -0
- package/dist/embeddings/provider.js.map +1 -0
- package/dist/embeddings/vector-store.d.ts +101 -0
- package/dist/embeddings/vector-store.d.ts.map +1 -0
- package/dist/embeddings/vector-store.js +256 -0
- package/dist/embeddings/vector-store.js.map +1 -0
- package/dist/factory.d.ts +193 -0
- package/dist/factory.d.ts.map +1 -0
- package/dist/factory.js +109 -0
- package/dist/factory.js.map +1 -0
- package/dist/index.d.ts +43 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +84 -0
- package/dist/index.js.map +1 -0
- package/dist/learning/analyzer.d.ts +110 -0
- package/dist/learning/analyzer.d.ts.map +1 -0
- package/dist/learning/analyzer.js +213 -0
- package/dist/learning/analyzer.js.map +1 -0
- package/dist/learning/effectiveness.d.ts +158 -0
- package/dist/learning/effectiveness.d.ts.map +1 -0
- package/dist/learning/effectiveness.js +251 -0
- package/dist/learning/effectiveness.js.map +1 -0
- package/dist/learning/index.d.ts +8 -0
- package/dist/learning/index.d.ts.map +1 -0
- package/dist/learning/index.js +11 -0
- package/dist/learning/index.js.map +1 -0
- package/dist/learning/llm-extractor.d.ts +88 -0
- package/dist/learning/llm-extractor.d.ts.map +1 -0
- package/dist/learning/llm-extractor.js +372 -0
- package/dist/learning/llm-extractor.js.map +1 -0
- package/dist/learning/meta-learner.d.ts +80 -0
- package/dist/learning/meta-learner.d.ts.map +1 -0
- package/dist/learning/meta-learner.js +355 -0
- package/dist/learning/meta-learner.js.map +1 -0
- package/dist/learning/pipeline.d.ts +65 -0
- package/dist/learning/pipeline.d.ts.map +1 -0
- package/dist/learning/pipeline.js +170 -0
- package/dist/learning/pipeline.js.map +1 -0
- package/dist/learning/playbook-extractor.d.ts +113 -0
- package/dist/learning/playbook-extractor.d.ts.map +1 -0
- package/dist/learning/playbook-extractor.js +523 -0
- package/dist/learning/playbook-extractor.js.map +1 -0
- package/dist/learning/usage-inference.d.ts +82 -0
- package/dist/learning/usage-inference.d.ts.map +1 -0
- package/dist/learning/usage-inference.js +261 -0
- package/dist/learning/usage-inference.js.map +1 -0
- package/dist/mcp/index.d.ts +6 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +6 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/playbook-server.d.ts +120 -0
- package/dist/mcp/playbook-server.d.ts.map +1 -0
- package/dist/mcp/playbook-server.js +427 -0
- package/dist/mcp/playbook-server.js.map +1 -0
- package/dist/memory/curated-loader.d.ts +62 -0
- package/dist/memory/curated-loader.d.ts.map +1 -0
- package/dist/memory/curated-loader.js +106 -0
- package/dist/memory/curated-loader.js.map +1 -0
- package/dist/memory/experience.d.ts +122 -0
- package/dist/memory/experience.d.ts.map +1 -0
- package/dist/memory/experience.js +392 -0
- package/dist/memory/experience.js.map +1 -0
- package/dist/memory/index.d.ts +6 -0
- package/dist/memory/index.d.ts.map +1 -0
- package/dist/memory/index.js +9 -0
- package/dist/memory/index.js.map +1 -0
- package/dist/memory/meta.d.ts +90 -0
- package/dist/memory/meta.d.ts.map +1 -0
- package/dist/memory/meta.js +362 -0
- package/dist/memory/meta.js.map +1 -0
- package/dist/memory/playbook.d.ts +133 -0
- package/dist/memory/playbook.d.ts.map +1 -0
- package/dist/memory/playbook.js +357 -0
- package/dist/memory/playbook.js.map +1 -0
- package/dist/memory/system.d.ts +167 -0
- package/dist/memory/system.d.ts.map +1 -0
- package/dist/memory/system.js +383 -0
- package/dist/memory/system.js.map +1 -0
- package/dist/runtime/backends/acp.d.ts +67 -0
- package/dist/runtime/backends/acp.d.ts.map +1 -0
- package/dist/runtime/backends/acp.js +290 -0
- package/dist/runtime/backends/acp.js.map +1 -0
- package/dist/runtime/backends/index.d.ts +5 -0
- package/dist/runtime/backends/index.d.ts.map +1 -0
- package/dist/runtime/backends/index.js +6 -0
- package/dist/runtime/backends/index.js.map +1 -0
- package/dist/runtime/backends/mock.d.ts +67 -0
- package/dist/runtime/backends/mock.d.ts.map +1 -0
- package/dist/runtime/backends/mock.js +153 -0
- package/dist/runtime/backends/mock.js.map +1 -0
- package/dist/runtime/backends/subprocess.d.ts +56 -0
- package/dist/runtime/backends/subprocess.d.ts.map +1 -0
- package/dist/runtime/backends/subprocess.js +260 -0
- package/dist/runtime/backends/subprocess.js.map +1 -0
- package/dist/runtime/flows/learning.d.ts +73 -0
- package/dist/runtime/flows/learning.d.ts.map +1 -0
- package/dist/runtime/flows/learning.js +116 -0
- package/dist/runtime/flows/learning.js.map +1 -0
- package/dist/runtime/flows/validation.d.ts +122 -0
- package/dist/runtime/flows/validation.d.ts.map +1 -0
- package/dist/runtime/flows/validation.js +223 -0
- package/dist/runtime/flows/validation.js.map +1 -0
- package/dist/runtime/index.d.ts +6 -0
- package/dist/runtime/index.d.ts.map +1 -0
- package/dist/runtime/index.js +8 -0
- package/dist/runtime/index.js.map +1 -0
- package/dist/runtime/manager.d.ts +116 -0
- package/dist/runtime/manager.d.ts.map +1 -0
- package/dist/runtime/manager.js +416 -0
- package/dist/runtime/manager.js.map +1 -0
- package/dist/runtime/types.d.ts +138 -0
- package/dist/runtime/types.d.ts.map +1 -0
- package/dist/runtime/types.js +2 -0
- package/dist/runtime/types.js.map +1 -0
- package/dist/search/evaluator.d.ts +102 -0
- package/dist/search/evaluator.d.ts.map +1 -0
- package/dist/search/evaluator.js +352 -0
- package/dist/search/evaluator.js.map +1 -0
- package/dist/search/index.d.ts +7 -0
- package/dist/search/index.d.ts.map +1 -0
- package/dist/search/index.js +11 -0
- package/dist/search/index.js.map +1 -0
- package/dist/search/refinement-loop.d.ts +73 -0
- package/dist/search/refinement-loop.d.ts.map +1 -0
- package/dist/search/refinement-loop.js +245 -0
- package/dist/search/refinement-loop.js.map +1 -0
- package/dist/search/refinement-types.d.ts +154 -0
- package/dist/search/refinement-types.d.ts.map +1 -0
- package/dist/search/refinement-types.js +99 -0
- package/dist/search/refinement-types.js.map +1 -0
- package/dist/search/router.d.ts +61 -0
- package/dist/search/router.d.ts.map +1 -0
- package/dist/search/router.js +197 -0
- package/dist/search/router.js.map +1 -0
- package/dist/search/solver.d.ts +75 -0
- package/dist/search/solver.d.ts.map +1 -0
- package/dist/search/solver.js +216 -0
- package/dist/search/solver.js.map +1 -0
- package/dist/search/verification-runner.d.ts +125 -0
- package/dist/search/verification-runner.d.ts.map +1 -0
- package/dist/search/verification-runner.js +440 -0
- package/dist/search/verification-runner.js.map +1 -0
- package/dist/surfacing/index.d.ts +2 -0
- package/dist/surfacing/index.d.ts.map +1 -0
- package/dist/surfacing/index.js +2 -0
- package/dist/surfacing/index.js.map +1 -0
- package/dist/surfacing/skill-library.d.ts +158 -0
- package/dist/surfacing/skill-library.d.ts.map +1 -0
- package/dist/surfacing/skill-library.js +429 -0
- package/dist/surfacing/skill-library.js.map +1 -0
- package/dist/types/config.d.ts +1113 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/config.js +274 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/index.d.ts +9 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +14 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/memory.d.ts +339 -0
- package/dist/types/memory.d.ts.map +1 -0
- package/dist/types/memory.js +207 -0
- package/dist/types/memory.js.map +1 -0
- package/dist/types/meta.d.ts +146 -0
- package/dist/types/meta.d.ts.map +1 -0
- package/dist/types/meta.js +51 -0
- package/dist/types/meta.js.map +1 -0
- package/dist/types/outcome.d.ts +42 -0
- package/dist/types/outcome.d.ts.map +1 -0
- package/dist/types/outcome.js +50 -0
- package/dist/types/outcome.js.map +1 -0
- package/dist/types/playbook.d.ts +119 -0
- package/dist/types/playbook.d.ts.map +1 -0
- package/dist/types/playbook.js +71 -0
- package/dist/types/playbook.js.map +1 -0
- package/dist/types/step.d.ts +44 -0
- package/dist/types/step.d.ts.map +1 -0
- package/dist/types/step.js +32 -0
- package/dist/types/step.js.map +1 -0
- package/dist/types/task.d.ts +91 -0
- package/dist/types/task.d.ts.map +1 -0
- package/dist/types/task.js +39 -0
- package/dist/types/task.js.map +1 -0
- package/dist/types/trajectory.d.ts +221 -0
- package/dist/types/trajectory.d.ts.map +1 -0
- package/dist/types/trajectory.js +60 -0
- package/dist/types/trajectory.js.map +1 -0
- package/dist/utils/index.d.ts +4 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +4 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/similarity.d.ts +31 -0
- package/dist/utils/similarity.d.ts.map +1 -0
- package/dist/utils/similarity.js +107 -0
- package/dist/utils/similarity.js.map +1 -0
- package/dist/utils/storage.d.ts +106 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +203 -0
- package/dist/utils/storage.js.map +1 -0
- package/dist/utils/validation.d.ts +129 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +171 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +61 -9
- package/scripts/migrate-to-playbooks.ts +307 -0
- package/src/agents/index.ts +14 -0
- package/src/agents/mock-provider.ts +93 -0
- package/src/agents/types.ts +137 -0
- package/src/atlas.ts +560 -0
- package/src/bin/cognitive-core.ts +470 -0
- package/src/embeddings/bm25.ts +337 -0
- package/src/embeddings/index.ts +39 -0
- package/src/embeddings/manager.ts +288 -0
- package/src/embeddings/provider.ts +311 -0
- package/src/embeddings/vector-store.ts +353 -0
- package/src/factory.ts +263 -0
- package/src/index.ts +246 -0
- package/src/learning/analyzer.ts +335 -0
- package/src/learning/effectiveness.ts +428 -0
- package/src/learning/index.ts +58 -0
- package/src/learning/llm-extractor.ts +542 -0
- package/src/learning/meta-learner.ts +516 -0
- package/src/learning/pipeline.ts +244 -0
- package/src/learning/playbook-extractor.ts +702 -0
- package/src/learning/usage-inference.ts +372 -0
- package/src/mcp/index.ts +12 -0
- package/src/mcp/playbook-server.ts +565 -0
- package/src/memory/curated-loader.ts +160 -0
- package/src/memory/experience.ts +515 -0
- package/src/memory/index.ts +27 -0
- package/src/memory/meta.ts +506 -0
- package/src/memory/playbook.ts +493 -0
- package/src/memory/system.ts +551 -0
- package/src/runtime/backends/acp.ts +378 -0
- package/src/runtime/backends/index.ts +24 -0
- package/src/runtime/backends/mock.ts +218 -0
- package/src/runtime/backends/subprocess.ts +356 -0
- package/src/runtime/flows/learning.ts +183 -0
- package/src/runtime/flows/validation.ts +381 -0
- package/src/runtime/index.ts +53 -0
- package/src/runtime/manager.ts +541 -0
- package/src/runtime/types.ts +157 -0
- package/src/search/evaluator.ts +474 -0
- package/src/search/index.ts +59 -0
- package/src/search/refinement-loop.ts +363 -0
- package/src/search/refinement-types.ts +159 -0
- package/src/search/router.ts +261 -0
- package/src/search/solver.ts +303 -0
- package/src/search/verification-runner.ts +570 -0
- package/src/surfacing/index.ts +6 -0
- package/src/surfacing/skill-library.ts +594 -0
- package/src/types/config.ts +333 -0
- package/src/types/index.ts +130 -0
- package/src/types/memory.ts +270 -0
- package/src/types/meta.ts +218 -0
- package/src/types/outcome.ts +66 -0
- package/src/types/playbook.ts +196 -0
- package/src/types/step.ts +40 -0
- package/src/types/task.ts +52 -0
- package/src/types/trajectory.ts +80 -0
- package/src/utils/index.ts +38 -0
- package/src/utils/similarity.ts +139 -0
- package/src/utils/storage.ts +249 -0
- package/src/utils/validation.ts +286 -0
- package/tests/embeddings/bm25.test.ts +130 -0
- package/tests/embeddings/manager.test.ts +205 -0
- package/tests/integration/atlas.test.ts +266 -0
- package/tests/integration/e2e.test.ts +929 -0
- package/tests/learning/analyzer.test.ts +426 -0
- package/tests/learning/effectiveness.test.ts +542 -0
- package/tests/learning/pipeline.test.ts +176 -0
- package/tests/learning/playbook-extractor-provenance.test.ts +114 -0
- package/tests/learning/usage-inference.test.ts +254 -0
- package/tests/mcp/playbook-server.test.ts +252 -0
- package/tests/memory/experience.test.ts +198 -0
- package/tests/memory/playbook.test.ts +338 -0
- package/tests/memory/provenance.test.ts +639 -0
- package/tests/memory/system.test.ts +325 -0
- package/tests/runtime/agent-manager.test.ts +512 -0
- package/tests/runtime/mock-backend.test.ts +248 -0
- package/tests/search/refinement-loop.test.ts +468 -0
- package/tests/search/refinement.test.ts +267 -0
- package/tests/search/router.test.ts +427 -0
- package/tests/surfacing/skill-library.test.ts +292 -0
- package/tests/types/outcome.test.ts +147 -0
- package/tests/types/step.test.ts +133 -0
- package/tests/types/task.test.ts +158 -0
- package/tests/types/trajectory.test.ts +253 -0
- package/tests/utils/similarity.test.ts +188 -0
- package/tests/utils/validation.test.ts +252 -0
- package/tsconfig.json +25 -0
- package/vitest.config.ts +22 -0
- package/index.d.ts +0 -4
- package/index.js +0 -4
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Solution Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Evaluates solution quality using a hybrid approach:
|
|
5
|
+
* 1. First tries task verification (if available)
|
|
6
|
+
* 2. Falls back to ACP agent evaluation
|
|
7
|
+
* 3. Falls back to heuristic evaluation as last resort
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { Trajectory } from '../types/trajectory.js';
|
|
11
|
+
import type { Task } from '../types/task.js';
|
|
12
|
+
import type { AgentManager } from '../runtime/manager.js';
|
|
13
|
+
import {
|
|
14
|
+
type EvaluationResult,
|
|
15
|
+
createEvaluationResult,
|
|
16
|
+
scoreToQuality,
|
|
17
|
+
} from './refinement-types.js';
|
|
18
|
+
|
|
19
|
+
/**
 * Call signature of a task-specific verifier.
 *
 * A verifier receives the trajectory being judged together with the task it
 * was produced for, and resolves to a {@link VerificationResult}.
 */
export interface VerificationFunction {
  (trajectory: Trajectory, task: Task): Promise<VerificationResult>;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Outcome reported by a {@link VerificationFunction}.
 */
export interface VerificationResult {
  /** True when the solution passed verification. */
  passed: boolean;

  /** How confident the verifier is in its verdict, on a 0-1 scale. */
  confidence: number;

  /** Individual problems the verifier found, if any. */
  issues?: {
    type: 'incomplete' | 'incorrect' | 'error';
    description: string;
    severity?: 'critical' | 'major' | 'minor';
  }[];

  /** Free-form additional details about the verification run. */
  details?: string;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Options accepted by the SolutionEvaluator. Every field is optional.
 */
export interface EvaluatorConfig {
  /** Agent type spawned for agent-based evaluation (default: 'evaluator'). */
  evaluatorAgentType?: string;

  /** Lowest verifier confidence at which a verification verdict is accepted. */
  verificationConfidenceThreshold?: number;

  /** When true, agent evaluation is run even if verification passes. */
  alwaysUseAgent?: boolean;

  /** Time limit for agent evaluation, in milliseconds. */
  agentTimeout?: number;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Baseline evaluator settings; caller-supplied options are layered on top.
 */
const DEFAULT_CONFIG: Required<EvaluatorConfig> = {
  evaluatorAgentType: 'evaluator',
  verificationConfidenceThreshold: 0.8,
  alwaysUseAgent: false,
  agentTimeout: 60_000, // milliseconds
};
|
|
67
|
+
|
|
68
|
+
/**
 * SolutionEvaluator - evaluates trajectory quality using a hybrid approach.
 *
 * Evaluation proceeds through up to three tiers:
 * 1. A verifier registered for the task's domain, when one exists.
 * 2. An evaluation agent spawned through the AgentManager, when one was given.
 * 3. A heuristic computed from the trajectory itself, as the last resort.
 */
export class SolutionEvaluator {
  private agentManager: AgentManager | null;
  private config: Required<EvaluatorConfig>;
  private verifiers: Map<string, VerificationFunction> = new Map();

  /**
   * @param agentManager Manager used to spawn evaluation agents, or `null`
   *   to skip the agent tier entirely.
   * @param config Overrides merged on top of the default configuration.
   */
  constructor(
    agentManager: AgentManager | null,
    config: EvaluatorConfig = {}
  ) {
    this.agentManager = agentManager;
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Register a verification function for a task domain. Registering a second
   * verifier for the same domain replaces the first.
   */
  registerVerifier(domain: string, verifier: VerificationFunction): void {
    this.verifiers.set(domain, verifier);
  }

  /**
   * Evaluate a trajectory's quality.
   *
   * @param trajectory The solution attempt to judge.
   * @param task The task the trajectory was produced for.
   * @returns The verification result when it is confident enough and
   *   `alwaysUseAgent` is off; otherwise the agent's result; otherwise the
   *   heuristic result.
   */
  async evaluate(
    trajectory: Trajectory,
    task: Task
  ): Promise<EvaluationResult> {
    // 1. Try verification first (if available and applicable)
    const verification = await this.tryVerification(trajectory, task);
    if (
      verification &&
      !this.config.alwaysUseAgent &&
      verification.result.confidence >= this.config.verificationConfidenceThreshold
    ) {
      return verification.evalResult;
    }

    // 2. Try ACP agent evaluation
    if (this.agentManager) {
      try {
        // `await` must stay inside the try so a rejected promise is caught here.
        return await this.evaluateWithAgent(trajectory, task);
      } catch (error) {
        // Log a warning and fall through to the heuristic.
        const errorMsg = error instanceof Error ? error.message : String(error);
        console.warn(`Agent evaluation failed, falling back to heuristic: ${errorMsg}`);
      }
    }

    // 3. Fall back to heuristic evaluation
    return this.evaluateHeuristic(trajectory, task);
  }

  /**
   * Run the verifier registered for the task's domain, if any.
   *
   * @returns The raw verifier output plus its EvaluationResult form, or
   *   `null` when no verifier is registered or the verifier threw.
   */
  private async tryVerification(
    trajectory: Trajectory,
    task: Task
  ): Promise<{ result: VerificationResult; evalResult: EvaluationResult } | null> {
    const verifier = task.domain ? this.verifiers.get(task.domain) : undefined;
    if (!verifier) {
      return null;
    }

    try {
      const result = await verifier(trajectory, task);

      // Verifiers are user-supplied: keep the confidence inside its documented
      // 0-1 range so the derived score cannot become NaN or leave [0, 1].
      const confidence = Number.isNaN(result.confidence)
        ? 0
        : Math.min(1, Math.max(0, result.confidence));

      // A pass scores at least 0.7; a failure scores at most 0.5.
      const score = result.passed
        ? Math.max(0.7, confidence)
        : Math.min(0.5, 1 - confidence);

      const evalResult = createEvaluationResult({
        quality: scoreToQuality(score),
        score,
        acceptable: result.passed && confidence >= this.config.verificationConfidenceThreshold,
        issues: result.issues?.map((issue) => ({
          type: issue.type,
          description: issue.description,
          severity: issue.severity ?? 'major',
        })) ?? [],
        method: 'verification',
        rawResponse: result.details,
      });

      return { result, evalResult };
    } catch (error) {
      // Verification failed, return null to try other methods
      console.warn(
        'Verification failed:',
        error instanceof Error ? error.message : String(error)
      );
      return null;
    }
  }

  /**
   * Evaluate using an ACP agent.
   *
   * Wraps the evaluation prompt in a synthetic task, spawns the configured
   * evaluator agent for it and parses the session result.
   *
   * @throws Error when no AgentManager is available; errors from the spawn
   *   call propagate to the caller.
   */
  private async evaluateWithAgent(
    trajectory: Trajectory,
    task: Task
  ): Promise<EvaluationResult> {
    if (!this.agentManager) {
      throw new Error('AgentManager not available for agent evaluation');
    }

    // Create evaluation task for the agent
    const evaluationTask: Task = {
      id: `eval-${trajectory.id}`,
      description: this.buildEvaluationPrompt(trajectory, task),
      domain: 'evaluation',
      context: {},
      createdAt: new Date(),
      metadata: {
        originalTaskId: task.id,
        trajectoryId: trajectory.id,
      },
    };

    // Spawn evaluation agent
    const result = await this.agentManager.spawn({
      agentType: this.config.evaluatorAgentType,
      task: evaluationTask,
      timeout: this.config.agentTimeout,
      backendOptions: {
        // Evaluation agents should be concise
        maxTokens: 2000,
      },
    });

    // Parse agent response into EvaluationResult
    return this.parseAgentEvaluation(result.session.result, result.trajectory);
  }

  /**
   * Build the evaluation prompt for the agent: the original task description,
   * a numbered rendering of the trajectory steps, the outcome, and the JSON
   * response format the agent is asked to follow.
   */
  private buildEvaluationPrompt(trajectory: Trajectory, task: Task): string {
    const steps = trajectory.steps
      .map((step, i) => {
        let stepStr = `Step ${i + 1}:`;
        if (step.thought) stepStr += `\n Thought: ${step.thought}`;
        stepStr += `\n Action: ${step.action}`;
        if (step.observation) stepStr += `\n Observation: ${step.observation}`;
        return stepStr;
      })
      .join('\n\n');

    const outcomeStr = trajectory.outcome.success
      ? `SUCCESS: ${trajectory.outcome.solution ?? 'Task completed'}`
      : `FAILURE: ${trajectory.outcome.errorInfo ?? 'Unknown error'}`;

    return `Evaluate the quality of this solution attempt.

## Original Task
${task.description}

## Solution Steps
${steps}

## Outcome
${outcomeStr}

## Instructions
Analyze the solution and provide:
1. Overall quality assessment (excellent/good/needs_work/poor)
2. A numeric score from 0.0 to 1.0
3. Whether this solution is acceptable
4. List any specific issues found
5. Suggestions for improvement

Respond in JSON format:
{
"quality": "excellent|good|needs_work|poor",
"score": 0.85,
"acceptable": true,
"issues": [
{"type": "incomplete|incorrect|inefficient|off_topic|error", "description": "...", "severity": "critical|major|minor"}
],
"suggestions": ["..."]
}`;
  }

  /**
   * Parse agent evaluation response into EvaluationResult.
   *
   * The agent's output is untrusted text, so every field is validated before
   * use: a score that is not numeric falls back to 0.5, `acceptable` counts
   * only when it is literally `true`, and `issues` entries that are not
   * objects are skipped. When no JSON object can be extracted at all, a
   * conservative "could not parse" result is returned instead.
   *
   * @param result Raw session result produced by the evaluation agent.
   * @param _evalTrajectory Trajectory of the evaluation run itself (unused).
   */
  private parseAgentEvaluation(
    result: unknown,
    _evalTrajectory: Trajectory
  ): EvaluationResult {
    if (typeof result === 'string') {
      try {
        // Extract JSON from response (agent may include extra text)
        const jsonMatch = result.match(/\{[\s\S]*\}/);
        const payload = jsonMatch
          ? SolutionEvaluator.asRecord(JSON.parse(jsonMatch[0]))
          : null;

        if (payload) {
          const rawIssues: unknown[] = Array.isArray(payload.issues) ? payload.issues : [];
          const issues = rawIssues.flatMap((entry) => {
            const issue = SolutionEvaluator.asRecord(entry);
            if (!issue) return [];
            return [
              {
                type: this.parseIssueType(SolutionEvaluator.asString(issue.type)),
                description: SolutionEvaluator.asString(issue.description) ?? 'Unknown issue',
                severity: this.parseSeverity(SolutionEvaluator.asString(issue.severity)),
                suggestion: undefined,
              },
            ];
          });

          return createEvaluationResult({
            quality: this.parseQuality(SolutionEvaluator.asString(payload.quality)),
            score: SolutionEvaluator.parseScore(payload.score),
            acceptable: payload.acceptable === true,
            issues,
            method: 'agent',
            rawResponse: result,
          });
        }
      } catch {
        // Malformed JSON: fall through to the conservative result below.
      }
    }

    // If we can't parse, return a conservative result
    return createEvaluationResult({
      quality: 'needs_work',
      score: 0.5,
      acceptable: false,
      issues: [
        {
          type: 'error',
          description: 'Could not parse agent evaluation response',
          severity: 'major',
        },
      ],
      method: 'agent',
      rawResponse: typeof result === 'string' ? result : JSON.stringify(result),
    });
  }

  /**
   * Heuristic evaluation based on trajectory properties.
   *
   * Starts from a neutral 0.5 and adjusts for the outcome, an empty or very
   * long step list, observations mentioning "error", and the presence of
   * high-attribution steps; the score is clamped to [0, 1] at the end.
   */
  private evaluateHeuristic(
    trajectory: Trajectory,
    _task: Task
  ): EvaluationResult {
    const issues: Array<{
      type: 'incomplete' | 'incorrect' | 'inefficient' | 'off_topic' | 'error';
      description: string;
      severity: 'critical' | 'major' | 'minor';
    }> = [];

    let score = 0.5; // Start neutral

    // Check outcome
    if (trajectory.outcome.success) {
      score += 0.3;
    } else {
      score -= 0.2;
      issues.push({
        type: 'error',
        description: trajectory.outcome.errorInfo ?? 'Task did not complete successfully',
        severity: 'critical',
      });
    }

    // Check for steps taken
    const stepCount = trajectory.steps.length;
    if (stepCount === 0) {
      score -= 0.2;
      issues.push({
        type: 'incomplete',
        description: 'No steps were taken to solve the task',
        severity: 'critical',
      });
    } else if (stepCount > 20) {
      // Penalize very long trajectories (may indicate inefficiency)
      score -= 0.1;
      issues.push({
        type: 'inefficient',
        description: `Solution took ${stepCount} steps, which may indicate inefficiency`,
        severity: 'minor',
      });
    }

    // Check for errors in steps; the penalty is capped at three error steps.
    const errorSteps = trajectory.steps.filter(
      (step) => step.observation?.toLowerCase().includes('error')
    );
    if (errorSteps.length > 0) {
      score -= 0.1 * Math.min(3, errorSteps.length);
      issues.push({
        type: 'error',
        description: `${errorSteps.length} step(s) encountered errors`,
        severity: errorSteps.length > 2 ? 'major' : 'minor',
      });
    }

    // Check key steps (high attribution); only rewarded on a successful outcome.
    const hasKeyStep = trajectory.steps.some(
      (step) => (step.attributionScore ?? 0) >= 0.15
    );
    if (hasKeyStep && trajectory.outcome.success) {
      score += 0.1;
    }

    // Normalize score to [0, 1]
    score = Math.max(0, Math.min(1, score));

    return createEvaluationResult({
      quality: scoreToQuality(score),
      score,
      acceptable: trajectory.outcome.success && score >= 0.6,
      issues,
      method: 'heuristic',
    });
  }

  /**
   * Parse quality string to QualityLevel (case-insensitive). Accepts
   * "needs work" as an alias; anything unrecognised maps to 'needs_work'.
   */
  private parseQuality(
    quality?: string
  ): 'excellent' | 'good' | 'needs_work' | 'poor' {
    const normalized = quality?.toLowerCase();
    switch (normalized) {
      case 'excellent':
        return 'excellent';
      case 'good':
        return 'good';
      case 'needs_work':
      case 'needs work':
        return 'needs_work';
      case 'poor':
        return 'poor';
      default:
        return 'needs_work';
    }
  }

  /**
   * Parse issue type string (case-insensitive). Accepts "off-topic" as an
   * alias; anything unrecognised maps to 'error'.
   */
  private parseIssueType(
    type?: string
  ): 'incomplete' | 'incorrect' | 'inefficient' | 'off_topic' | 'error' {
    const normalized = type?.toLowerCase();
    switch (normalized) {
      case 'incomplete':
        return 'incomplete';
      case 'incorrect':
        return 'incorrect';
      case 'inefficient':
        return 'inefficient';
      case 'off_topic':
      case 'off-topic':
        return 'off_topic';
      case 'error':
        return 'error';
      default:
        return 'error';
    }
  }

  /**
   * Parse severity string (case-insensitive); anything unrecognised maps to
   * 'major'.
   */
  private parseSeverity(severity?: string): 'critical' | 'major' | 'minor' {
    const normalized = severity?.toLowerCase();
    switch (normalized) {
      case 'critical':
        return 'critical';
      case 'major':
        return 'major';
      case 'minor':
        return 'minor';
      default:
        return 'major';
    }
  }

  /** Narrow an unknown value to a plain (non-array, non-null) object. */
  private static asRecord(value: unknown): Record<string, unknown> | null {
    return typeof value === 'object' && value !== null && !Array.isArray(value)
      ? (value as Record<string, unknown>)
      : null;
  }

  /** Return the value when it is a string, otherwise `undefined`. */
  private static asString(value: unknown): string | undefined {
    return typeof value === 'string' ? value : undefined;
  }

  /**
   * Turn an untrusted score into a number in [0, 1]. Numbers and numeric
   * strings are clamped; anything else yields the neutral default 0.5.
   */
  private static parseScore(value: unknown): number {
    const numeric =
      typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
    if (typeof numeric !== 'number' || Number.isNaN(numeric)) {
      return 0.5;
    }
    return Math.min(1, Math.max(0, numeric));
  }
}
|
|
465
|
+
|
|
466
|
+
/**
 * Factory for {@link SolutionEvaluator}.
 *
 * @param agentManager Manager handed to the evaluator, or `null`.
 * @param config Optional evaluator settings, passed through unchanged.
 * @returns A newly constructed evaluator.
 */
export function createSolutionEvaluator(
  agentManager: AgentManager | null,
  config?: EvaluatorConfig
): SolutionEvaluator {
  const evaluator = new SolutionEvaluator(agentManager, config);
  return evaluator;
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
export {
|
|
2
|
+
TaskRouter,
|
|
3
|
+
createRouter,
|
|
4
|
+
type RoutingDecision,
|
|
5
|
+
} from './router.js';
|
|
6
|
+
|
|
7
|
+
export {
|
|
8
|
+
DirectSolver,
|
|
9
|
+
createSolver,
|
|
10
|
+
type SolverConfig,
|
|
11
|
+
type SolverResult,
|
|
12
|
+
} from './solver.js';
|
|
13
|
+
|
|
14
|
+
// Refinement types
|
|
15
|
+
export {
|
|
16
|
+
type EvaluationResult,
|
|
17
|
+
type EvaluationIssue,
|
|
18
|
+
type EvaluationMethod,
|
|
19
|
+
type QualityLevel,
|
|
20
|
+
type FailureContext,
|
|
21
|
+
type RefinementResult,
|
|
22
|
+
type IssueType,
|
|
23
|
+
type IssueSeverity,
|
|
24
|
+
createEvaluationResult,
|
|
25
|
+
scoreToQuality,
|
|
26
|
+
isQualityAcceptable,
|
|
27
|
+
EvaluationResultSchema,
|
|
28
|
+
EvaluationIssueSchema,
|
|
29
|
+
QualityLevelSchema,
|
|
30
|
+
IssueSeveritySchema,
|
|
31
|
+
IssueTypeSchema,
|
|
32
|
+
EvaluationMethodSchema,
|
|
33
|
+
} from './refinement-types.js';
|
|
34
|
+
|
|
35
|
+
// Solution evaluator
|
|
36
|
+
export {
|
|
37
|
+
SolutionEvaluator,
|
|
38
|
+
createSolutionEvaluator,
|
|
39
|
+
type EvaluatorConfig,
|
|
40
|
+
type VerificationFunction,
|
|
41
|
+
type VerificationResult,
|
|
42
|
+
} from './evaluator.js';
|
|
43
|
+
|
|
44
|
+
// Refinement loop
|
|
45
|
+
export {
|
|
46
|
+
RefinementLoop,
|
|
47
|
+
createRefinementLoop,
|
|
48
|
+
type RefinementLoopConfig,
|
|
49
|
+
} from './refinement-loop.js';
|
|
50
|
+
|
|
51
|
+
// Verification runner
|
|
52
|
+
export {
|
|
53
|
+
VerificationRunner,
|
|
54
|
+
createVerificationRunner,
|
|
55
|
+
TestRunners,
|
|
56
|
+
type CommandVerificationConfig,
|
|
57
|
+
type CommandResult,
|
|
58
|
+
type VerificationIssue,
|
|
59
|
+
} from './verification-runner.js';
|