cognitive-core 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +363 -2
- package/SKILL.md +193 -0
- package/dist/agents/index.d.ts +3 -0
- package/dist/agents/index.d.ts.map +1 -0
- package/dist/agents/index.js +5 -0
- package/dist/agents/index.js.map +1 -0
- package/dist/agents/mock-provider.d.ts +23 -0
- package/dist/agents/mock-provider.d.ts.map +1 -0
- package/dist/agents/mock-provider.js +71 -0
- package/dist/agents/mock-provider.js.map +1 -0
- package/dist/agents/types.d.ts +98 -0
- package/dist/agents/types.d.ts.map +1 -0
- package/dist/agents/types.js +44 -0
- package/dist/agents/types.js.map +1 -0
- package/dist/atlas.d.ts +196 -0
- package/dist/atlas.d.ts.map +1 -0
- package/dist/atlas.js +373 -0
- package/dist/atlas.js.map +1 -0
- package/dist/bin/cognitive-core.d.ts +18 -0
- package/dist/bin/cognitive-core.d.ts.map +1 -0
- package/dist/bin/cognitive-core.js +419 -0
- package/dist/bin/cognitive-core.js.map +1 -0
- package/dist/embeddings/bm25.d.ts +104 -0
- package/dist/embeddings/bm25.d.ts.map +1 -0
- package/dist/embeddings/bm25.js +264 -0
- package/dist/embeddings/bm25.js.map +1 -0
- package/dist/embeddings/index.d.ts +12 -0
- package/dist/embeddings/index.d.ts.map +1 -0
- package/dist/embeddings/index.js +16 -0
- package/dist/embeddings/index.js.map +1 -0
- package/dist/embeddings/manager.d.ts +112 -0
- package/dist/embeddings/manager.d.ts.map +1 -0
- package/dist/embeddings/manager.js +215 -0
- package/dist/embeddings/manager.js.map +1 -0
- package/dist/embeddings/provider.d.ts +101 -0
- package/dist/embeddings/provider.d.ts.map +1 -0
- package/dist/embeddings/provider.js +232 -0
- package/dist/embeddings/provider.js.map +1 -0
- package/dist/embeddings/vector-store.d.ts +101 -0
- package/dist/embeddings/vector-store.d.ts.map +1 -0
- package/dist/embeddings/vector-store.js +256 -0
- package/dist/embeddings/vector-store.js.map +1 -0
- package/dist/factory.d.ts +193 -0
- package/dist/factory.d.ts.map +1 -0
- package/dist/factory.js +109 -0
- package/dist/factory.js.map +1 -0
- package/dist/index.d.ts +43 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +84 -0
- package/dist/index.js.map +1 -0
- package/dist/learning/analyzer.d.ts +110 -0
- package/dist/learning/analyzer.d.ts.map +1 -0
- package/dist/learning/analyzer.js +213 -0
- package/dist/learning/analyzer.js.map +1 -0
- package/dist/learning/effectiveness.d.ts +158 -0
- package/dist/learning/effectiveness.d.ts.map +1 -0
- package/dist/learning/effectiveness.js +251 -0
- package/dist/learning/effectiveness.js.map +1 -0
- package/dist/learning/index.d.ts +8 -0
- package/dist/learning/index.d.ts.map +1 -0
- package/dist/learning/index.js +11 -0
- package/dist/learning/index.js.map +1 -0
- package/dist/learning/llm-extractor.d.ts +88 -0
- package/dist/learning/llm-extractor.d.ts.map +1 -0
- package/dist/learning/llm-extractor.js +372 -0
- package/dist/learning/llm-extractor.js.map +1 -0
- package/dist/learning/meta-learner.d.ts +80 -0
- package/dist/learning/meta-learner.d.ts.map +1 -0
- package/dist/learning/meta-learner.js +355 -0
- package/dist/learning/meta-learner.js.map +1 -0
- package/dist/learning/pipeline.d.ts +65 -0
- package/dist/learning/pipeline.d.ts.map +1 -0
- package/dist/learning/pipeline.js +170 -0
- package/dist/learning/pipeline.js.map +1 -0
- package/dist/learning/playbook-extractor.d.ts +113 -0
- package/dist/learning/playbook-extractor.d.ts.map +1 -0
- package/dist/learning/playbook-extractor.js +523 -0
- package/dist/learning/playbook-extractor.js.map +1 -0
- package/dist/learning/usage-inference.d.ts +82 -0
- package/dist/learning/usage-inference.d.ts.map +1 -0
- package/dist/learning/usage-inference.js +261 -0
- package/dist/learning/usage-inference.js.map +1 -0
- package/dist/mcp/index.d.ts +6 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +6 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/playbook-server.d.ts +120 -0
- package/dist/mcp/playbook-server.d.ts.map +1 -0
- package/dist/mcp/playbook-server.js +427 -0
- package/dist/mcp/playbook-server.js.map +1 -0
- package/dist/memory/curated-loader.d.ts +62 -0
- package/dist/memory/curated-loader.d.ts.map +1 -0
- package/dist/memory/curated-loader.js +106 -0
- package/dist/memory/curated-loader.js.map +1 -0
- package/dist/memory/experience.d.ts +122 -0
- package/dist/memory/experience.d.ts.map +1 -0
- package/dist/memory/experience.js +392 -0
- package/dist/memory/experience.js.map +1 -0
- package/dist/memory/index.d.ts +6 -0
- package/dist/memory/index.d.ts.map +1 -0
- package/dist/memory/index.js +9 -0
- package/dist/memory/index.js.map +1 -0
- package/dist/memory/meta.d.ts +90 -0
- package/dist/memory/meta.d.ts.map +1 -0
- package/dist/memory/meta.js +362 -0
- package/dist/memory/meta.js.map +1 -0
- package/dist/memory/playbook.d.ts +133 -0
- package/dist/memory/playbook.d.ts.map +1 -0
- package/dist/memory/playbook.js +357 -0
- package/dist/memory/playbook.js.map +1 -0
- package/dist/memory/system.d.ts +167 -0
- package/dist/memory/system.d.ts.map +1 -0
- package/dist/memory/system.js +383 -0
- package/dist/memory/system.js.map +1 -0
- package/dist/runtime/backends/acp.d.ts +67 -0
- package/dist/runtime/backends/acp.d.ts.map +1 -0
- package/dist/runtime/backends/acp.js +290 -0
- package/dist/runtime/backends/acp.js.map +1 -0
- package/dist/runtime/backends/index.d.ts +5 -0
- package/dist/runtime/backends/index.d.ts.map +1 -0
- package/dist/runtime/backends/index.js +6 -0
- package/dist/runtime/backends/index.js.map +1 -0
- package/dist/runtime/backends/mock.d.ts +67 -0
- package/dist/runtime/backends/mock.d.ts.map +1 -0
- package/dist/runtime/backends/mock.js +153 -0
- package/dist/runtime/backends/mock.js.map +1 -0
- package/dist/runtime/backends/subprocess.d.ts +56 -0
- package/dist/runtime/backends/subprocess.d.ts.map +1 -0
- package/dist/runtime/backends/subprocess.js +260 -0
- package/dist/runtime/backends/subprocess.js.map +1 -0
- package/dist/runtime/flows/learning.d.ts +73 -0
- package/dist/runtime/flows/learning.d.ts.map +1 -0
- package/dist/runtime/flows/learning.js +116 -0
- package/dist/runtime/flows/learning.js.map +1 -0
- package/dist/runtime/flows/validation.d.ts +122 -0
- package/dist/runtime/flows/validation.d.ts.map +1 -0
- package/dist/runtime/flows/validation.js +223 -0
- package/dist/runtime/flows/validation.js.map +1 -0
- package/dist/runtime/index.d.ts +6 -0
- package/dist/runtime/index.d.ts.map +1 -0
- package/dist/runtime/index.js +8 -0
- package/dist/runtime/index.js.map +1 -0
- package/dist/runtime/manager.d.ts +116 -0
- package/dist/runtime/manager.d.ts.map +1 -0
- package/dist/runtime/manager.js +416 -0
- package/dist/runtime/manager.js.map +1 -0
- package/dist/runtime/types.d.ts +138 -0
- package/dist/runtime/types.d.ts.map +1 -0
- package/dist/runtime/types.js +2 -0
- package/dist/runtime/types.js.map +1 -0
- package/dist/search/evaluator.d.ts +102 -0
- package/dist/search/evaluator.d.ts.map +1 -0
- package/dist/search/evaluator.js +352 -0
- package/dist/search/evaluator.js.map +1 -0
- package/dist/search/index.d.ts +7 -0
- package/dist/search/index.d.ts.map +1 -0
- package/dist/search/index.js +11 -0
- package/dist/search/index.js.map +1 -0
- package/dist/search/refinement-loop.d.ts +73 -0
- package/dist/search/refinement-loop.d.ts.map +1 -0
- package/dist/search/refinement-loop.js +245 -0
- package/dist/search/refinement-loop.js.map +1 -0
- package/dist/search/refinement-types.d.ts +154 -0
- package/dist/search/refinement-types.d.ts.map +1 -0
- package/dist/search/refinement-types.js +99 -0
- package/dist/search/refinement-types.js.map +1 -0
- package/dist/search/router.d.ts +61 -0
- package/dist/search/router.d.ts.map +1 -0
- package/dist/search/router.js +197 -0
- package/dist/search/router.js.map +1 -0
- package/dist/search/solver.d.ts +75 -0
- package/dist/search/solver.d.ts.map +1 -0
- package/dist/search/solver.js +216 -0
- package/dist/search/solver.js.map +1 -0
- package/dist/search/verification-runner.d.ts +125 -0
- package/dist/search/verification-runner.d.ts.map +1 -0
- package/dist/search/verification-runner.js +440 -0
- package/dist/search/verification-runner.js.map +1 -0
- package/dist/surfacing/index.d.ts +2 -0
- package/dist/surfacing/index.d.ts.map +1 -0
- package/dist/surfacing/index.js +2 -0
- package/dist/surfacing/index.js.map +1 -0
- package/dist/surfacing/skill-library.d.ts +158 -0
- package/dist/surfacing/skill-library.d.ts.map +1 -0
- package/dist/surfacing/skill-library.js +429 -0
- package/dist/surfacing/skill-library.js.map +1 -0
- package/dist/types/config.d.ts +1113 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/config.js +274 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/index.d.ts +9 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +14 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/memory.d.ts +339 -0
- package/dist/types/memory.d.ts.map +1 -0
- package/dist/types/memory.js +207 -0
- package/dist/types/memory.js.map +1 -0
- package/dist/types/meta.d.ts +146 -0
- package/dist/types/meta.d.ts.map +1 -0
- package/dist/types/meta.js +51 -0
- package/dist/types/meta.js.map +1 -0
- package/dist/types/outcome.d.ts +42 -0
- package/dist/types/outcome.d.ts.map +1 -0
- package/dist/types/outcome.js +50 -0
- package/dist/types/outcome.js.map +1 -0
- package/dist/types/playbook.d.ts +119 -0
- package/dist/types/playbook.d.ts.map +1 -0
- package/dist/types/playbook.js +71 -0
- package/dist/types/playbook.js.map +1 -0
- package/dist/types/step.d.ts +44 -0
- package/dist/types/step.d.ts.map +1 -0
- package/dist/types/step.js +32 -0
- package/dist/types/step.js.map +1 -0
- package/dist/types/task.d.ts +91 -0
- package/dist/types/task.d.ts.map +1 -0
- package/dist/types/task.js +39 -0
- package/dist/types/task.js.map +1 -0
- package/dist/types/trajectory.d.ts +221 -0
- package/dist/types/trajectory.d.ts.map +1 -0
- package/dist/types/trajectory.js +60 -0
- package/dist/types/trajectory.js.map +1 -0
- package/dist/utils/index.d.ts +4 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +4 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/similarity.d.ts +31 -0
- package/dist/utils/similarity.d.ts.map +1 -0
- package/dist/utils/similarity.js +107 -0
- package/dist/utils/similarity.js.map +1 -0
- package/dist/utils/storage.d.ts +106 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +203 -0
- package/dist/utils/storage.js.map +1 -0
- package/dist/utils/validation.d.ts +129 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +171 -0
- package/dist/utils/validation.js.map +1 -0
- package/package.json +61 -9
- package/scripts/migrate-to-playbooks.ts +307 -0
- package/src/agents/index.ts +14 -0
- package/src/agents/mock-provider.ts +93 -0
- package/src/agents/types.ts +137 -0
- package/src/atlas.ts +560 -0
- package/src/bin/cognitive-core.ts +470 -0
- package/src/embeddings/bm25.ts +337 -0
- package/src/embeddings/index.ts +39 -0
- package/src/embeddings/manager.ts +288 -0
- package/src/embeddings/provider.ts +311 -0
- package/src/embeddings/vector-store.ts +353 -0
- package/src/factory.ts +263 -0
- package/src/index.ts +246 -0
- package/src/learning/analyzer.ts +335 -0
- package/src/learning/effectiveness.ts +428 -0
- package/src/learning/index.ts +58 -0
- package/src/learning/llm-extractor.ts +542 -0
- package/src/learning/meta-learner.ts +516 -0
- package/src/learning/pipeline.ts +244 -0
- package/src/learning/playbook-extractor.ts +702 -0
- package/src/learning/usage-inference.ts +372 -0
- package/src/mcp/index.ts +12 -0
- package/src/mcp/playbook-server.ts +565 -0
- package/src/memory/curated-loader.ts +160 -0
- package/src/memory/experience.ts +515 -0
- package/src/memory/index.ts +27 -0
- package/src/memory/meta.ts +506 -0
- package/src/memory/playbook.ts +493 -0
- package/src/memory/system.ts +551 -0
- package/src/runtime/backends/acp.ts +378 -0
- package/src/runtime/backends/index.ts +24 -0
- package/src/runtime/backends/mock.ts +218 -0
- package/src/runtime/backends/subprocess.ts +356 -0
- package/src/runtime/flows/learning.ts +183 -0
- package/src/runtime/flows/validation.ts +381 -0
- package/src/runtime/index.ts +53 -0
- package/src/runtime/manager.ts +541 -0
- package/src/runtime/types.ts +157 -0
- package/src/search/evaluator.ts +474 -0
- package/src/search/index.ts +59 -0
- package/src/search/refinement-loop.ts +363 -0
- package/src/search/refinement-types.ts +159 -0
- package/src/search/router.ts +261 -0
- package/src/search/solver.ts +303 -0
- package/src/search/verification-runner.ts +570 -0
- package/src/surfacing/index.ts +6 -0
- package/src/surfacing/skill-library.ts +594 -0
- package/src/types/config.ts +333 -0
- package/src/types/index.ts +130 -0
- package/src/types/memory.ts +270 -0
- package/src/types/meta.ts +218 -0
- package/src/types/outcome.ts +66 -0
- package/src/types/playbook.ts +196 -0
- package/src/types/step.ts +40 -0
- package/src/types/task.ts +52 -0
- package/src/types/trajectory.ts +80 -0
- package/src/utils/index.ts +38 -0
- package/src/utils/similarity.ts +139 -0
- package/src/utils/storage.ts +249 -0
- package/src/utils/validation.ts +286 -0
- package/tests/embeddings/bm25.test.ts +130 -0
- package/tests/embeddings/manager.test.ts +205 -0
- package/tests/integration/atlas.test.ts +266 -0
- package/tests/integration/e2e.test.ts +929 -0
- package/tests/learning/analyzer.test.ts +426 -0
- package/tests/learning/effectiveness.test.ts +542 -0
- package/tests/learning/pipeline.test.ts +176 -0
- package/tests/learning/playbook-extractor-provenance.test.ts +114 -0
- package/tests/learning/usage-inference.test.ts +254 -0
- package/tests/mcp/playbook-server.test.ts +252 -0
- package/tests/memory/experience.test.ts +198 -0
- package/tests/memory/playbook.test.ts +338 -0
- package/tests/memory/provenance.test.ts +639 -0
- package/tests/memory/system.test.ts +325 -0
- package/tests/runtime/agent-manager.test.ts +512 -0
- package/tests/runtime/mock-backend.test.ts +248 -0
- package/tests/search/refinement-loop.test.ts +468 -0
- package/tests/search/refinement.test.ts +267 -0
- package/tests/search/router.test.ts +427 -0
- package/tests/surfacing/skill-library.test.ts +292 -0
- package/tests/types/outcome.test.ts +147 -0
- package/tests/types/step.test.ts +133 -0
- package/tests/types/task.test.ts +158 -0
- package/tests/types/trajectory.test.ts +253 -0
- package/tests/utils/similarity.test.ts +188 -0
- package/tests/utils/validation.test.ts +252 -0
- package/tsconfig.json +25 -0
- package/vitest.config.ts +22 -0
- package/index.d.ts +0 -4
- package/index.js +0 -4
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
import type { Task } from '../../types/index.js';
|
|
2
|
+
import type { MemorySystem } from '../../memory/system.js';
|
|
3
|
+
import type { AgentManager } from '../manager.js';
|
|
4
|
+
import type { AgentResult } from '../types.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Options accepted by {@link ValidationFlow}.
 *
 * Only `agentType` is required; the flow's constructor supplies defaults
 * for the remaining fields.
 */
export interface ValidationFlowConfig {
  /** Agent type forwarded to the manager for every spawned run. */
  agentType: string;

  /**
   * How many times each task is executed per variant (with knowledge and,
   * when enabled, baseline). Defaults to 1.
   */
  runsPerTask?: number;

  /**
   * When true, each task is also executed through the manager's baseline
   * spawn so the two variants can be compared. Defaults to true.
   */
  runComparison?: boolean;

  /** Per-run timeout in milliseconds. Defaults to 300000 (5 minutes). */
  taskTimeout?: number;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Summary statistics computed over a set of agent runs.
 *
 * Every field is 0 when the set of runs is empty.
 */
export interface ValidationMetrics {
  /** Fraction of runs that reported success, in the range 0-1. */
  successRate: number;

  /** Mean of each run's total time, in milliseconds. */
  avgTime: number;

  /** Mean number of tool calls per run. */
  avgToolCalls: number;

  /** Standard deviation of run time in milliseconds (population form: divided by the run count). */
  timeStdDev: number;

  /** Number of runs the statistics were computed from. */
  totalRuns: number;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Side-by-side metrics for knowledge-injected runs versus baseline runs,
 * plus the derived improvement figures.
 */
export interface ComparisonResult {
  /** Metrics from the runs executed with knowledge injection. */
  withKnowledge: ValidationMetrics;

  /** Metrics from the baseline runs (no knowledge injection). */
  baseline: ValidationMetrics;

  /** Derived deltas; positive values favour the knowledge-injected runs. */
  improvement: {
    /** `withKnowledge.successRate - baseline.successRate` (a fraction, not a percentage). */
    successRateDelta: number;

    /** Percentage of baseline average time saved; 0 when the baseline average time is 0. */
    timeDeltaPercent: number;

    /** Percentage of baseline average tool calls saved; 0 when the baseline average is 0. */
    toolCallDeltaPercent: number;

    /**
     * Rough heuristic flag based on the average-time difference relative to
     * the run-time standard deviations. It is not a formal p-value test.
     */
    isSignificant: boolean;
  };
}
|
|
53
|
+
|
|
54
|
+
/**
 * Outcome of running the validation flow over a list of tasks.
 */
export interface ValidationFlowResult {
  /** One entry per validated task: the task, its knowledge-injected runs, and their metrics. */
  taskResults: {
    task: Task;
    results: AgentResult[];
    metrics: ValidationMetrics;
  }[];

  /** Metrics computed over the knowledge-injected runs of all tasks together. */
  aggregateMetrics: ValidationMetrics;

  /** Aggregate comparison against baseline; present only when comparison runs produced results. */
  comparison?: ComparisonResult;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Validation Flow
 * Orchestrates: Knowledge injection → Agent execution → Performance measurement
 *
 * This flow is for the "injection" direction:
 * - Test how well learning is working
 * - Compare performance with vs without knowledge
 * - Measure improvement over time
 *
 * Knowledge-injected runs go through `manager.spawn`, baseline runs through
 * `manager.spawnBaseline`; the flow itself only schedules runs and aggregates
 * the metrics reported on each result.
 */
export class ValidationFlow {
  private readonly manager: AgentManager;
  private readonly config: ValidationFlowConfig;

  /**
   * @param manager - Agent manager used to spawn knowledge-injected and baseline runs.
   * @param _memory - Currently unused; reserved for future memory state tracking.
   * @param config - Flow options. Omitted or `undefined` optional fields fall back to
   *   1 run per task, comparison enabled, and a 5 minute timeout.
   */
  constructor(
    manager: AgentManager,
    _memory: MemorySystem, // Reserved for future memory state tracking
    config: ValidationFlowConfig
  ) {
    this.manager = manager;
    // Resolve defaults with `??` rather than relying on spread order: a caller
    // passing an explicit `undefined` must still receive the default.
    this.config = {
      ...config,
      runsPerTask: config.runsPerTask ?? 1,
      runComparison: config.runComparison ?? true,
      taskTimeout: config.taskTimeout ?? 300000, // 5 minutes
    };
  }

  /**
   * Validate on a single task.
   *
   * Executes the task `runsPerTask` times with knowledge injection and, when
   * comparison is enabled, the same number of times as baseline.
   *
   * @param task - Task to execute.
   * @returns The task, its knowledge-injected results and metrics, and a
   *   comparison against baseline when baseline runs were executed.
   */
  async validateTask(task: Task): Promise<{
    task: Task;
    results: AgentResult[];
    metrics: ValidationMetrics;
    comparison?: ComparisonResult;
  }> {
    const { results, baselineResults } = await this.runTask(task);
    const metrics = this.calculateMetrics(results);

    // Baseline results only exist when comparison is enabled.
    const comparison =
      baselineResults.length > 0
        ? this.calculateComparison(metrics, this.calculateMetrics(baselineResults))
        : undefined;

    return { task, results, metrics, comparison };
  }

  /**
   * Validate on multiple tasks.
   *
   * Tasks are processed sequentially. Each task's baseline runs are executed
   * once and reused for the aggregate comparison.
   *
   * @param tasks - Tasks to execute.
   * @returns Per-task results, aggregate metrics over all knowledge-injected
   *   runs, and an aggregate comparison when baseline runs were executed.
   */
  async validate(tasks: Task[]): Promise<ValidationFlowResult> {
    const taskResults: ValidationFlowResult['taskResults'] = [];
    const allResults: AgentResult[] = [];
    const allBaselineResults: AgentResult[] = [];

    for (const task of tasks) {
      const { results, baselineResults } = await this.runTask(task);

      taskResults.push({
        task,
        results,
        metrics: this.calculateMetrics(results),
      });

      allResults.push(...results);
      allBaselineResults.push(...baselineResults);
    }

    const aggregateMetrics = this.calculateMetrics(allResults);
    const comparison =
      allBaselineResults.length > 0
        ? this.calculateComparison(
            aggregateMetrics,
            this.calculateMetrics(allBaselineResults)
          )
        : undefined;

    return {
      taskResults,
      aggregateMetrics,
      comparison,
    };
  }

  /**
   * Quick validation - single run per task, with comparison.
   *
   * For each task the knowledge-injected and baseline runs are started
   * together and awaited as a pair; `runsPerTask` and `runComparison` are
   * not consulted.
   *
   * @param tasks - Tasks to execute.
   * @returns Success rates for both variants (0 when `tasks` is empty), their
   *   difference, and the per-task success flags.
   */
  async quickValidate(tasks: Task[]): Promise<{
    successRateWithKnowledge: number;
    successRateBaseline: number;
    improvement: number;
    details: Array<{
      task: Task;
      withKnowledge: boolean;
      baseline: boolean;
    }>;
  }> {
    const details: Array<{
      task: Task;
      withKnowledge: boolean;
      baseline: boolean;
    }> = [];

    for (const task of tasks) {
      const [withKnowledge, baseline] = await Promise.all([
        this.manager.spawn({
          agentType: this.config.agentType,
          task,
          timeout: this.config.taskTimeout,
        }),
        this.manager.spawnBaseline({
          agentType: this.config.agentType,
          task,
          timeout: this.config.taskTimeout,
        }),
      ]);

      details.push({
        task,
        withKnowledge: withKnowledge.success,
        baseline: baseline.success,
      });
    }

    // Guard the division so an empty task list yields 0 instead of NaN.
    const rate = (successes: number): number =>
      details.length > 0 ? successes / details.length : 0;

    const successRateWithKnowledge = rate(
      details.filter((d) => d.withKnowledge).length
    );
    const successRateBaseline = rate(details.filter((d) => d.baseline).length);

    return {
      successRateWithKnowledge,
      successRateBaseline,
      improvement: successRateWithKnowledge - successRateBaseline,
      details,
    };
  }

  /**
   * Execute one task `runsPerTask` times with knowledge injection and, when
   * comparison is enabled, the same number of times as baseline. Runs are
   * sequential: all knowledge-injected runs first, then all baseline runs.
   */
  private async runTask(
    task: Task
  ): Promise<{ results: AgentResult[]; baselineResults: AgentResult[] }> {
    const runsPerTask = this.config.runsPerTask ?? 1;
    const results: AgentResult[] = [];
    const baselineResults: AgentResult[] = [];

    // Run with knowledge injection
    for (let i = 0; i < runsPerTask; i++) {
      results.push(
        await this.manager.spawn({
          agentType: this.config.agentType,
          task,
          timeout: this.config.taskTimeout,
          captureToolCalls: true,
        })
      );
    }

    // Run baseline if comparison enabled
    if (this.config.runComparison) {
      for (let i = 0; i < runsPerTask; i++) {
        baselineResults.push(
          await this.manager.spawnBaseline({
            agentType: this.config.agentType,
            task,
            timeout: this.config.taskTimeout,
            captureToolCalls: true,
          })
        );
      }
    }

    return { results, baselineResults };
  }

  /**
   * Calculate metrics from results.
   *
   * @param results - Agent results to summarise.
   * @returns Success rate, mean time, mean tool calls, population standard
   *   deviation of time, and run count; all zeros for an empty input.
   */
  private calculateMetrics(results: AgentResult[]): ValidationMetrics {
    if (results.length === 0) {
      return {
        successRate: 0,
        avgTime: 0,
        avgToolCalls: 0,
        timeStdDev: 0,
        totalRuns: 0,
      };
    }

    const successCount = results.filter((r) => r.success).length;
    const times = results.map((r) => r.metrics.totalTime);
    const toolCalls = results.map((r) => r.metrics.toolCallCount);

    const avgTime = times.reduce((a, b) => a + b, 0) / times.length;
    const avgToolCalls =
      toolCalls.reduce((a, b) => a + b, 0) / toolCalls.length;

    // Population variance: divide by the number of runs.
    const timeVariance =
      times.reduce((sum, t) => sum + Math.pow(t - avgTime, 2), 0) /
      times.length;
    const timeStdDev = Math.sqrt(timeVariance);

    return {
      successRate: successCount / results.length,
      avgTime,
      avgToolCalls,
      timeStdDev,
      totalRuns: results.length,
    };
  }

  /**
   * Calculate comparison between two sets of metrics.
   *
   * Percent deltas are relative to the baseline and positive when the
   * knowledge-injected runs used less time / fewer tool calls; they are 0
   * when the corresponding baseline average is 0.
   *
   * @param withKnowledge - Metrics of the knowledge-injected runs.
   * @param baseline - Metrics of the baseline runs.
   */
  private calculateComparison(
    withKnowledge: ValidationMetrics,
    baseline: ValidationMetrics
  ): ComparisonResult {
    const successRateDelta =
      withKnowledge.successRate - baseline.successRate;

    const timeDeltaPercent =
      baseline.avgTime > 0
        ? ((baseline.avgTime - withKnowledge.avgTime) / baseline.avgTime) * 100
        : 0;

    const toolCallDeltaPercent =
      baseline.avgToolCalls > 0
        ? ((baseline.avgToolCalls - withKnowledge.avgToolCalls) /
            baseline.avgToolCalls) *
          100
        : 0;

    // Simple significance test (would need proper statistical test in production)
    // Using a rough heuristic: significant if improvement > 2 std devs.
    // A single run has a standard deviation of 0, which would make any
    // difference look significant, so require at least two runs per side.
    const hasSpread = withKnowledge.totalRuns >= 2 && baseline.totalRuns >= 2;
    const isSignificant =
      hasSpread &&
      Math.abs(withKnowledge.avgTime - baseline.avgTime) >
        2 * Math.max(withKnowledge.timeStdDev, baseline.timeStdDev);

    return {
      withKnowledge,
      baseline,
      improvement: {
        successRateDelta,
        timeDeltaPercent,
        toolCallDeltaPercent,
        isSignificant,
      },
    };
  }

  /**
   * Generate a validation report.
   *
   * @param result - Output of {@link ValidationFlow.validate}.
   * @returns A Markdown document with aggregate metrics, the comparison table
   *   (when a comparison is present), and one section per task. Task headings
   *   show at most 50 characters of the description, followed by `...` only
   *   when the description was actually cut.
   */
  generateReport(result: ValidationFlowResult): string {
    const sign = (value: number): string => (value > 0 ? '+' : '');

    const lines: string[] = [
      '# Atlas Validation Report',
      '',
      '## Aggregate Metrics',
      `- Success Rate: ${(result.aggregateMetrics.successRate * 100).toFixed(1)}%`,
      `- Avg Time: ${result.aggregateMetrics.avgTime.toFixed(0)}ms`,
      `- Avg Tool Calls: ${result.aggregateMetrics.avgToolCalls.toFixed(1)}`,
      `- Total Runs: ${result.aggregateMetrics.totalRuns}`,
      '',
    ];

    if (result.comparison) {
      const c = result.comparison;
      lines.push(
        '## Comparison (With Knowledge vs Baseline)',
        '',
        '| Metric | With Knowledge | Baseline | Improvement |',
        '|--------|---------------|----------|-------------|',
        `| Success Rate | ${(c.withKnowledge.successRate * 100).toFixed(1)}% | ${(c.baseline.successRate * 100).toFixed(1)}% | ${sign(c.improvement.successRateDelta)}${(c.improvement.successRateDelta * 100).toFixed(1)}% |`,
        `| Avg Time | ${c.withKnowledge.avgTime.toFixed(0)}ms | ${c.baseline.avgTime.toFixed(0)}ms | ${sign(c.improvement.timeDeltaPercent)}${c.improvement.timeDeltaPercent.toFixed(1)}% faster |`,
        `| Avg Tool Calls | ${c.withKnowledge.avgToolCalls.toFixed(1)} | ${c.baseline.avgToolCalls.toFixed(1)} | ${sign(c.improvement.toolCallDeltaPercent)}${c.improvement.toolCallDeltaPercent.toFixed(1)}% fewer |`,
        '',
        `Statistical Significance: ${c.improvement.isSignificant ? '✓ Yes' : '✗ No'}`,
        ''
      );
    }

    lines.push('## Per-Task Results', '');

    for (const taskResult of result.taskResults) {
      const description = taskResult.task.description;
      // Only mark the heading as truncated when characters were dropped.
      const heading =
        description.length > 50
          ? `${description.slice(0, 50)}...`
          : description;

      lines.push(
        `### ${heading}`,
        `- Success Rate: ${(taskResult.metrics.successRate * 100).toFixed(1)}%`,
        `- Avg Time: ${taskResult.metrics.avgTime.toFixed(0)}ms`,
        ''
      );
    }

    return lines.join('\n');
  }
}
|
|
371
|
+
|
|
372
|
+
/**
 * Factory for {@link ValidationFlow}.
 *
 * @param manager - Agent manager the flow will spawn runs through.
 * @param memory - Memory system handed to the flow's constructor.
 * @param config - Flow options.
 * @returns A newly constructed validation flow.
 */
export function createValidationFlow(
  manager: AgentManager,
  memory: MemorySystem,
  config: ValidationFlowConfig
): ValidationFlow {
  const flow = new ValidationFlow(manager, memory, config);
  return flow;
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
// Types
|
|
2
|
+
export type {
|
|
3
|
+
AgentMessage,
|
|
4
|
+
ToolCall,
|
|
5
|
+
AgentState,
|
|
6
|
+
AgentSession,
|
|
7
|
+
AgentSpawnConfig,
|
|
8
|
+
AgentResult,
|
|
9
|
+
AgentBackend,
|
|
10
|
+
KnowledgeInjector,
|
|
11
|
+
TrajectoryExtractor,
|
|
12
|
+
AgentObserverCallbacks,
|
|
13
|
+
} from './types.js';
|
|
14
|
+
|
|
15
|
+
// Manager
|
|
16
|
+
export {
|
|
17
|
+
AgentManager,
|
|
18
|
+
createAgentManager,
|
|
19
|
+
DefaultKnowledgeInjector,
|
|
20
|
+
DefaultTrajectoryExtractor,
|
|
21
|
+
} from './manager.js';
|
|
22
|
+
|
|
23
|
+
// Backends
|
|
24
|
+
export {
|
|
25
|
+
SubprocessBackend,
|
|
26
|
+
createSubprocessBackend,
|
|
27
|
+
claudeCodeConfig,
|
|
28
|
+
type SubprocessAgentConfig,
|
|
29
|
+
MockBackend,
|
|
30
|
+
createMockBackend,
|
|
31
|
+
type MockAgentBehavior,
|
|
32
|
+
ACPBackend,
|
|
33
|
+
createACPBackend,
|
|
34
|
+
claudeCodeACPConfig,
|
|
35
|
+
claudeCodeDirectConfig,
|
|
36
|
+
type ACPAgentConfig,
|
|
37
|
+
} from './backends/index.js';
|
|
38
|
+
|
|
39
|
+
// Flows
|
|
40
|
+
export {
|
|
41
|
+
LearningFlow,
|
|
42
|
+
createLearningFlow,
|
|
43
|
+
type LearningFlowConfig,
|
|
44
|
+
type LearningFlowResult,
|
|
45
|
+
} from './flows/learning.js';
|
|
46
|
+
|
|
47
|
+
export {
|
|
48
|
+
ValidationFlow,
|
|
49
|
+
createValidationFlow,
|
|
50
|
+
type ValidationFlowConfig,
|
|
51
|
+
type ValidationFlowResult,
|
|
52
|
+
type ValidationMetrics,
|
|
53
|
+
} from './flows/validation.js';
|