cognitive-core 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. package/README.md +363 -2
  2. package/SKILL.md +193 -0
  3. package/dist/agents/index.d.ts +3 -0
  4. package/dist/agents/index.d.ts.map +1 -0
  5. package/dist/agents/index.js +5 -0
  6. package/dist/agents/index.js.map +1 -0
  7. package/dist/agents/mock-provider.d.ts +23 -0
  8. package/dist/agents/mock-provider.d.ts.map +1 -0
  9. package/dist/agents/mock-provider.js +71 -0
  10. package/dist/agents/mock-provider.js.map +1 -0
  11. package/dist/agents/types.d.ts +98 -0
  12. package/dist/agents/types.d.ts.map +1 -0
  13. package/dist/agents/types.js +44 -0
  14. package/dist/agents/types.js.map +1 -0
  15. package/dist/atlas.d.ts +196 -0
  16. package/dist/atlas.d.ts.map +1 -0
  17. package/dist/atlas.js +373 -0
  18. package/dist/atlas.js.map +1 -0
  19. package/dist/bin/cognitive-core.d.ts +18 -0
  20. package/dist/bin/cognitive-core.d.ts.map +1 -0
  21. package/dist/bin/cognitive-core.js +419 -0
  22. package/dist/bin/cognitive-core.js.map +1 -0
  23. package/dist/embeddings/bm25.d.ts +104 -0
  24. package/dist/embeddings/bm25.d.ts.map +1 -0
  25. package/dist/embeddings/bm25.js +264 -0
  26. package/dist/embeddings/bm25.js.map +1 -0
  27. package/dist/embeddings/index.d.ts +12 -0
  28. package/dist/embeddings/index.d.ts.map +1 -0
  29. package/dist/embeddings/index.js +16 -0
  30. package/dist/embeddings/index.js.map +1 -0
  31. package/dist/embeddings/manager.d.ts +112 -0
  32. package/dist/embeddings/manager.d.ts.map +1 -0
  33. package/dist/embeddings/manager.js +215 -0
  34. package/dist/embeddings/manager.js.map +1 -0
  35. package/dist/embeddings/provider.d.ts +101 -0
  36. package/dist/embeddings/provider.d.ts.map +1 -0
  37. package/dist/embeddings/provider.js +232 -0
  38. package/dist/embeddings/provider.js.map +1 -0
  39. package/dist/embeddings/vector-store.d.ts +101 -0
  40. package/dist/embeddings/vector-store.d.ts.map +1 -0
  41. package/dist/embeddings/vector-store.js +256 -0
  42. package/dist/embeddings/vector-store.js.map +1 -0
  43. package/dist/factory.d.ts +193 -0
  44. package/dist/factory.d.ts.map +1 -0
  45. package/dist/factory.js +109 -0
  46. package/dist/factory.js.map +1 -0
  47. package/dist/index.d.ts +43 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +84 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/learning/analyzer.d.ts +110 -0
  52. package/dist/learning/analyzer.d.ts.map +1 -0
  53. package/dist/learning/analyzer.js +213 -0
  54. package/dist/learning/analyzer.js.map +1 -0
  55. package/dist/learning/effectiveness.d.ts +158 -0
  56. package/dist/learning/effectiveness.d.ts.map +1 -0
  57. package/dist/learning/effectiveness.js +251 -0
  58. package/dist/learning/effectiveness.js.map +1 -0
  59. package/dist/learning/index.d.ts +8 -0
  60. package/dist/learning/index.d.ts.map +1 -0
  61. package/dist/learning/index.js +11 -0
  62. package/dist/learning/index.js.map +1 -0
  63. package/dist/learning/llm-extractor.d.ts +88 -0
  64. package/dist/learning/llm-extractor.d.ts.map +1 -0
  65. package/dist/learning/llm-extractor.js +372 -0
  66. package/dist/learning/llm-extractor.js.map +1 -0
  67. package/dist/learning/meta-learner.d.ts +80 -0
  68. package/dist/learning/meta-learner.d.ts.map +1 -0
  69. package/dist/learning/meta-learner.js +355 -0
  70. package/dist/learning/meta-learner.js.map +1 -0
  71. package/dist/learning/pipeline.d.ts +65 -0
  72. package/dist/learning/pipeline.d.ts.map +1 -0
  73. package/dist/learning/pipeline.js +170 -0
  74. package/dist/learning/pipeline.js.map +1 -0
  75. package/dist/learning/playbook-extractor.d.ts +113 -0
  76. package/dist/learning/playbook-extractor.d.ts.map +1 -0
  77. package/dist/learning/playbook-extractor.js +523 -0
  78. package/dist/learning/playbook-extractor.js.map +1 -0
  79. package/dist/learning/usage-inference.d.ts +82 -0
  80. package/dist/learning/usage-inference.d.ts.map +1 -0
  81. package/dist/learning/usage-inference.js +261 -0
  82. package/dist/learning/usage-inference.js.map +1 -0
  83. package/dist/mcp/index.d.ts +6 -0
  84. package/dist/mcp/index.d.ts.map +1 -0
  85. package/dist/mcp/index.js +6 -0
  86. package/dist/mcp/index.js.map +1 -0
  87. package/dist/mcp/playbook-server.d.ts +120 -0
  88. package/dist/mcp/playbook-server.d.ts.map +1 -0
  89. package/dist/mcp/playbook-server.js +427 -0
  90. package/dist/mcp/playbook-server.js.map +1 -0
  91. package/dist/memory/curated-loader.d.ts +62 -0
  92. package/dist/memory/curated-loader.d.ts.map +1 -0
  93. package/dist/memory/curated-loader.js +106 -0
  94. package/dist/memory/curated-loader.js.map +1 -0
  95. package/dist/memory/experience.d.ts +122 -0
  96. package/dist/memory/experience.d.ts.map +1 -0
  97. package/dist/memory/experience.js +392 -0
  98. package/dist/memory/experience.js.map +1 -0
  99. package/dist/memory/index.d.ts +6 -0
  100. package/dist/memory/index.d.ts.map +1 -0
  101. package/dist/memory/index.js +9 -0
  102. package/dist/memory/index.js.map +1 -0
  103. package/dist/memory/meta.d.ts +90 -0
  104. package/dist/memory/meta.d.ts.map +1 -0
  105. package/dist/memory/meta.js +362 -0
  106. package/dist/memory/meta.js.map +1 -0
  107. package/dist/memory/playbook.d.ts +133 -0
  108. package/dist/memory/playbook.d.ts.map +1 -0
  109. package/dist/memory/playbook.js +357 -0
  110. package/dist/memory/playbook.js.map +1 -0
  111. package/dist/memory/system.d.ts +167 -0
  112. package/dist/memory/system.d.ts.map +1 -0
  113. package/dist/memory/system.js +383 -0
  114. package/dist/memory/system.js.map +1 -0
  115. package/dist/runtime/backends/acp.d.ts +67 -0
  116. package/dist/runtime/backends/acp.d.ts.map +1 -0
  117. package/dist/runtime/backends/acp.js +290 -0
  118. package/dist/runtime/backends/acp.js.map +1 -0
  119. package/dist/runtime/backends/index.d.ts +5 -0
  120. package/dist/runtime/backends/index.d.ts.map +1 -0
  121. package/dist/runtime/backends/index.js +6 -0
  122. package/dist/runtime/backends/index.js.map +1 -0
  123. package/dist/runtime/backends/mock.d.ts +67 -0
  124. package/dist/runtime/backends/mock.d.ts.map +1 -0
  125. package/dist/runtime/backends/mock.js +153 -0
  126. package/dist/runtime/backends/mock.js.map +1 -0
  127. package/dist/runtime/backends/subprocess.d.ts +56 -0
  128. package/dist/runtime/backends/subprocess.d.ts.map +1 -0
  129. package/dist/runtime/backends/subprocess.js +260 -0
  130. package/dist/runtime/backends/subprocess.js.map +1 -0
  131. package/dist/runtime/flows/learning.d.ts +73 -0
  132. package/dist/runtime/flows/learning.d.ts.map +1 -0
  133. package/dist/runtime/flows/learning.js +116 -0
  134. package/dist/runtime/flows/learning.js.map +1 -0
  135. package/dist/runtime/flows/validation.d.ts +122 -0
  136. package/dist/runtime/flows/validation.d.ts.map +1 -0
  137. package/dist/runtime/flows/validation.js +223 -0
  138. package/dist/runtime/flows/validation.js.map +1 -0
  139. package/dist/runtime/index.d.ts +6 -0
  140. package/dist/runtime/index.d.ts.map +1 -0
  141. package/dist/runtime/index.js +8 -0
  142. package/dist/runtime/index.js.map +1 -0
  143. package/dist/runtime/manager.d.ts +116 -0
  144. package/dist/runtime/manager.d.ts.map +1 -0
  145. package/dist/runtime/manager.js +416 -0
  146. package/dist/runtime/manager.js.map +1 -0
  147. package/dist/runtime/types.d.ts +138 -0
  148. package/dist/runtime/types.d.ts.map +1 -0
  149. package/dist/runtime/types.js +2 -0
  150. package/dist/runtime/types.js.map +1 -0
  151. package/dist/search/evaluator.d.ts +102 -0
  152. package/dist/search/evaluator.d.ts.map +1 -0
  153. package/dist/search/evaluator.js +352 -0
  154. package/dist/search/evaluator.js.map +1 -0
  155. package/dist/search/index.d.ts +7 -0
  156. package/dist/search/index.d.ts.map +1 -0
  157. package/dist/search/index.js +11 -0
  158. package/dist/search/index.js.map +1 -0
  159. package/dist/search/refinement-loop.d.ts +73 -0
  160. package/dist/search/refinement-loop.d.ts.map +1 -0
  161. package/dist/search/refinement-loop.js +245 -0
  162. package/dist/search/refinement-loop.js.map +1 -0
  163. package/dist/search/refinement-types.d.ts +154 -0
  164. package/dist/search/refinement-types.d.ts.map +1 -0
  165. package/dist/search/refinement-types.js +99 -0
  166. package/dist/search/refinement-types.js.map +1 -0
  167. package/dist/search/router.d.ts +61 -0
  168. package/dist/search/router.d.ts.map +1 -0
  169. package/dist/search/router.js +197 -0
  170. package/dist/search/router.js.map +1 -0
  171. package/dist/search/solver.d.ts +75 -0
  172. package/dist/search/solver.d.ts.map +1 -0
  173. package/dist/search/solver.js +216 -0
  174. package/dist/search/solver.js.map +1 -0
  175. package/dist/search/verification-runner.d.ts +125 -0
  176. package/dist/search/verification-runner.d.ts.map +1 -0
  177. package/dist/search/verification-runner.js +440 -0
  178. package/dist/search/verification-runner.js.map +1 -0
  179. package/dist/surfacing/index.d.ts +2 -0
  180. package/dist/surfacing/index.d.ts.map +1 -0
  181. package/dist/surfacing/index.js +2 -0
  182. package/dist/surfacing/index.js.map +1 -0
  183. package/dist/surfacing/skill-library.d.ts +158 -0
  184. package/dist/surfacing/skill-library.d.ts.map +1 -0
  185. package/dist/surfacing/skill-library.js +429 -0
  186. package/dist/surfacing/skill-library.js.map +1 -0
  187. package/dist/types/config.d.ts +1113 -0
  188. package/dist/types/config.d.ts.map +1 -0
  189. package/dist/types/config.js +274 -0
  190. package/dist/types/config.js.map +1 -0
  191. package/dist/types/index.d.ts +9 -0
  192. package/dist/types/index.d.ts.map +1 -0
  193. package/dist/types/index.js +14 -0
  194. package/dist/types/index.js.map +1 -0
  195. package/dist/types/memory.d.ts +339 -0
  196. package/dist/types/memory.d.ts.map +1 -0
  197. package/dist/types/memory.js +207 -0
  198. package/dist/types/memory.js.map +1 -0
  199. package/dist/types/meta.d.ts +146 -0
  200. package/dist/types/meta.d.ts.map +1 -0
  201. package/dist/types/meta.js +51 -0
  202. package/dist/types/meta.js.map +1 -0
  203. package/dist/types/outcome.d.ts +42 -0
  204. package/dist/types/outcome.d.ts.map +1 -0
  205. package/dist/types/outcome.js +50 -0
  206. package/dist/types/outcome.js.map +1 -0
  207. package/dist/types/playbook.d.ts +119 -0
  208. package/dist/types/playbook.d.ts.map +1 -0
  209. package/dist/types/playbook.js +71 -0
  210. package/dist/types/playbook.js.map +1 -0
  211. package/dist/types/step.d.ts +44 -0
  212. package/dist/types/step.d.ts.map +1 -0
  213. package/dist/types/step.js +32 -0
  214. package/dist/types/step.js.map +1 -0
  215. package/dist/types/task.d.ts +91 -0
  216. package/dist/types/task.d.ts.map +1 -0
  217. package/dist/types/task.js +39 -0
  218. package/dist/types/task.js.map +1 -0
  219. package/dist/types/trajectory.d.ts +221 -0
  220. package/dist/types/trajectory.d.ts.map +1 -0
  221. package/dist/types/trajectory.js +60 -0
  222. package/dist/types/trajectory.js.map +1 -0
  223. package/dist/utils/index.d.ts +4 -0
  224. package/dist/utils/index.d.ts.map +1 -0
  225. package/dist/utils/index.js +4 -0
  226. package/dist/utils/index.js.map +1 -0
  227. package/dist/utils/similarity.d.ts +31 -0
  228. package/dist/utils/similarity.d.ts.map +1 -0
  229. package/dist/utils/similarity.js +107 -0
  230. package/dist/utils/similarity.js.map +1 -0
  231. package/dist/utils/storage.d.ts +106 -0
  232. package/dist/utils/storage.d.ts.map +1 -0
  233. package/dist/utils/storage.js +203 -0
  234. package/dist/utils/storage.js.map +1 -0
  235. package/dist/utils/validation.d.ts +129 -0
  236. package/dist/utils/validation.d.ts.map +1 -0
  237. package/dist/utils/validation.js +171 -0
  238. package/dist/utils/validation.js.map +1 -0
  239. package/package.json +61 -9
  240. package/scripts/migrate-to-playbooks.ts +307 -0
  241. package/src/agents/index.ts +14 -0
  242. package/src/agents/mock-provider.ts +93 -0
  243. package/src/agents/types.ts +137 -0
  244. package/src/atlas.ts +560 -0
  245. package/src/bin/cognitive-core.ts +470 -0
  246. package/src/embeddings/bm25.ts +337 -0
  247. package/src/embeddings/index.ts +39 -0
  248. package/src/embeddings/manager.ts +288 -0
  249. package/src/embeddings/provider.ts +311 -0
  250. package/src/embeddings/vector-store.ts +353 -0
  251. package/src/factory.ts +263 -0
  252. package/src/index.ts +246 -0
  253. package/src/learning/analyzer.ts +335 -0
  254. package/src/learning/effectiveness.ts +428 -0
  255. package/src/learning/index.ts +58 -0
  256. package/src/learning/llm-extractor.ts +542 -0
  257. package/src/learning/meta-learner.ts +516 -0
  258. package/src/learning/pipeline.ts +244 -0
  259. package/src/learning/playbook-extractor.ts +702 -0
  260. package/src/learning/usage-inference.ts +372 -0
  261. package/src/mcp/index.ts +12 -0
  262. package/src/mcp/playbook-server.ts +565 -0
  263. package/src/memory/curated-loader.ts +160 -0
  264. package/src/memory/experience.ts +515 -0
  265. package/src/memory/index.ts +27 -0
  266. package/src/memory/meta.ts +506 -0
  267. package/src/memory/playbook.ts +493 -0
  268. package/src/memory/system.ts +551 -0
  269. package/src/runtime/backends/acp.ts +378 -0
  270. package/src/runtime/backends/index.ts +24 -0
  271. package/src/runtime/backends/mock.ts +218 -0
  272. package/src/runtime/backends/subprocess.ts +356 -0
  273. package/src/runtime/flows/learning.ts +183 -0
  274. package/src/runtime/flows/validation.ts +381 -0
  275. package/src/runtime/index.ts +53 -0
  276. package/src/runtime/manager.ts +541 -0
  277. package/src/runtime/types.ts +157 -0
  278. package/src/search/evaluator.ts +474 -0
  279. package/src/search/index.ts +59 -0
  280. package/src/search/refinement-loop.ts +363 -0
  281. package/src/search/refinement-types.ts +159 -0
  282. package/src/search/router.ts +261 -0
  283. package/src/search/solver.ts +303 -0
  284. package/src/search/verification-runner.ts +570 -0
  285. package/src/surfacing/index.ts +6 -0
  286. package/src/surfacing/skill-library.ts +594 -0
  287. package/src/types/config.ts +333 -0
  288. package/src/types/index.ts +130 -0
  289. package/src/types/memory.ts +270 -0
  290. package/src/types/meta.ts +218 -0
  291. package/src/types/outcome.ts +66 -0
  292. package/src/types/playbook.ts +196 -0
  293. package/src/types/step.ts +40 -0
  294. package/src/types/task.ts +52 -0
  295. package/src/types/trajectory.ts +80 -0
  296. package/src/utils/index.ts +38 -0
  297. package/src/utils/similarity.ts +139 -0
  298. package/src/utils/storage.ts +249 -0
  299. package/src/utils/validation.ts +286 -0
  300. package/tests/embeddings/bm25.test.ts +130 -0
  301. package/tests/embeddings/manager.test.ts +205 -0
  302. package/tests/integration/atlas.test.ts +266 -0
  303. package/tests/integration/e2e.test.ts +929 -0
  304. package/tests/learning/analyzer.test.ts +426 -0
  305. package/tests/learning/effectiveness.test.ts +542 -0
  306. package/tests/learning/pipeline.test.ts +176 -0
  307. package/tests/learning/playbook-extractor-provenance.test.ts +114 -0
  308. package/tests/learning/usage-inference.test.ts +254 -0
  309. package/tests/mcp/playbook-server.test.ts +252 -0
  310. package/tests/memory/experience.test.ts +198 -0
  311. package/tests/memory/playbook.test.ts +338 -0
  312. package/tests/memory/provenance.test.ts +639 -0
  313. package/tests/memory/system.test.ts +325 -0
  314. package/tests/runtime/agent-manager.test.ts +512 -0
  315. package/tests/runtime/mock-backend.test.ts +248 -0
  316. package/tests/search/refinement-loop.test.ts +468 -0
  317. package/tests/search/refinement.test.ts +267 -0
  318. package/tests/search/router.test.ts +427 -0
  319. package/tests/surfacing/skill-library.test.ts +292 -0
  320. package/tests/types/outcome.test.ts +147 -0
  321. package/tests/types/step.test.ts +133 -0
  322. package/tests/types/task.test.ts +158 -0
  323. package/tests/types/trajectory.test.ts +253 -0
  324. package/tests/utils/similarity.test.ts +188 -0
  325. package/tests/utils/validation.test.ts +252 -0
  326. package/tsconfig.json +25 -0
  327. package/vitest.config.ts +22 -0
  328. package/index.d.ts +0 -4
  329. package/index.js +0 -4
@@ -0,0 +1,381 @@
1
+ import type { Task } from '../../types/index.js';
2
+ import type { MemorySystem } from '../../memory/system.js';
3
+ import type { AgentManager } from '../manager.js';
4
+ import type { AgentResult } from '../types.js';
5
+
6
/**
 * Options that control how a ValidationFlow executes its tasks.
 */
export interface ValidationFlowConfig {
  /** Name of the agent type passed to the manager for every run. */
  agentType: string;
  /**
   * How many times each task is executed, so metrics can be averaged.
   * The flow uses 1 when this is omitted.
   */
  runsPerTask?: number;
  /**
   * When enabled, each task is also run as a baseline (without knowledge)
   * so the two sets of metrics can be compared. The flow's constructor
   * defaults this to true.
   */
  runComparison?: boolean;
  /**
   * Per-run timeout in milliseconds, forwarded to the manager.
   * The flow's constructor defaults this to 300000 (5 minutes).
   */
  taskTimeout?: number;
}
19
+
20
/**
 * Summary statistics computed over a set of agent runs.
 */
export interface ValidationMetrics {
  /** Fraction of runs that succeeded, in the range 0-1. */
  successRate: number;
  /** Mean run time in milliseconds. */
  avgTime: number;
  /** Mean number of tool calls per run. */
  avgToolCalls: number;
  /** Standard deviation of the run times, in milliseconds. */
  timeStdDev: number;
  /** Number of runs the statistics were computed from. */
  totalRuns: number;
}
35
+
36
/**
 * Side-by-side metrics for runs with knowledge and baseline runs,
 * plus the derived improvement figures.
 */
export interface ComparisonResult {
  /** Metrics of the runs executed with knowledge injection. */
  withKnowledge: ValidationMetrics;
  /** Metrics of the baseline runs (no knowledge). */
  baseline: ValidationMetrics;
  /** Derived deltas; positive values mean the knowledge runs did better. */
  improvement: {
    /** Success rate with knowledge minus baseline success rate (a 0-1 fraction). */
    successRateDelta: number;
    /** Percentage of baseline average time saved by the knowledge runs. */
    timeDeltaPercent: number;
    /** Percentage of baseline average tool calls saved by the knowledge runs. */
    toolCallDeltaPercent: number;
    /**
     * Heuristic flag the flow derives from the average-time gap and the
     * time standard deviations. It is a boolean rule of thumb, not a
     * formal p-value.
     */
    isSignificant: boolean;
  };
}
53
+
54
/**
 * Everything a multi-task validation pass produces.
 */
export interface ValidationFlowResult {
  /** One entry per validated task: the task, its raw runs and their metrics. */
  taskResults: {
    task: Task;
    results: AgentResult[];
    metrics: ValidationMetrics;
  }[];
  /** Metrics computed over the runs of all tasks together. */
  aggregateMetrics: ValidationMetrics;
  /** Aggregate comparison against the baseline; absent when no baseline was run. */
  comparison?: ComparisonResult;
}
69
+
70
/**
 * Validation Flow
 * Orchestrates: Knowledge injection → Agent execution → Performance measurement
 *
 * This flow is for the "injection" direction:
 * - Test how well learning is working
 * - Compare performance with vs without knowledge
 * - Measure improvement over time
 *
 * Runs are issued through the supplied manager: `spawn` for the
 * knowledge-injected runs and `spawnBaseline` for the baseline runs.
 */
export class ValidationFlow {
  private readonly manager: AgentManager;
  private readonly config: ValidationFlowConfig;

  /**
   * @param manager Agent manager used to execute every run.
   * @param _memory Reserved for future memory state tracking; currently unused.
   * @param config Flow options. Omitted or `undefined` options fall back to
   *   1 run per task, comparison enabled and a 5 minute timeout.
   */
  constructor(
    manager: AgentManager,
    _memory: MemorySystem, // Reserved for future memory state tracking
    config: ValidationFlowConfig
  ) {
    this.manager = manager;
    // Resolve defaults with `??` instead of relying on spread order: a caller
    // passing an explicit `undefined` must not erase a default.
    this.config = {
      ...config,
      runsPerTask: config.runsPerTask ?? 1,
      runComparison: config.runComparison ?? true,
      taskTimeout: config.taskTimeout ?? 300000, // 5 minutes
    };
  }

  /**
   * Validate on a single task.
   *
   * @param task Task to execute.
   * @returns The task, its knowledge-injected runs, their metrics and, when
   *   comparison is enabled and baseline runs exist, the comparison.
   */
  async validateTask(task: Task): Promise<{
    task: Task;
    results: AgentResult[];
    metrics: ValidationMetrics;
    comparison?: ComparisonResult;
  }> {
    const { results, baselineResults } = await this.runTask(task);
    const metrics = this.calculateMetrics(results);

    let comparison: ComparisonResult | undefined;
    if (baselineResults.length > 0) {
      comparison = this.calculateComparison(
        metrics,
        this.calculateMetrics(baselineResults)
      );
    }

    return { task, results, metrics, comparison };
  }

  /**
   * Validate on multiple tasks.
   *
   * Tasks are processed one after another. Each baseline run is executed
   * once and reused for the aggregate comparison, so the per-task and the
   * aggregate figures describe the same sample.
   *
   * @param tasks Tasks to execute.
   * @returns Per-task results, aggregate metrics and, when baseline runs
   *   exist, the aggregate comparison.
   */
  async validate(tasks: Task[]): Promise<ValidationFlowResult> {
    const taskResults: ValidationFlowResult['taskResults'] = [];
    const allResults: AgentResult[] = [];
    const allBaselineResults: AgentResult[] = [];

    for (const task of tasks) {
      const { results, baselineResults } = await this.runTask(task);
      taskResults.push({
        task,
        results,
        metrics: this.calculateMetrics(results),
      });
      allResults.push(...results);
      allBaselineResults.push(...baselineResults);
    }

    const aggregateMetrics = this.calculateMetrics(allResults);
    let comparison: ComparisonResult | undefined;

    if (allBaselineResults.length > 0) {
      comparison = this.calculateComparison(
        aggregateMetrics,
        this.calculateMetrics(allBaselineResults)
      );
    }

    return {
      taskResults,
      aggregateMetrics,
      comparison,
    };
  }

  /**
   * Quick validation - single run per task, with comparison.
   *
   * For each task the knowledge run and the baseline run are started
   * together and awaited as a pair. Only success/failure is recorded.
   *
   * @param tasks Tasks to execute.
   * @returns Success rates for both variants, their difference, and the
   *   per-task outcomes. All rates are 0 when `tasks` is empty.
   */
  async quickValidate(tasks: Task[]): Promise<{
    successRateWithKnowledge: number;
    successRateBaseline: number;
    improvement: number;
    details: Array<{
      task: Task;
      withKnowledge: boolean;
      baseline: boolean;
    }>;
  }> {
    const details: Array<{
      task: Task;
      withKnowledge: boolean;
      baseline: boolean;
    }> = [];

    for (const task of tasks) {
      const [withKnowledge, baseline] = await Promise.all([
        this.manager.spawn({
          agentType: this.config.agentType,
          task,
          timeout: this.config.taskTimeout,
        }),
        this.manager.spawnBaseline({
          agentType: this.config.agentType,
          task,
          timeout: this.config.taskTimeout,
        }),
      ]);

      details.push({
        task,
        withKnowledge: withKnowledge.success,
        baseline: baseline.success,
      });
    }

    // Guard the division: with no tasks the rates would otherwise be NaN.
    const total = details.length;
    const successRateWithKnowledge =
      total === 0 ? 0 : details.filter((d) => d.withKnowledge).length / total;
    const successRateBaseline =
      total === 0 ? 0 : details.filter((d) => d.baseline).length / total;

    return {
      successRateWithKnowledge,
      successRateBaseline,
      improvement: successRateWithKnowledge - successRateBaseline,
      details,
    };
  }

  /**
   * Execute all configured runs for one task: first the knowledge-injected
   * runs, then (only when comparison is enabled) the baseline runs.
   * Runs are awaited one at a time.
   */
  private async runTask(task: Task): Promise<{
    results: AgentResult[];
    baselineResults: AgentResult[];
  }> {
    const runsPerTask = this.config.runsPerTask ?? 1;
    const results: AgentResult[] = [];
    const baselineResults: AgentResult[] = [];

    // Run with knowledge injection
    for (let i = 0; i < runsPerTask; i++) {
      results.push(
        await this.manager.spawn({
          agentType: this.config.agentType,
          task,
          timeout: this.config.taskTimeout,
          captureToolCalls: true,
        })
      );
    }

    // Run baseline if comparison enabled
    if (this.config.runComparison) {
      for (let i = 0; i < runsPerTask; i++) {
        baselineResults.push(
          await this.manager.spawnBaseline({
            agentType: this.config.agentType,
            task,
            timeout: this.config.taskTimeout,
            captureToolCalls: true,
          })
        );
      }
    }

    return { results, baselineResults };
  }

  /**
   * Calculate metrics from results.
   *
   * @returns All-zero metrics for an empty input; otherwise success rate,
   *   mean time, mean tool calls and the population standard deviation of
   *   the run times.
   */
  private calculateMetrics(results: AgentResult[]): ValidationMetrics {
    if (results.length === 0) {
      return {
        successRate: 0,
        avgTime: 0,
        avgToolCalls: 0,
        timeStdDev: 0,
        totalRuns: 0,
      };
    }

    const successCount = results.filter((r) => r.success).length;
    const times = results.map((r) => r.metrics.totalTime);
    const toolCalls = results.map((r) => r.metrics.toolCallCount);

    const avgTime = times.reduce((a, b) => a + b, 0) / times.length;
    const avgToolCalls =
      toolCalls.reduce((a, b) => a + b, 0) / toolCalls.length;

    // Population variance (divides by N, not N - 1)
    const timeVariance =
      times.reduce((sum, t) => sum + Math.pow(t - avgTime, 2), 0) /
      times.length;
    const timeStdDev = Math.sqrt(timeVariance);

    return {
      successRate: successCount / results.length,
      avgTime,
      avgToolCalls,
      timeStdDev,
      totalRuns: results.length,
    };
  }

  /**
   * Calculate comparison between two sets of metrics.
   *
   * Percent deltas are relative to the baseline and are 0 when the baseline
   * value is 0. Positive values mean the knowledge runs used less.
   */
  private calculateComparison(
    withKnowledge: ValidationMetrics,
    baseline: ValidationMetrics
  ): ComparisonResult {
    const successRateDelta =
      withKnowledge.successRate - baseline.successRate;

    const timeDeltaPercent =
      baseline.avgTime > 0
        ? ((baseline.avgTime - withKnowledge.avgTime) / baseline.avgTime) * 100
        : 0;

    const toolCallDeltaPercent =
      baseline.avgToolCalls > 0
        ? ((baseline.avgToolCalls - withKnowledge.avgToolCalls) /
            baseline.avgToolCalls) *
          100
        : 0;

    // Simple significance test (would need proper statistical test in production)
    // Using a rough heuristic: significant if improvement > 2 std devs.
    // A side with a single run always has a standard deviation of 0, which
    // would flag any non-zero gap; require repeated runs on both sides.
    const hasRepeatedRuns =
      withKnowledge.totalRuns > 1 && baseline.totalRuns > 1;
    const isSignificant =
      hasRepeatedRuns &&
      Math.abs(withKnowledge.avgTime - baseline.avgTime) >
        2 * Math.max(withKnowledge.timeStdDev, baseline.timeStdDev);

    return {
      withKnowledge,
      baseline,
      improvement: {
        successRateDelta,
        timeDeltaPercent,
        toolCallDeltaPercent,
        isSignificant,
      },
    };
  }

  /**
   * Generate a validation report.
   *
   * @param result Output of `validate`.
   * @returns A Markdown document with aggregate metrics, the comparison
   *   table when a comparison is present, and one section per task.
   */
  generateReport(result: ValidationFlowResult): string {
    const maxTitleLength = 50;

    const lines: string[] = [
      '# Atlas Validation Report',
      '',
      '## Aggregate Metrics',
      `- Success Rate: ${(result.aggregateMetrics.successRate * 100).toFixed(1)}%`,
      `- Avg Time: ${result.aggregateMetrics.avgTime.toFixed(0)}ms`,
      `- Avg Tool Calls: ${result.aggregateMetrics.avgToolCalls.toFixed(1)}`,
      `- Total Runs: ${result.aggregateMetrics.totalRuns}`,
      '',
    ];

    if (result.comparison) {
      const c = result.comparison;
      lines.push(
        '## Comparison (With Knowledge vs Baseline)',
        '',
        '| Metric | With Knowledge | Baseline | Improvement |',
        '|--------|---------------|----------|-------------|',
        `| Success Rate | ${(c.withKnowledge.successRate * 100).toFixed(1)}% | ${(c.baseline.successRate * 100).toFixed(1)}% | ${c.improvement.successRateDelta > 0 ? '+' : ''}${(c.improvement.successRateDelta * 100).toFixed(1)}% |`,
        `| Avg Time | ${c.withKnowledge.avgTime.toFixed(0)}ms | ${c.baseline.avgTime.toFixed(0)}ms | ${c.improvement.timeDeltaPercent > 0 ? '+' : ''}${c.improvement.timeDeltaPercent.toFixed(1)}% faster |`,
        `| Avg Tool Calls | ${c.withKnowledge.avgToolCalls.toFixed(1)} | ${c.baseline.avgToolCalls.toFixed(1)} | ${c.improvement.toolCallDeltaPercent > 0 ? '+' : ''}${c.improvement.toolCallDeltaPercent.toFixed(1)}% fewer |`,
        '',
        `Statistical Significance: ${c.improvement.isSignificant ? '✓ Yes' : '✗ No'}`,
        ''
      );
    }

    lines.push(
      '## Per-Task Results',
      ''
    );

    for (const taskResult of result.taskResults) {
      const description = taskResult.task.description;
      // Only mark the heading with an ellipsis when text was actually cut.
      const title =
        description.length > maxTitleLength
          ? `${description.slice(0, maxTitleLength)}...`
          : description;

      lines.push(
        `### ${title}`,
        `- Success Rate: ${(taskResult.metrics.successRate * 100).toFixed(1)}%`,
        `- Avg Time: ${taskResult.metrics.avgTime.toFixed(0)}ms`,
        ''
      );
    }

    return lines.join('\n');
  }
}
371
+
372
/**
 * Factory for {@link ValidationFlow}.
 *
 * @param manager Agent manager the flow uses to execute runs.
 * @param memory Memory system handed to the flow's constructor.
 * @param config Flow options.
 * @returns A new flow built from the three arguments, unchanged.
 */
export function createValidationFlow(
  manager: AgentManager,
  memory: MemorySystem,
  config: ValidationFlowConfig
): ValidationFlow {
  const flow = new ValidationFlow(manager, memory, config);
  return flow;
}
@@ -0,0 +1,53 @@
1
+ // Types
2
+ export type {
3
+ AgentMessage,
4
+ ToolCall,
5
+ AgentState,
6
+ AgentSession,
7
+ AgentSpawnConfig,
8
+ AgentResult,
9
+ AgentBackend,
10
+ KnowledgeInjector,
11
+ TrajectoryExtractor,
12
+ AgentObserverCallbacks,
13
+ } from './types.js';
14
+
15
+ // Manager
16
+ export {
17
+ AgentManager,
18
+ createAgentManager,
19
+ DefaultKnowledgeInjector,
20
+ DefaultTrajectoryExtractor,
21
+ } from './manager.js';
22
+
23
+ // Backends
24
+ export {
25
+ SubprocessBackend,
26
+ createSubprocessBackend,
27
+ claudeCodeConfig,
28
+ type SubprocessAgentConfig,
29
+ MockBackend,
30
+ createMockBackend,
31
+ type MockAgentBehavior,
32
+ ACPBackend,
33
+ createACPBackend,
34
+ claudeCodeACPConfig,
35
+ claudeCodeDirectConfig,
36
+ type ACPAgentConfig,
37
+ } from './backends/index.js';
38
+
39
+ // Flows
40
+ export {
41
+ LearningFlow,
42
+ createLearningFlow,
43
+ type LearningFlowConfig,
44
+ type LearningFlowResult,
45
+ } from './flows/learning.js';
46
+
47
// Validation flow: the class, its factory, and every public type it exposes.
// ComparisonResult is included because it is the type of
// ValidationFlowResult['comparison'] and of validateTask()'s `comparison`.
export {
  ValidationFlow,
  createValidationFlow,
  type ValidationFlowConfig,
  type ValidationFlowResult,
  type ValidationMetrics,
  type ComparisonResult,
} from './flows/validation.js';