midas-mcp 5.23.0 → 5.43.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/analyzer.d.ts.map +1 -1
- package/dist/analyzer.js +9 -2
- package/dist/analyzer.js.map +1 -1
- package/dist/context.d.ts +264 -5
- package/dist/context.d.ts.map +1 -1
- package/dist/context.js +634 -7
- package/dist/context.js.map +1 -1
- package/dist/experiments/recursive-planning/recursive-session.d.ts +201 -0
- package/dist/experiments/recursive-planning/recursive-session.d.ts.map +1 -0
- package/dist/experiments/recursive-planning/recursive-session.js +348 -0
- package/dist/experiments/recursive-planning/recursive-session.js.map +1 -0
- package/dist/experiments/recursive-planning/recursive-session.test.d.ts +19 -0
- package/dist/experiments/recursive-planning/recursive-session.test.d.ts.map +1 -0
- package/dist/experiments/recursive-planning/recursive-session.test.js +799 -0
- package/dist/experiments/recursive-planning/recursive-session.test.js.map +1 -0
- package/dist/gameplan-tracker.d.ts +63 -0
- package/dist/gameplan-tracker.d.ts.map +1 -0
- package/dist/gameplan-tracker.js +330 -0
- package/dist/gameplan-tracker.js.map +1 -0
- package/dist/preflight.d.ts +140 -0
- package/dist/preflight.d.ts.map +1 -0
- package/dist/preflight.js +1100 -0
- package/dist/preflight.js.map +1 -0
- package/dist/security.d.ts.map +1 -1
- package/dist/security.js +22 -2
- package/dist/security.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +9 -4
- package/dist/server.js.map +1 -1
- package/dist/state/phase.d.ts.map +1 -1
- package/dist/state/phase.js +58 -8
- package/dist/state/phase.js.map +1 -1
- package/dist/tools/gameplan.d.ts +49 -0
- package/dist/tools/gameplan.d.ts.map +1 -0
- package/dist/tools/gameplan.js +79 -0
- package/dist/tools/gameplan.js.map +1 -0
- package/dist/tools/index.d.ts +2 -1
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +6 -2
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/preflight.d.ts +121 -0
- package/dist/tools/preflight.d.ts.map +1 -0
- package/dist/tools/preflight.js +144 -0
- package/dist/tools/preflight.js.map +1 -0
- package/dist/tools/verify.js +2 -2
- package/dist/tools/verify.js.map +1 -1
- package/dist/tracker.d.ts.map +1 -1
- package/dist/tracker.js +38 -2
- package/dist/tracker.js.map +1 -1
- package/dist/tui.js +53 -53
- package/dist/tui.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,799 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Recursive Planning Session - Tests
|
|
3
|
+
*
|
|
4
|
+
* TDD-first implementation of TRM-inspired recursive planning.
|
|
5
|
+
*
|
|
6
|
+
* Core concepts from the TRM paper:
|
|
7
|
+
* - x: input/requirements (stable across iterations)
|
|
8
|
+
* - z: latent reasoning state (accumulated learning, like chain-of-thought)
|
|
9
|
+
* - y: current answer/implementation state
|
|
10
|
+
*
|
|
11
|
+
* Key behaviors:
|
|
12
|
+
* 1. Deep supervision: multiple refinement iterations
|
|
13
|
+
* 2. Recursive z refinement: reasoning improves with each cycle
|
|
14
|
+
* 3. Answer refinement: y improves given z
|
|
15
|
+
* 4. Adaptive halting: stop when correct, don't waste iterations
|
|
16
|
+
* 5. State persistence: z carries forward, preventing "forgetting"
|
|
17
|
+
*/
|
|
18
|
+
import { describe, it, beforeEach, afterEach } from 'node:test';
import assert from 'node:assert';
import { mkdirSync, mkdtempSync, rmSync, existsSync, readFileSync } from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';
import { createSession, refineReasoning, refineAnswer, checkHalt, runIteration, runSession, serializeState, deserializeState, calculateConfidence, mergeReasoning, } from './recursive-session.js';
|
|
24
|
+
// ============================================================================
|
|
25
|
+
// TEST UTILITIES
|
|
26
|
+
// ============================================================================
|
|
27
|
+
// Scratch directory of the test currently running; reassigned in every suite's beforeEach.
let testDir;
// Per-process counter embedded in the directory name so successive dirs are easy to tell apart.
let sessionCounter = 0;
/**
 * Create a fresh, uniquely named scratch directory under the OS temp dir.
 *
 * mkdtempSync appends a random suffix and fails rather than reusing an
 * existing directory, so two processes running this file at the same moment
 * can never end up sharing (and later deleting) each other's directory.
 *
 * @returns {string} Path of the newly created directory.
 */
function createTestDir() {
    const prefix = join(tmpdir(), `recursive-planning-test-${++sessionCounter}-`);
    return mkdtempSync(prefix);
}
|
|
34
|
+
/**
 * Best-effort removal of a scratch directory and everything inside it.
 *
 * @param {string} dir - Directory to delete; a path that does not exist is a no-op.
 * @returns {void}
 */
function cleanup(dir) {
    const removalOptions = { recursive: true, force: true };
    try {
        if (!existsSync(dir)) {
            return;
        }
        rmSync(dir, removalOptions);
    }
    catch {
        // Teardown is best-effort: a leftover temp dir must not fail the test run.
    }
}
|
|
44
|
+
// Mock refiners for testing
|
|
45
|
+
/**
 * Mock reasoning refiner: appends one numbered "iteration:N" entry to z.
 *
 * @param {string} x - Requirements, quoted verbatim in the new entry.
 * @param {string} y - Current answer, quoted verbatim in the new entry.
 * @param {string} z - Reasoning so far; kept unchanged as the prefix of the result.
 * @returns {string} z, a newline, then the new entry.
 */
function mockReasoningRefiner(x, y, z) {
    // N is one more than the number of "iteration:" markers already present in z.
    const priorMarkers = z.match(/iteration:/g) ?? [];
    const nextNumber = priorMarkers.length + 1;
    const entry = `iteration:${nextNumber} analyzed "${x}" with current "${y}"`;
    return [z, entry].join('\n');
}
|
|
50
|
+
/**
 * Mock answer refiner: tags y with how many "iteration:" markers z contains.
 *
 * @param {string} y - Current answer; kept unchanged as the prefix of the result.
 * @param {string} z - Reasoning whose "iteration:" markers are counted.
 * @returns {string} y followed by " [improved xN]".
 */
function mockAnswerRefiner(y, z) {
    const markers = z.match(/iteration:/g);
    const markerCount = markers === null ? 0 : markers.length;
    return `${y} [improved x${markerCount}]`;
}
|
|
55
|
+
/**
 * Mock halt checker: halts once y contains "improved" at least three times.
 *
 * @param {string} x - Requirements (not used by this mock).
 * @param {string} y - Current answer whose "improved" occurrences are counted.
 * @param {string} z - Reasoning (not used by this mock).
 * @returns {{shouldHalt: boolean, confidence: number, reason: string}}
 *   Confidence is 30 per occurrence, capped at 100.
 */
function mockHaltChecker(x, y, z) {
    const occurrences = y.match(/improved/g);
    const improvementCount = occurrences ? occurrences.length : 0;
    const confidence = Math.min(100, improvementCount * 30);
    if (improvementCount >= 3) {
        return { shouldHalt: true, confidence, reason: 'Sufficient improvements' };
    }
    return { shouldHalt: false, confidence, reason: 'More iterations needed' };
}
|
|
64
|
+
// ============================================================================
|
|
65
|
+
// SESSION STATE TESTS
|
|
66
|
+
// ============================================================================
|
|
67
|
+
// Suite: defaults applied by createSession and round-tripping of state through
// serializeState / deserializeState.
describe('RecursiveSession State Management', () => {
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    describe('createSession', () => {
        it('should create session with initial state', () => {
            const { state } = createSession({ x: 'Build a todo app', projectPath: testDir });
            assert.strictEqual(state.x, 'Build a todo app');
            // Both the answer (y) and the reasoning (z) start out empty.
            assert.strictEqual(state.y, '');
            assert.strictEqual(state.z, '');
            assert.strictEqual(state.iteration, 0);
            assert.strictEqual(state.halted, false);
        });
        it('should accept initial y and z', () => {
            const options = {
                x: 'Build a todo app',
                y: 'Initial implementation',
                z: 'Prior reasoning',
                projectPath: testDir,
            };
            const { state } = createSession(options);
            assert.strictEqual(state.y, 'Initial implementation');
            assert.strictEqual(state.z, 'Prior reasoning');
        });
        it('should set max iterations', () => {
            const { config } = createSession({
                x: 'Build a todo app',
                projectPath: testDir,
                maxIterations: 8,
            });
            assert.strictEqual(config.maxIterations, 8);
        });
        it('should default to 16 max iterations (per TRM paper)', () => {
            const { config } = createSession({ x: 'Build a todo app', projectPath: testDir });
            assert.strictEqual(config.maxIterations, 16);
        });
        it('should generate unique session ID', () => {
            const first = createSession({ x: 'A', projectPath: testDir });
            const second = createSession({ x: 'B', projectPath: testDir });
            assert.notStrictEqual(first.id, second.id);
        });
        it('should record creation timestamp', () => {
            // The timestamp must fall inside the window bracketing the call.
            const earliest = Date.now();
            const { createdAt } = createSession({ x: 'A', projectPath: testDir });
            const latest = Date.now();
            assert.ok(createdAt >= earliest);
            assert.ok(createdAt <= latest);
        });
    });
    describe('State Serialization', () => {
        it('should serialize state to JSON', () => {
            const parsed = JSON.parse(serializeState({
                x: 'Build a todo app',
                y: 'Current code',
                z: 'Reasoning so far',
                iteration: 5,
                halted: false,
                haltReason: null,
                history: [],
            }));
            assert.strictEqual(parsed.x, 'Build a todo app');
            assert.strictEqual(parsed.iteration, 5);
        });
        it('should deserialize state from JSON', () => {
            const payload = JSON.stringify({
                x: 'Build a todo app',
                y: 'Current code',
                z: 'Reasoning',
                iteration: 3,
                halted: false,
                haltReason: null,
                history: [],
            });
            const restored = deserializeState(payload);
            assert.strictEqual(restored.x, 'Build a todo app');
            assert.strictEqual(restored.iteration, 3);
        });
        it('should handle corrupted JSON gracefully', () => {
            // Unparseable input is expected to yield a blank state, not an exception.
            const fallback = deserializeState('{ invalid json }}}');
            assert.strictEqual(fallback.x, '');
            assert.strictEqual(fallback.iteration, 0);
        });
        it('should preserve history across serialization', () => {
            const history = [
                { iteration: 1, z: 'First reasoning', y: 'First answer', confidence: 30 },
                { iteration: 2, z: 'Second reasoning', y: 'Second answer', confidence: 60 },
            ];
            const original = {
                x: 'Test',
                y: 'Answer',
                z: 'Reasoning',
                iteration: 2,
                halted: false,
                haltReason: null,
                history,
            };
            const roundTripped = deserializeState(serializeState(original));
            assert.strictEqual(roundTripped.history.length, 2);
            assert.strictEqual(roundTripped.history[0].confidence, 30);
        });
    });
});
|
|
179
|
+
// ============================================================================
|
|
180
|
+
// REFINEMENT TESTS
|
|
181
|
+
// ============================================================================
|
|
182
|
+
// Suite: the individual refinement primitives - refineReasoning (z),
// refineAnswer (y) and mergeReasoning.
describe('Recursive Refinement', () => {
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    describe('refineReasoning (z refinement)', () => {
        it('should improve z given x, y, and current z', () => {
            const { z: refined } = refineReasoning({
                x: 'Build a todo app',
                y: 'Empty project',
                z: 'Initial thoughts',
                refiner: mockReasoningRefiner,
            });
            assert.ok(refined.includes('iteration:1'));
            // The earlier reasoning must still be present in the refined value.
            assert.ok(refined.includes('Initial thoughts'));
        });
        it('should accumulate reasoning across multiple calls', () => {
            // Feed each call's z into the next one, starting from empty reasoning.
            const answers = ['Code v1', 'Code v2', 'Code v3'];
            const finalZ = answers.reduce((z, y) => refineReasoning({ x: 'Task', y, z, refiner: mockReasoningRefiner }).z, '');
            for (const marker of ['iteration:1', 'iteration:2', 'iteration:3']) {
                assert.ok(finalZ.includes(marker));
            }
        });
        it('should record refinement duration', () => {
            const { duration } = refineReasoning({
                x: 'Task',
                y: 'Code',
                z: '',
                refiner: mockReasoningRefiner,
            });
            assert.ok(duration >= 0);
        });
    });
    describe('refineAnswer (y refinement)', () => {
        it('should improve y given current y and z', () => {
            const reasoning = 'iteration:1 analyzed\niteration:2 analyzed';
            const { y: refined } = refineAnswer({
                y: 'Initial code',
                z: reasoning,
                refiner: mockAnswerRefiner,
            });
            assert.ok(refined.includes('improved'));
            assert.ok(refined.includes('Initial code'));
        });
        it('should not modify y if z is empty', () => {
            // Refiner that only touches y when there is some reasoning to act on.
            const conditionalRefiner = (y, z) => z ? `${y} improved` : y;
            const { y: refined } = refineAnswer({ y: 'Code', z: '', refiner: conditionalRefiner });
            assert.strictEqual(refined, 'Code');
        });
    });
    describe('mergeReasoning', () => {
        it('should combine old and new reasoning', () => {
            const combined = mergeReasoning('Old insight: use React', 'New insight: add TypeScript');
            assert.ok(combined.includes('Old insight'));
            assert.ok(combined.includes('New insight'));
        });
        it('should handle empty old reasoning', () => {
            assert.strictEqual(mergeReasoning('', 'First insight'), 'First insight');
        });
        it('should handle empty new reasoning', () => {
            assert.strictEqual(mergeReasoning('Old insight', ''), 'Old insight');
        });
        it('should cap reasoning length to prevent unbounded growth', () => {
            const oversizedOld = 'x'.repeat(10000);
            const oversizedNew = 'y'.repeat(10000);
            const { length } = mergeReasoning(oversizedOld, oversizedNew);
            // Two 10000-char inputs must come back at no more than 8000 chars.
            assert.ok(length <= 8000);
        });
    });
});
|
|
262
|
+
// ============================================================================
|
|
263
|
+
// HALTING TESTS
|
|
264
|
+
// ============================================================================
|
|
265
|
+
// Suite: halting decisions (checkHalt) and the confidence score
// (calculateConfidence).
describe('Adaptive Halting', () => {
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    describe('checkHalt', () => {
        it('should return shouldHalt=true when criteria met', () => {
            const decision = checkHalt({
                x: 'Task',
                y: 'Final answer [improved x1] [improved x2] [improved x3]',
                z: 'Reasoning',
                checker: mockHaltChecker,
            });
            assert.strictEqual(decision.shouldHalt, true);
            // mockHaltChecker reports 30 per improvement, so three give 90.
            assert.ok(decision.confidence >= 90);
        });
        it('should return shouldHalt=false when more work needed', () => {
            const decision = checkHalt({
                x: 'Task',
                y: 'Initial answer',
                z: 'Reasoning',
                checker: mockHaltChecker,
            });
            assert.strictEqual(decision.shouldHalt, false);
        });
        it('should provide halt reason', () => {
            const decision = checkHalt({
                x: 'Task',
                y: 'Final answer [improved x1] [improved x2] [improved x3]',
                z: 'Reasoning',
                checker: mockHaltChecker,
            });
            assert.ok(decision.reason.length > 0);
        });
        it('should halt at max iterations even if not confident', () => {
            // y has no improvements, so the checker alone would not halt;
            // only the iteration === maxIterations limit can.
            const decision = checkHalt({
                x: 'Task',
                y: 'Partial answer',
                z: 'Reasoning',
                iteration: 16,
                maxIterations: 16,
                checker: mockHaltChecker,
            });
            assert.strictEqual(decision.shouldHalt, true);
            assert.ok(decision.reason.includes('max'));
        });
    });
    describe('calculateConfidence', () => {
        it('should return 0 for empty answer', () => {
            const conf = calculateConfidence('Task', '', 'Reasoning');
            assert.strictEqual(conf, 0);
        });
        it('should increase with more reasoning iterations', () => {
            const z1 = 'iteration:1';
            const z2 = 'iteration:1\niteration:2\niteration:3';
            const conf1 = calculateConfidence('Task', 'Answer', z1);
            const conf2 = calculateConfidence('Task', 'Answer', z2);
            assert.ok(conf2 > conf1);
        });
        it('should be bounded 0-100', () => {
            // Build 100 entries with the same "iteration:N" marker the other
            // tests in this file use. The bare word "iteration" (no colon)
            // would not match that marker, so the upper bound might never be
            // stressed.
            // NOTE(review): presumably calculateConfidence keys off "iteration:" - confirm.
            const z = Array.from({ length: 100 }, (_, i) => `iteration:${i + 1}`).join('\n');
            const conf = calculateConfidence('Task', 'Answer', z);
            assert.ok(conf >= 0);
            assert.ok(conf <= 100);
        });
    });
});
|
|
334
|
+
// ============================================================================
|
|
335
|
+
// ITERATION TESTS
|
|
336
|
+
// ============================================================================
|
|
337
|
+
// Suite: runIteration - one z-refinement + y-refinement + halt-check cycle.
describe('Single Iteration', () => {
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    describe('runIteration', () => {
        // Fresh state literal per test; overrides replace the defaults.
        const makeState = (overrides = {}) => ({
            x: 'Task',
            y: 'Code',
            z: '',
            iteration: 0,
            halted: false,
            haltReason: null,
            history: [],
            ...overrides,
        });
        // Fresh options object per call so no test can observe another's changes.
        const mockPipeline = () => ({
            reasoningRefiner: mockReasoningRefiner,
            answerRefiner: mockAnswerRefiner,
            haltChecker: mockHaltChecker,
        });
        it('should perform one complete cycle: z refinement → y refinement', () => {
            const state = makeState({ x: 'Build todo app', y: 'Empty' });
            const result = runIteration(state, mockPipeline());
            assert.strictEqual(result.state.iteration, 1);
            assert.ok(result.state.z.includes('iteration:1'));
            assert.ok(result.state.y.includes('improved'));
        });
        it('should record iteration in history', () => {
            const result = runIteration(makeState(), mockPipeline());
            assert.strictEqual(result.state.history.length, 1);
            assert.strictEqual(result.state.history[0].iteration, 1);
        });
        it('should check halt after refinement', () => {
            const state = makeState({ y: 'Answer [improved x1] [improved x2] [improved x3]' });
            const result = runIteration(state, mockPipeline());
            // After one more improvement, should have 4 and halt
            assert.strictEqual(result.state.halted, true);
        });
        it('should not mutate original state', () => {
            const state = makeState({ z: 'Reasoning', iteration: 5 });
            // Deep snapshot taken before the call, compared afterwards.
            const snapshot = structuredClone(state);
            runIteration(state, mockPipeline());
            assert.strictEqual(state.z, snapshot.z);
            assert.strictEqual(state.iteration, 5);
            // Compare every field, including the nested history array: an
            // in-place history.push() or a changed y/halted/haltReason is a
            // mutation too.
            assert.deepStrictEqual(state, snapshot);
        });
    });
});
|
|
421
|
+
// ============================================================================
|
|
422
|
+
// FULL SESSION TESTS
|
|
423
|
+
// ============================================================================
|
|
424
|
+
// Suite: runSession - looping runIteration until a halt decision or the
// iteration cap.
describe('Full Session Run', () => {
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    describe('runSession', () => {
        // Halt checker that always asks for more work, so only maxIterations can stop the run.
        const keepGoing = () => ({ shouldHalt: false, confidence: 0, reason: '' });
        it('should run until halt condition met', () => {
            const session = createSession({
                x: 'Build todo app',
                projectPath: testDir,
                maxIterations: 16,
            });
            const result = runSession(session, {
                reasoningRefiner: mockReasoningRefiner,
                answerRefiner: mockAnswerRefiner,
                haltChecker: mockHaltChecker,
            });
            assert.strictEqual(result.state.halted, true);
            assert.ok(result.state.iteration >= 3); // Needs 3+ improvements to halt
            // Strictly below the cap: reaching 16 would mean the checker never
            // fired and only the iteration limit stopped the run.
            assert.ok(result.state.iteration < 16);
        });
        it('should stop at maxIterations if no halt', () => {
            const neverHaltChecker = () => ({
                shouldHalt: false,
                confidence: 0,
                reason: 'Never halt',
            });
            const session = createSession({
                x: 'Impossible task',
                projectPath: testDir,
                maxIterations: 5,
            });
            const result = runSession(session, {
                reasoningRefiner: mockReasoningRefiner,
                answerRefiner: mockAnswerRefiner,
                haltChecker: neverHaltChecker,
            });
            assert.strictEqual(result.state.iteration, 5);
            assert.strictEqual(result.state.halted, true);
            assert.ok(result.state.haltReason?.includes('max'));
        });
        it('should accumulate z across all iterations', () => {
            const session = createSession({
                x: 'Task',
                projectPath: testDir,
                maxIterations: 5,
            });
            const result = runSession(session, {
                reasoningRefiner: mockReasoningRefiner,
                answerRefiner: mockAnswerRefiner,
                haltChecker: keepGoing,
            });
            // Should have 5 iterations recorded in z
            const iterations = (result.state.z.match(/iteration:/g) || []).length;
            assert.strictEqual(iterations, 5);
        });
        it('should preserve full history', () => {
            const session = createSession({
                x: 'Task',
                projectPath: testDir,
                maxIterations: 4,
            });
            const result = runSession(session, {
                reasoningRefiner: mockReasoningRefiner,
                answerRefiner: mockAnswerRefiner,
                haltChecker: keepGoing,
            });
            assert.strictEqual(result.state.history.length, 4);
        });
        it('should report total duration', () => {
            const session = createSession({
                x: 'Task',
                projectPath: testDir,
                maxIterations: 3,
            });
            const result = runSession(session, {
                reasoningRefiner: mockReasoningRefiner,
                answerRefiner: mockAnswerRefiner,
                haltChecker: keepGoing,
            });
            assert.ok(result.totalDuration >= 0);
        });
        it('should handle early halt efficiently', () => {
            const immediateHalt = () => ({
                shouldHalt: true,
                confidence: 100,
                reason: 'Already perfect',
            });
            const session = createSession({
                x: 'Easy task',
                y: 'Perfect answer',
                projectPath: testDir,
                maxIterations: 16,
            });
            const result = runSession(session, {
                reasoningRefiner: mockReasoningRefiner,
                answerRefiner: mockAnswerRefiner,
                haltChecker: immediateHalt,
            });
            assert.strictEqual(result.state.iteration, 1); // Only one iteration
        });
    });
});
|
|
529
|
+
// ============================================================================
|
|
530
|
+
// DEEP SUPERVISION TESTS (TRM-specific)
|
|
531
|
+
// ============================================================================
|
|
532
|
+
// Suite: TRM-style behaviours - reasoning handed unchanged from one step to
// the next, and several z refinements per y refinement.
describe('Deep Supervision (TRM Pattern)', () => {
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    it('should detach reasoning between supervision steps (no gradient explosion analog)', () => {
        // In TRM, z.detach() prevents gradient explosion
        // Our analog: reasoning from previous iteration is "frozen" - we don't re-analyze it
        const session = createSession({
            x: 'Complex task',
            projectPath: testDir,
            maxIterations: 3,
        });
        // Record what each call receives and assert AFTER the run. The
        // "should handle refiner throwing error" test in this file expects
        // runSession to absorb refiner exceptions, so an assertion thrown
        // inside the refiner would presumably be swallowed and this test
        // would pass without checking anything.
        const handoffs = [];
        let lastZ = '';
        const trackingRefiner = (x, y, z) => {
            if (lastZ !== '') {
                handoffs.push({ incoming: z, expectedTail: lastZ.slice(-50) });
            }
            const newZ = mockReasoningRefiner(x, y, z);
            lastZ = newZ;
            return newZ;
        };
        runSession(session, {
            reasoningRefiner: trackingRefiner,
            answerRefiner: mockAnswerRefiner,
            haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
        });
        // Without at least one hand-off the loop below would check nothing.
        assert.ok(handoffs.length > 0, 'refiner should run more than once so a hand-off is observed');
        for (const { incoming, expectedTail } of handoffs) {
            // The incoming z must still end with what the previous step produced.
            assert.ok(incoming.includes(expectedTail), 'incoming z should preserve the tail of the previous z');
        }
    });
    it('should support T recursions per supervision step (like TRM n param)', () => {
        // TRM does T=3 latent recursions before updating answer
        // We simulate by doing multiple z refinements per y update
        const session = createSession({
            x: 'Task',
            projectPath: testDir,
            maxIterations: 2,
        });
        let zRefinements = 0;
        let yRefinements = 0;
        const countingZRefiner = (x, y, z) => {
            zRefinements++;
            return mockReasoningRefiner(x, y, z);
        };
        const countingYRefiner = (y, z) => {
            yRefinements++;
            return mockAnswerRefiner(y, z);
        };
        runSession(session, {
            reasoningRefiner: countingZRefiner,
            answerRefiner: countingYRefiner,
            haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
            latentRecursions: 3, // T=3 like TRM
        });
        // With 2 iterations and T=3, should have 6 z refinements and 2 y refinements
        assert.strictEqual(zRefinements, 6);
        assert.strictEqual(yRefinements, 2);
    });
});
|
|
592
|
+
// ============================================================================
|
|
593
|
+
// PERSISTENCE TESTS
|
|
594
|
+
// ============================================================================
|
|
595
|
+
// Suite: state written under <projectPath>/.midas and picked up again via
// the `resume` option.
describe('Session Persistence', () => {
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    it('should save session state to disk', () => {
        const session = createSession({ x: 'Build todo app', projectPath: testDir });
        runSession(session, {
            reasoningRefiner: mockReasoningRefiner,
            answerRefiner: mockAnswerRefiner,
            haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
        });
        // The run is expected to leave a JSON state file behind.
        const persistedFile = join(testDir, '.midas', 'recursive-session.json');
        assert.ok(existsSync(persistedFile));
        const persisted = JSON.parse(readFileSync(persistedFile, 'utf-8'));
        assert.strictEqual(persisted.x, 'Build todo app');
        assert.ok(persisted.iteration > 0);
    });
    it('should resume session from disk', () => {
        // First run: three iterations with a checker that never halts.
        const initial = createSession({
            x: 'Build todo app',
            projectPath: testDir,
            maxIterations: 3,
        });
        runSession(initial, {
            reasoningRefiner: mockReasoningRefiner,
            answerRefiner: mockAnswerRefiner,
            haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
        });
        // Second session over the same project path, asking to resume.
        const { state } = createSession({
            x: 'Build todo app',
            projectPath: testDir,
            maxIterations: 6,
            resume: true,
        });
        // Picks up at the iteration count the first run stopped on.
        assert.strictEqual(state.iteration, 3);
    });
    it('should handle missing session file gracefully', () => {
        // resume is requested, but nothing was ever written to this directory.
        const { state } = createSession({
            x: 'New task',
            projectPath: testDir,
            resume: true,
        });
        // Falls back to a fresh start.
        assert.strictEqual(state.iteration, 0);
    });
});
|
|
649
|
+
// ============================================================================
|
|
650
|
+
// EDGE CASE TESTS
|
|
651
|
+
// ============================================================================
|
|
652
|
+
describe('Edge Cases', () => {
|
|
653
|
+
beforeEach(() => {
|
|
654
|
+
testDir = createTestDir();
|
|
655
|
+
});
|
|
656
|
+
afterEach(() => {
|
|
657
|
+
cleanup(testDir);
|
|
658
|
+
});
|
|
659
|
+
it('should handle empty x (requirements)', () => {
|
|
660
|
+
const session = createSession({
|
|
661
|
+
x: '',
|
|
662
|
+
projectPath: testDir,
|
|
663
|
+
});
|
|
664
|
+
// Should still work, just with empty context
|
|
665
|
+
const result = runSession(session, {
|
|
666
|
+
reasoningRefiner: mockReasoningRefiner,
|
|
667
|
+
answerRefiner: mockAnswerRefiner,
|
|
668
|
+
haltChecker: () => ({ shouldHalt: true, confidence: 100, reason: 'Done' }),
|
|
669
|
+
});
|
|
670
|
+
assert.strictEqual(result.state.halted, true);
|
|
671
|
+
});
|
|
672
|
+
it('should handle refiner throwing error', () => {
|
|
673
|
+
const throwingRefiner = () => {
|
|
674
|
+
throw new Error('Refiner crashed');
|
|
675
|
+
};
|
|
676
|
+
const session = createSession({
|
|
677
|
+
x: 'Task',
|
|
678
|
+
projectPath: testDir,
|
|
679
|
+
});
|
|
680
|
+
// Should not crash, should handle gracefully
|
|
681
|
+
const result = runSession(session, {
|
|
682
|
+
reasoningRefiner: throwingRefiner,
|
|
683
|
+
answerRefiner: mockAnswerRefiner,
|
|
684
|
+
haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
|
|
685
|
+
});
|
|
686
|
+
assert.ok(result.state.halted);
|
|
687
|
+
assert.ok(result.state.haltReason?.includes('error'));
|
|
688
|
+
});
|
|
689
|
+
it('should handle very long x (requirements)', () => {
|
|
690
|
+
const longX = 'x'.repeat(100000);
|
|
691
|
+
const session = createSession({
|
|
692
|
+
x: longX,
|
|
693
|
+
projectPath: testDir,
|
|
694
|
+
maxIterations: 1,
|
|
695
|
+
});
|
|
696
|
+
const result = runSession(session, {
|
|
697
|
+
reasoningRefiner: mockReasoningRefiner,
|
|
698
|
+
answerRefiner: mockAnswerRefiner,
|
|
699
|
+
haltChecker: () => ({ shouldHalt: true, confidence: 100, reason: 'Done' }),
|
|
700
|
+
});
|
|
701
|
+
assert.strictEqual(result.state.halted, true);
|
|
702
|
+
});
|
|
703
|
+
it('should handle concurrent session access', () => {
    // Two independent sessions are created against the same project directory.
    const session1 = createSession({ x: 'Task 1', projectPath: testDir });
    const session2 = createSession({ x: 'Task 2', projectPath: testDir });
    // The results of runSession are read synchronously in these tests, so the two runs
    // execute back to back. Nothing is awaited, hence the callback is not `async`.
    const haltImmediately = () => ({ shouldHalt: true, confidence: 100, reason: 'Done' });
    // Different sessions should not interfere
    const result1 = runSession(session1, {
        reasoningRefiner: mockReasoningRefiner,
        answerRefiner: mockAnswerRefiner,
        haltChecker: haltImmediately,
    });
    const result2 = runSession(session2, {
        reasoningRefiner: mockReasoningRefiner,
        answerRefiner: mockAnswerRefiner,
        haltChecker: haltImmediately,
    });
    // Each result must still carry the requirements of its own session.
    assert.strictEqual(result1.state.x, 'Task 1');
    assert.strictEqual(result2.state.x, 'Task 2');
});
|
|
720
|
+
it('should handle zero maxIterations', () => {
    // With an iteration budget of zero the session should halt without iterating.
    const zeroBudgetSession = createSession({ x: 'Task', projectPath: testDir, maxIterations: 0 });
    const hooks = {
        reasoningRefiner: mockReasoningRefiner,
        answerRefiner: mockAnswerRefiner,
        haltChecker: mockHaltChecker,
    };
    const { state } = runSession(zeroBudgetSession, hooks);
    assert.strictEqual(state.iteration, 0);
    assert.strictEqual(state.halted, true);
});
|
|
734
|
+
});
|
|
735
|
+
// ============================================================================
|
|
736
|
+
// PROPERTY-BASED TESTS
|
|
737
|
+
// ============================================================================
|
|
738
|
+
describe('Property-Based Invariants', () => {
    // Each test gets a fresh directory from createTestDir(), passed to cleanup() afterwards.
    beforeEach(() => {
        testDir = createTestDir();
    });
    afterEach(() => {
        cleanup(testDir);
    });
    it('iteration count should equal history length', () => {
        // Check the invariant for every iteration budget from 1 to 10 with a checker that never halts.
        for (let max = 1; max <= 10; max++) {
            const session = createSession({
                x: 'Task',
                projectPath: testDir,
                maxIterations: max,
            });
            const result = runSession(session, {
                reasoningRefiner: mockReasoningRefiner,
                answerRefiner: mockAnswerRefiner,
                haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
            });
            assert.strictEqual(result.state.iteration, result.state.history.length);
        }
    });
    it('z should only grow (accumulate), never shrink', () => {
        const session = createSession({
            x: 'Task',
            projectPath: testDir,
            maxIterations: 5,
        });
        // The refiner only records lengths; the assertions run after runSession returns.
        // The 'should handle refiner throwing error' test expects runSession to absorb
        // exceptions thrown by a refiner and halt gracefully, so an assertion raised
        // inside the refiner would be swallowed and could never fail this test.
        const samples = [];
        const recordingRefiner = (x, y, z) => {
            // Capture the incoming length before delegating, in case the mock touches z.
            const received = z.length;
            const newZ = mockReasoningRefiner(x, y, z);
            samples.push({ received, produced: newZ.length });
            return newZ;
        };
        runSession(session, {
            reasoningRefiner: recordingRefiner,
            answerRefiner: mockAnswerRefiner,
            haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
        });
        // Guard against the property being checked on zero observations.
        assert.ok(samples.length > 0, 'reasoningRefiner was never invoked');
        for (let i = 1; i < samples.length; i++) {
            const previous = samples[i - 1].produced;
            // As before, a previous length of 0 gives nothing to compare against.
            if (previous > 0) {
                // Allow small variance from truncation
                assert.ok(samples[i].received >= previous - 100, `z shrank from ${previous} to ${samples[i].received} on call ${i + 1}`);
            }
        }
    });
    it('confidence should generally increase over iterations', () => {
        const session = createSession({
            x: 'Task',
            projectPath: testDir,
            maxIterations: 5,
        });
        const result = runSession(session, {
            reasoningRefiner: mockReasoningRefiner,
            answerRefiner: mockAnswerRefiner,
            haltChecker: () => ({ shouldHalt: false, confidence: 0, reason: '' }),
        });
        // Check that confidence generally trends up
        // NOTE(review): every() is vacuously true for an empty history, and the halt checker
        // here always reports confidence 0 - confirm the history confidences are not simply
        // that constant, otherwise this property is trivially satisfied.
        const confidences = result.state.history.map((h) => h.confidence);
        const isGenerallyIncreasing = confidences.every((c, i) => i === 0 || c >= confidences[i - 1] - 10);
        assert.ok(isGenerallyIncreasing, `Confidences should trend up: ${confidences}`);
    });
});
|
|
799
|
+
//# sourceMappingURL=recursive-session.test.js.map
|