npm - @kognai/orchestrator-core - Versions diffs - 0.2.8 → 0.2.10 - Mend

@kognai/orchestrator-core 0.2.8 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/lib/engine-coding-agent.js +32 -12
package/dist/lib/engine-orchestrator.js +12 -1
package/dist/lib/engine-primitives.js +62 -1
package/package.json +1 -1

package/dist/lib/engine-coding-agent.js CHANGED

@@ -49,7 +49,11 @@ class CodingAgent {
     constructor(name, systemPrompt) { this.name = name; this.systemPrompt = systemPrompt; }
     async execute(task, previousReview) {
         (0, orchestrate_engine_1.log)(orchestrate_engine_1.c.cyan, `\n[${this.name}] Executing: ${task.id} (${task.priority})`);
-        const deliverables = [...(task.deliverables.code || []), ...(task.deliverables.tests || []), ...(task.deliverables.docs || [])];
+        // TICKET-353: TEST-FIRST ordering. Generate paired tests BEFORE their code so
+        // the test is authored from the SPEC (contract-first), frozen, then the code
+        // is written to pass it — instead of one model writing code+test together and
+        // letting both agree on the same wrong understanding (#2 separate author).
+        const deliverables = [...(task.deliverables.tests || []), ...(task.deliverables.code || []), ...(task.deliverables.docs || [])];
         // Complexity-aware model routing:
         // Claude Sonnet → complex tasks (many files, complex keywords, large existing files)
         // MiniMax M2.5  → simple tasks (small edits, config, stubs) + truncation retry as safety net
@@ -233,15 +237,31 @@ class CodingAgent {
             const existingContent = (0, fs_1.existsSync)(filepath)
                 ? `\n\n## EXISTING FILE — SURGICAL EDIT ONLY\nDo NOT rewrite the entire file. Output the COMPLETE updated file with your changes merged in.\nIf you add a function, append it. If you edit a line, change only that line.\nFile has ${existingLines} lines — preserve ALL existing code.\n\n### Current Content\n\`\`\`typescript\n${(0, fs_1.readFileSync)(filepath, 'utf-8').substring(0, 3000)}\n\`\`\`\n`
                 : `\n\n## Note: This is a NEW file — create it from scratch.\n`;
+            // TICKET-353 (#1 examples + #2 separate author): for a test file, you ARE
+            // the test author and this test DEFINES correctness. The implementation is
+            // written separately to PASS this test and cannot change it.
             const testConstraint = isTestFile
-                ? `\n\n## CRITICAL: TEST FILE SIZE LIMIT
-This is a test file. You MUST keep it SHORT to avoid truncation:
-- Maximum 5-6 test cases (describe + it blocks)
-- Maximum 80 lines total
-- NO verbose setup — use inline mocks
-- NO redundant tests — one test per behavior
-- Cover: happy path, error case, edge case, defaults — that's it
-- If you write more than 80 lines, the file WILL be truncated and REJECTED\n`
+                ? `\n\n## YOU ARE THE TEST AUTHOR — this test DEFINES correctness
+The implementation file this pairs with will be written SEPARATELY to PASS this test and CANNOT modify it. So this test is the CONTRACT.
+- Derive 3–5 CONCRETE golden input→output examples from the spec/description above and encode them as assertions with EXACT expected values (do the arithmetic yourself; don't assert on whatever the code returns).
+- Use Node's built-in runner: \`import { test } from 'node:test'\` and \`import assert from 'node:assert/strict'\`.
+- Import the real symbols from the implementation file (correct relative path) and exercise ACTUAL behavior: happy path, an error/edge case, and any invariant ("never exceeds capacity").
+- It WILL be executed; assertions must reflect the SPEC's truth, not a guess. Keep it ≤80 lines, no verbose setup.\n`
+                : '';
+            // TICKET-353 (#2): author tests with a stronger model than the coder
+            // default; code files use the assessed/escalated coder model.
+            const TEST_AUTHOR_MODEL = process.env.KOGNAI_TEST_AUTHOR_MODEL || 'anthropic/claude-sonnet-4.6';
+            const fileProvider = isTestFile ? 'clawrouter' : provider;
+            const fileModel = isTestFile ? TEST_AUTHOR_MODEL : model;
+            if (isTestFile)
+                (0, orchestrate_engine_1.log)(orchestrate_engine_1.c.magenta, `  ✎ [TEST-AUTHOR] ${filepath} via ${fileModel} (contract-first, examples)`);
+            // For a CODE file, inject its already-authored paired test (tests-first
+            // ordering placed it in createdFiles) as the frozen contract to satisfy.
+            const pairedTest = !isTestFile
+                ? createdFiles.find(f => f.path === filepath.replace(/\.([tj]sx?)$/, '.test.$1'))
+                : undefined;
+            const contractBlock = pairedTest
+                ? `\n\n## CONTRACT — your code MUST make this test pass (already written; DO NOT modify it; it defines correctness):\n\`\`\`typescript\n${pairedTest.content.substring(0, 3000)}\n\`\`\`\n`
                 : '';
             // EXACT CONTENT mode: task description contains code block(s) with the exact file content.
             // Extract them deterministically and bypass LLM to prevent model hallucination.
@@ -270,7 +290,7 @@ ${task.context}
 ${fileList}
 ## Generate ONLY: ${filepath}
-${existingContent}${priorCtx}${testConstraint}
+${existingContent}${priorCtx}${testConstraint}${contractBlock}
 Write ONLY the content for "${filepath}". Rules:
 - S64-001: Output the raw file content using FILE: format as described in the system prompt
 - Do NOT wrap output in markdown code fences (\`\`\`) — for .md files especially, output RAW markdown text, NOT inside a \`\`\`markdown or \`\`\`typescript block
@@ -281,7 +301,7 @@ Write ONLY the content for "${filepath}". Rules:
 - No explanatory text — output file content only`;
             try {
                 const startTime = Date.now();
-                const response = await (0, orchestrate_engine_1.callLLM)(provider, model, this.systemPrompt, userPrompt, 480000, this.name, task.id); // 8 min — qwen3:14b needs time for large files
+                const response = await (0, orchestrate_engine_1.callLLM)(fileProvider, fileModel, this.systemPrompt, userPrompt, 480000, this.name, task.id); // 8 min — qwen3:14b needs time for large files
                 const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
                 let content = response.choices?.[0]?.message?.content || '';
                 // Strip MiniMax <think>...</think> tags that leak into responses
@@ -358,7 +378,7 @@ ${fileContent.substring(fileContent.length - 1500)}
 Continue EXACTLY from where it left off and output ONLY the remaining code (no duplicated content, no preamble, no markdown fences). End the file with its final closing brace.`;
                     let grew = false;
                     try {
-                        const contResponse = await (0, orchestrate_engine_1.callLLM)(provider, model, this.systemPrompt, continuationPrompt, 120000, this.name, `${task.id}_continuation_${chunk}`);
+                        const contResponse = await (0, orchestrate_engine_1.callLLM)(fileProvider, fileModel, this.systemPrompt, continuationPrompt, 120000, this.name, `${task.id}_continuation_${chunk}`);
                         let contContent = contResponse.choices?.[0]?.message?.content || '';
                         contContent = contContent.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
                         const contBlocks = this.extractCodeBlocks(contContent);

package/dist/lib/engine-orchestrator.js CHANGED

@@ -795,8 +795,19 @@ ONLY output the JSON array. No markdown, no explanation.`;
                     (0, engine_primitives_1.safeResetLastCommit)(task.id, task.agent, task.type, '  ');
                     (0, code_failure_logger_1.logCodeFailure)({ taskId: task.id, sprintId: (0, orchestrate_engine_1.resolveActiveSprintId)(), agentId: (0, orchestrate_engine_1.resolveAgentDid)(task.agent), attemptNum: attempt, score: 0, model: taskRun.model_used || result?.model || task.model || 'unknown', rejectionReason: qaResult.reason, issues: [], failType: 'qa_gate' });
                     monotask_state_machine_1.MonotaskSM.release(task.agent, task.id, `QA gate: ${qaResult.reason}`);
+                    // TICKET-352: feed the QA-gate failure to the coder on retry. Previously
+                    // the QA reason was only logged, so the coder repaired BLIND (lastReview
+                    // stayed stale/undefined). A red paired-test's output (expected X, got Y)
+                    // is the most actionable repair signal there is — surface it as the review.
+                    lastReview = {
+                        verdict: 'REJECTED',
+                        score: 0,
+                        summary: `QA gate failed: ${qaResult.reason}`,
+                        issues: [{ severity: 'critical', file: (task.deliverables.code?.[0] || task.id), description: qaResult.reason }],
+                        strengths: [],
+                    };
                     if (attempt < MAX_RETRIES) {
-                        (0, engine_primitives_1.log)(engine_primitives_1.c.yellow, '  QA gate failed — retrying without supervisor...');
+                        (0, engine_primitives_1.log)(engine_primitives_1.c.yellow, '  QA gate failed — retrying with the failure as feedback...');
                         continue;
                     }
                     taskRun.status = 'rejected';

package/dist/lib/engine-primitives.js CHANGED

@@ -46,6 +46,16 @@ exports.callAnthropicCached = callAnthropicCached;
 exports.compressContext = compressContext;
 exports.localQAGate = localQAGate;
 exports.httpPost = httpPost;
+/**
+ * engine-primitives.ts — shared low-level primitives extracted from orchestrate-engine.ts
+ * (TICKET-231 engine split 4). Token accounting, AgentTask/ReviewResult/CTO* types,
+ * the LLM gateway (callLLM/callAnthropicCached/compressContext), credit/budget alerts,
+ * provider classification, and the QA/debug helpers (localQAGate/typecheckChangedFiles/
+ * tieredDebug/httpPost). This is a LEAF module — it imports only sibling leaf modules,
+ * never back from orchestrate-engine, so the prior split modules can depend on it without
+ * a cycle (orchestrate-engine re-exports the primitives they import for back-compat).
+ */
+const fs_1 = require("fs");
 const child_process_1 = require("child_process");
 const https = __importStar(require("https"));
 const http = __importStar(require("http"));
@@ -544,6 +554,50 @@ function detectRumination(content) {
     const wordCount = Math.max(1, content.split(/\s+/).filter(Boolean).length);
     return { hits, ratio: hits / wordCount };
 }
+// TICKET-352: green-test approval gate — makes QA correctness objective by
+// EXECUTING any paired test file in the generated set and failing QA if it is
+// red; localQAGate returns the failure reason (test output tail) as its own.
+// @param fileContents  array of generated files; only `path` is read here.
+// @returns { pass, reason } — pass:true when the gate is disabled, when no
+//   *.test.* / *.spec.* file exists on disk (back-compat no-op), or when every
+//   test runs clean; pass:false on the FIRST red or timed-out test.
+// Disable with KOGNAI_TEST_GATE=0. NOTE: this executes model-generated test code
+// in-repo, bounded by a timeout — same trust surface as the swarm already
+// committing generated code.
+const TEST_GATE_ENABLED = (process.env.KOGNAI_TEST_GATE ?? '1') !== '0'; // enabled unless the env var is exactly '0'
+const TEST_RUN_TIMEOUT_MS = parseInt(process.env.KOGNAI_TEST_TIMEOUT_MS ?? '90000', 10); // NOTE(review): a non-numeric value parses to NaN — confirm how execSync treats that timeout
+function runPairedTests(fileContents) {
+    if (!TEST_GATE_ENABLED)
+        return { pass: true, reason: 'test gate disabled (KOGNAI_TEST_GATE=0)' };
+    const testFiles = fileContents.filter(f => /\.(test|spec)\.[tj]sx?$/.test(f.path) && (0, fs_1.existsSync)(f.path)); // keep only test/spec paths that exist on disk
+    if (testFiles.length === 0)
+        return { pass: true, reason: 'no paired tests' };
+    for (const tf of testFiles) { // run one at a time; the first failure returns immediately
+        const isTs = /\.tsx?$/.test(tf.path);
+        // node:test auto-runs on execution and sets a non-zero exit code if any test
+        // fails → execSync throws. TS runs via ts-node transpile-only (the typecheck
+        // gate already validated types); JS via `node --test`.
+        const cmd = isTs // NOTE(review): JSON.stringify is not shell escaping ($ and backticks still expand in sh) — confirm paths are trusted
+            ? `npx ts-node --transpile-only ${JSON.stringify(tf.path)}`
+            : `node --test ${JSON.stringify(tf.path)}`;
+        try {
+            (0, child_process_1.execSync)(cmd, {
+                cwd: process.cwd(),
+                timeout: TEST_RUN_TIMEOUT_MS,
+                stdio: 'pipe', // capture child output so the catch block can read e.stdout / e.stderr
+                env: { ...process.env, TS_NODE_TRANSPILE_ONLY: 'true' },
+            });
+        }
+        catch (e) { // reached when execSync throws (non-zero exit or timeout)
+            if (e?.signal === 'SIGTERM' || /ETIMEDOUT|timed out/i.test(String(e?.message || ''))) { // timeout heuristic: SIGTERM kill or a timeout-looking message
+                return { pass: false, reason: `Paired test TIMED OUT (${TEST_RUN_TIMEOUT_MS}ms): ${tf.path} — likely an infinite loop/hang in the code under test.` };
+            }
+            const out = `${e?.stdout?.toString?.() || ''}\n${e?.stderr?.toString?.() || ''}`.trim();
+            return { pass: false, reason: `Paired test FAILED: ${tf.path}\n--- test output (tail) ---\n${out.slice(-1500)}` };
+        }
+    }
+    return { pass: true, reason: `${testFiles.length} paired test(s) green` };
+}
 async function localQAGate(_task, fileContents) {
     // Fail only on structurally empty files (< 50 chars indicates the model returned nothing useful)
     const emptyFiles = fileContents.filter(f => (f.content || '').trim().length < 50);
@@ -584,7 +638,14 @@ async function localQAGate(_task, fileContents) {
             };
         }
     }
-    return { pass: true, reason: `${fileContents.length} file(s) non-empty + no rumination + typecheck PASS — proceeding to supervisor review` };
+    // TICKET-352: objective correctness gate — run any paired test file(s) and
+    // fail QA (with the test output as the reason → fed to the coder on retry) if
+    // red. No-op when the task generated no test files.
+    const testResult = runPairedTests(fileContents);
+    if (!testResult.pass) {
+        return { pass: false, reason: testResult.reason };
+    }
+    return { pass: true, reason: `${fileContents.length} file(s) non-empty + no rumination + typecheck PASS + ${testResult.reason} — proceeding to supervisor review` };
 }
 // TICKET-085 (v2 — TICKET-088 fix): project-aware typecheck. v1 used
 // loose-file mode + `npx -y typescript@5 tsc` and silently passed

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@kognai/orchestrator-core",
-  "version": "0.2.8",
+  "version": "0.2.10",
   "description": "Kognai sovereign orchestrator — core engine (template-agnostic). Shared by all products (Kognai/coding, Voxight/market-intel, Invoica/fin-compliance); each supplies only its template. Replaces per-repo forks of orchestrate-agents-v2 / sprint-runner / lib.",
   "license": "MIT",
   "author": "SkinGem",