npm - onbuzz - Versions diffs - 4.8.0 → 4.8.2 - Mend

onbuzz 4.8.0 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/package.json +1 -1
package/src/core/__tests__/agentPool.test.js +185 -0
package/src/core/__tests__/agentScheduler.nativePromptPick.test.js +319 -0
package/src/core/__tests__/agentScheduler.taskListInjection.test.js +94 -0
package/src/core/agentPool.js +319 -0
package/src/core/agentScheduler.js +216 -2
package/src/services/__tests__/conversationCompactionService.test.js +141 -0
package/src/services/__tests__/modelRouterNaming.test.js +41 -23
package/src/services/conversationCompactionService.js +120 -46
package/src/tools/__tests__/baseTool.test.js +171 -0
package/src/tools/__tests__/codeMapTool.test.js +179 -0
package/src/tools/__tests__/taskManagerTool.test.js +141 -0
package/src/tools/baseTool.js +89 -1
package/src/tools/openaiFunctionSchemas.js +14 -0
package/src/tools/skillsTool.js +282 -277
package/src/tools/taskManagerTool.js +72 -2
package/src/utilities/constants.js +19 -1

package/src/tools/__tests__/codeMapTool.test.js CHANGED Viewed

@@ -471,6 +471,185 @@ describe('CodeMapTool', () => {
     });
   });
+  // ─────────────────────────────────────────────────────────────────
+  // TypeScript / TSX coverage. The JS parser is the same parser used
+  // for .ts / .tsx / .mjs / .cjs (see _langOf); these tests pin which
+  // TypeScript-specific patterns the no-regex parser captures TODAY
+  // and which it MISSES, so a future tree-sitter migration (see the
+  // file-header comment in codeMapTool.js) has an explicit baseline
+  // to preserve / improve against.
+  // ─────────────────────────────────────────────────────────────────
+  describe('_parseJS — TypeScript / TSX coverage', () => {
+    const opts = { publicOnly: false, withComments: false, includeImports: false };
+    const sigs = (lines) =>
+      tool._parseJS(lines, opts).filter(e => e.kind === 'signature').map(e => e.text.trim());
+    // ── Captures we rely on (regressions here would break TS skeletons) ──
+    test('export interface — one-line', () => {
+      const out = sigs(['export interface User { id: string; name: string; }']);
+      expect(out.join('\n')).toMatch(/export interface User/);
+    });
+    test('export type alias', () => {
+      const out = sigs(['export type ID = string | number;']);
+      expect(out.join('\n')).toMatch(/export type ID/);
+    });
+    test('export enum', () => {
+      const out = sigs(['export enum Color { Red, Green, Blue }']);
+      expect(out.join('\n')).toMatch(/export enum Color/);
+    });
+    test('export interface multi-line opening', () => {
+      const out = sigs([
+        'export interface User {',
+        '  id: string;',
+        '  name: string;',
+        '}',
+      ]);
+      expect(out.join('\n')).toMatch(/export interface User \{/);
+    });
+    test('abstract class — declaration + abstract method', () => {
+      const out = sigs([
+        'abstract class Animal {',
+        '  abstract sound(): string;',
+        '}',
+      ]);
+      expect(out.join('\n')).toMatch(/abstract class Animal/);
+      expect(out.join('\n')).toMatch(/abstract sound\(\): string/);
+    });
+    test('generic function: identity<T>(x: T): T', () => {
+      const out = sigs(['function identity<T>(x: T): T { return x; }']);
+      expect(out.join('\n')).toMatch(/function identity<T>\(x: T\): T/);
+    });
+    test('generic class: Container<T>', () => {
+      const out = sigs([
+        'class Container<T> {',
+        '  value: T;',
+        '}',
+      ]);
+      expect(out.join('\n')).toMatch(/class Container<T>/);
+    });
+    test('class method with TS return type annotation', () => {
+      const out = sigs([
+        'class C {',
+        '  foo(x: number): string { return String(x); }',
+        '}',
+      ]);
+      expect(out.join('\n')).toMatch(/foo\(x: number\): string/);
+    });
+    test('TSX function component: () => JSX.Element', () => {
+      const out = sigs([
+        'function App(): JSX.Element {',
+        '  return <div />;',
+        '}',
+      ]);
+      expect(out.join('\n')).toMatch(/function App\(\): JSX\.Element/);
+    });
+    test('ESM re-export: export { foo } from "./bar"', () => {
+      const out = sigs([`export { foo } from './bar';`]);
+      expect(out.join('\n')).toMatch(/export \{ foo \} from/);
+    });
+    test('ESM aliased re-export: export { foo as bar } from "./baz"', () => {
+      const out = sigs([`export { foo as bar } from './baz';`]);
+      expect(out.join('\n')).toMatch(/export \{ foo as bar \} from/);
+    });
+    test('decorator above class — class is still captured (decorator dropped is acceptable)', () => {
+      const out = sigs([
+        '@Component({ selector: "x" })',
+        'class Foo {}',
+      ]);
+      expect(out.join('\n')).toMatch(/class Foo/);
+    });
+    test('literal-union return type', () => {
+      const out = sigs([`function getKind(): "a" | "b" { return "a"; }`]);
+      expect(out.join('\n')).toMatch(/function getKind\(\):/);
+    });
+    test('export const generic arrow: <T>(x: T): T => x', () => {
+      const out = sigs(['export const fn = <T>(x: T): T => x;']);
+      expect(out.join('\n')).toMatch(/export const fn/);
+    });
+    // ── Language detection on all declared TS/TSX/MJS/CJS/JSX extensions ──
+    test('_langOf maps .ts / .tsx / .mjs / .cjs / .jsx → "js"', () => {
+      expect(tool._langOf('a.ts')).toBe('js');
+      expect(tool._langOf('a.tsx')).toBe('js');
+      expect(tool._langOf('a.mjs')).toBe('js');
+      expect(tool._langOf('a.cjs')).toBe('js');
+      expect(tool._langOf('a.jsx')).toBe('js');
+      // Case-insensitive on the extension.
+      expect(tool._langOf('a.TS')).toBe('js');
+    });
+    // ── Known gaps. These tests pin CURRENT (limited) behavior so an
+    //    improvement to the parser fails them — at which point you
+    //    update the assertion. Each gap is real and worth fixing in a
+    //    tree-sitter migration. ─────────────────────────────────────
+    describe('KNOWN GAPS — pin current limitations', () => {
+      test('GAP: bare `interface` (no `export`) is NOT captured', () => {
+        const out = sigs(['interface User { id: string; name: string; }']);
+        expect(out).toEqual([]);
+      });
+      test('GAP: bare `type` alias (no `export`) is NOT captured', () => {
+        const out = sigs(['type ID = string | number;']);
+        expect(out).toEqual([]);
+      });
+      test('GAP: bare `enum` (no `export`) is NOT captured', () => {
+        const out = sigs(['enum Color { Red, Green, Blue }']);
+        expect(out).toEqual([]);
+      });
+      test('GAP: bare multi-line `interface` (no `export`) is NOT captured', () => {
+        const out = sigs([
+          'interface User {',
+          '  id: string;',
+          '}',
+        ]);
+        expect(out).toEqual([]);
+      });
+      test('GAP: typed arrow component `const App: React.FC = () => <div />` is NOT captured', () => {
+        // The `: React.FC` annotation between the identifier and `=`
+        // breaks the parser's "ident = arrow" recognition. Common in
+        // older React+TS codebases.
+        const out = sigs(['const App: React.FC = () => <div />;']);
+        expect(out).toEqual([]);
+      });
+      test('GAP: async generator `async function* foo()` is NOT captured', () => {
+        const out = sigs([
+          'async function* stream(): AsyncIterableIterator<number> {',
+          '  yield 1;',
+          '}',
+        ]);
+        expect(out).toEqual([]);
+      });
+      test('GAP: destructured-arg arrow with type annotation is NOT captured', () => {
+        // `const greet = ({ name }: { name: string }): string => …`
+        // The destructured + typed parameter list trips the
+        // ident = arrow recognition.
+        const out = sigs([`const greet = ({ name }: { name: string }): string => \`hi \${name}\`;`]);
+        expect(out).toEqual([]);
+      });
+    });
+  });
   // ─────────────────────────────────────────────────────────────────
   // C / C++ — _parseC. Same approach as the JS path; we lock the
   // patterns the regex needs to handle on real-world C/CPP files so

package/src/tools/__tests__/taskManagerTool.test.js CHANGED Viewed

@@ -481,6 +481,147 @@ describe('TaskManagerTool', () => {
       expect(result.error).toContain('Invalid status');
     });
+    // ── Destructive-sync guardrail ───────────────────────────────────
+    // Real production failure: post-compaction, an agent forgot it had
+    // a 9-task plan and called sync with 4 unrelated tasks → all 9
+    // were silently dropped. These tests pin the guardrail.
+    describe('REGRESSION: destructive-sync guardrail', () => {
+      // Helper: seed the agent with one in_progress + two pending tasks
+      // and return { tool, context }; each test then runs its own follow-up sync.
+      async function setupAgentWithPlan() {
+        const { tool, context } = createTestSetup();
+        await tool.execute({
+          action: 'sync',
+          tasks: [
+            { title: 'Char select bg', status: 'in_progress', priority: 'high' },
+            { title: 'Board art',      status: 'pending',     priority: 'high' },
+            { title: 'Dice animation', status: 'pending',     priority: 'medium' },
+          ],
+        }, context);
+        return { tool, context };
+      }
+      test('REFUSES sync that would drop pending/in_progress tasks without confirmReplace', async () => {
+        const { tool, context } = await setupAgentWithPlan();
+        const result = await tool.execute({
+          action: 'sync',
+          tasks: [
+            { title: 'Add Settings UI', status: 'pending', priority: 'high' },
+            { title: 'Add Settings logic', status: 'pending', priority: 'high' },
+          ],
+        }, context);
+        expect(result.success).toBe(false);
+        expect(result.error).toMatch(/Sync would drop 3 non-terminal task/);
+        // Must name the at-risk tasks so the agent can see them.
+        expect(result.error).toContain('Char select bg');
+        expect(result.error).toContain('Board art');
+        expect(result.error).toContain('Dice animation');
+        // Must explain the escape hatch.
+        expect(result.error).toContain('confirmReplace: true');
+      });
+      test('PROCEEDS when confirmReplace=true is explicitly set', async () => {
+        const { tool, context } = await setupAgentWithPlan();
+        const result = await tool.execute({
+          action: 'sync',
+          confirmReplace: true,
+          tasks: [
+            { title: 'Add Settings UI', status: 'pending', priority: 'high' },
+          ],
+        }, context);
+        expect(result.success).toBe(true);
+        expect(result.result.summary.total).toBe(1);
+        expect(result.result.summary.removed).toBe(3);
+      });
+      test('does NOT trigger when the incoming list keeps every open task (rename only)', async () => {
+        const { tool, context } = await setupAgentWithPlan();
+        // Only statuses change; every title is kept as-is (no task is actually renamed). No drops.
+        const result = await tool.execute({
+          action: 'sync',
+          tasks: [
+            { title: 'Char select bg', status: 'completed',   priority: 'high' },
+            { title: 'Board art',      status: 'in_progress', priority: 'high' },
+            { title: 'Dice animation', status: 'pending',     priority: 'medium' },
+          ],
+        }, context);
+        expect(result.success).toBe(true);
+        expect(result.result.summary.updated).toBe(3);
+        expect(result.result.summary.removed).toBe(0);
+      });
+      test('does NOT trigger when only completed/cancelled tasks would be dropped', async () => {
+        const { tool, context } = createTestSetup();
+        // Seed with one done task + one pending.
+        await tool.execute({
+          action: 'sync',
+          tasks: [
+            { title: 'Already done',   status: 'completed', priority: 'low' },
+            { title: 'Still going',    status: 'pending',   priority: 'high' },
+          ],
+        }, context);
+        // New sync drops the completed one but keeps the pending one.
+        const result = await tool.execute({
+          action: 'sync',
+          tasks: [{ title: 'Still going', status: 'in_progress', priority: 'high' }],
+        }, context);
+        // No guardrail trip — completed task is safe to drop.
+        expect(result.success).toBe(true);
+      });
+      test('error response includes droppedTasks metadata for programmatic recovery', async () => {
+        const { tool, context } = await setupAgentWithPlan();
+        const result = await tool.execute({
+          action: 'sync',
+          tasks: [{ title: 'New thing', status: 'pending', priority: 'high' }],
+        }, context);
+        expect(result.success).toBe(false);
+        // BaseTool's execute() catches the thrown Error; the message
+        // carries the human-readable hint. We assert on the hint
+        // contents only (no separate droppedTasks field is checked here) — that's what the agent sees.
+        expect(result.error).toMatch(/3 non-terminal task/);
+      });
+      test('REGRESSION: the exact Talisman failure scenario is now blocked', async () => {
+        // Reproduce the production failure: agent has a 9-task plan
+        // reflecting the user's UI revision request. Post-compaction,
+        // agent loses context and tries to sync a 4-task Settings plan.
+        const { tool, context } = createTestSetup();
+        await tool.execute({
+          action: 'sync',
+          tasks: [
+            { title: 'Explore current code and image assets', status: 'in_progress', priority: 'high' },
+            { title: 'Generate character select background',  status: 'pending',     priority: 'high' },
+            { title: 'Generate board space art for all nodes', status: 'pending',    priority: 'high' },
+            { title: 'Generate adventure card art',           status: 'pending',     priority: 'medium' },
+            { title: 'Fix board to rectangular layout',       status: 'pending',     priority: 'high' },
+            { title: 'Fix character select sticky buttons',   status: 'pending',     priority: 'medium' },
+            { title: 'Fix dice animation and combat',         status: 'pending',     priority: 'high' },
+            { title: 'Remove all emojis from UI',             status: 'pending',     priority: 'medium' },
+            { title: 'Test and verify all changes',           status: 'pending',     priority: 'medium' },
+          ],
+        }, context);
+        // The agent now (mistakenly) tries to sync a Settings plan.
+        const result = await tool.execute({
+          action: 'sync',
+          tasks: [
+            { title: 'Add Settings UI to index.html', status: 'pending', priority: 'high' },
+            { title: 'Add Settings logic to game.js', status: 'pending', priority: 'high' },
+            { title: 'Wire Settings to title screen', status: 'pending', priority: 'medium' },
+            { title: 'Test Settings persistence',     status: 'pending', priority: 'medium' },
+          ],
+        }, context);
+        expect(result.success).toBe(false);
+        // ALL 9 original tasks must be named so the agent sees them; spot-check four titles plus the count.
+        expect(result.error).toContain('Explore current code');
+        expect(result.error).toContain('Generate character select background');
+        expect(result.error).toContain('Fix dice animation and combat');
+        expect(result.error).toContain('Remove all emojis from UI');
+        expect(result.error).toMatch(/Sync would drop 9 non-terminal task/);
+      });
+    });
     test('enforces only one in_progress task', async () => {
       const { tool, agent, context } = createTestSetup();
       await tool.execute({

package/src/tools/baseTool.js CHANGED Viewed

@@ -15,6 +15,7 @@ import {
   ERROR_TYPES,
   SYSTEM_DEFAULTS
 } from '../utilities/constants.js';
+import { NATIVE_SCHEMA_TOOL_NAMES } from './openaiFunctionSchemas.js';
 class BaseTool {
   constructor(config = {}, logger = null) {
@@ -690,8 +691,20 @@ class ToolsRegistry {
       includeUsageGuidelines = true,
       includeSecurityNotes = true,
       compact = false,
-      layered = false
+      layered = false,
+      // 'responses' | 'chat_completion' | undefined.
+      // When 'responses', the target model uses native function-calling
+      // (Codex / o-series / gpt-5-pro). For tools that have a native
+      // schema in openaiFunctionSchemas.js, the structured schema sent
+      // in `tools:` IS the canonical source of truth for the model —
+      // so we skip baking the same information into the system prompt
+      // as text. This eliminates ~3K duplicated tokens per turn on the
+      // models that need it most. Defaults to undefined (= 'chat_completion'
+      // behaviour: include text descriptions). Old callers that don't
+      // pass this option get the previous behaviour verbatim — back-compat.
+      apiType = undefined,
     } = options;
+    const isNativeApi = apiType === 'responses';
     // Get tools to include — always inject 'help' so agents can query tool docs
     let toolIds = capabilities.length > 0
@@ -751,6 +764,19 @@ class ToolsRegistry {
         const tool = this.tools.get(toolId);
         if (!tool || !tool.isEnabled) continue;
+        // Skip text descriptions for tools that have a native function
+        // schema, when the target model uses the Responses API. The
+        // structured schema is the canonical source for these models.
+        // We DO still emit a one-line pointer so the agent isn't blind
+        // to the tool's existence (its capability list lives in the
+        // system prompt elsewhere too, but a single-line mention here
+        // costs ~10 tokens and is a useful breadcrumb).
+        if (isNativeApi && NATIVE_SCHEMA_TOOL_NAMES.has(toolId.toLowerCase())) {
+          const summary = this.toolSummaries.get(toolId) || `${toolId} tool`;
+          description += `- **${toolId}** — ${summary} (see structured schema)\n`;
+          continue;
+        }
         try {
           if (compact) {
             // Compact format - just tool name and brief description
@@ -795,6 +821,64 @@ class ToolsRegistry {
     description += '- **TOOL RESULTS ARE AVAILABLE ONLY AFTER YOUR MESSAGE ENDS**: Tools execute after your entire message is sent. You will NOT see any tool results until your next turn. This means: if the next tool call depends on results from a previous one, they MUST be in separate messages. You may batch independent tool calls in a single message, but never assume or guess the output of a tool — always wait for the actual result in the next turn before proceeding.\n\n';
     description += 'After invoking a tool, WAIT for the actual response. Do NOT generate imaginary responses.\n\n';
+    // ── Operating posture ────────────────────────────────────────────
+    // Cross-cutting habits agents should adopt VOLUNTARILY. The tool
+    // descriptions tell them WHAT each tool does; this block tells them
+    // WHEN to reach for them without being asked. Without this, agents
+    // tend to:
+    //   • skip the memory/skills check at the start of a new task,
+    //     re-discovering things the team already wrote down
+    //   • never create a plan/* memory, losing the thread across the
+    //     first compaction
+    //   • only invoke `help`/`skills` after a failure, not proactively
+    // Only emitted when the relevant tools are actually in the agent's
+    // capability set — no point teaching "check skills" to an agent
+    // that doesn't have the skills tool.
+    const hasMemory = toolIds.includes('memory');
+    const hasSkills = toolIds.includes('skills');
+    if (hasMemory || hasSkills) {
+      description += '## OPERATING POSTURE\n\n';
+      description += 'Treat these as habits, not optional extras. Use them proactively, before you need them.\n\n';
+      if (hasMemory || hasSkills) {
+        description += '**At the start of a new task or topic shift:**\n';
+        if (hasMemory) {
+          description += '- Run `memory` → `list` (titles only) to scan for relevant context the team or your past self stored. If a title looks relevant, `read` it before improvising.\n';
+        }
+        if (hasSkills) {
+          description += '- Run `skills` → `list` to see if a skill already encodes how to do this task. If yes, follow its checklist instead of inventing one.\n';
+        }
+        description += '\n';
+      }
+      if (hasMemory) {
+        // Concrete event-based write triggers. The previous version asked
+        // the agent to "save a memory when you recognize the work is
+        // multi-turn" — that's a judgment call, and in practice agents
+        // never made the call. Observed in production: 670 messages,
+        // 0 memory writes, despite the OPERATING POSTURE block being
+        // present in the system prompt. The fix is to replace "when
+        // you recognize" with concrete event triggers the model can
+        // pattern-match on without judgment.
+        description += '**WRITE memory on these events (not "when you think it\'s a good idea" — these are mandatory):**\n';
+        description += '- **A new user message contains a numbered list, a multi-bullet ask, OR more than ~30 words of substantive request.** Your VERY NEXT tool call must be `memory` → `add` with title `plan/<short-topic>` and content = the user\'s entire message verbatim. Do this BEFORE any other tool call, including taskmanager. Why mandatory: compaction may later destroy that message and the agent (you, next session) will not remember what the user actually asked for.\n';
+        description += '- **You are about to call `taskmanager` → `sync` with more than 3 tasks at once.** First, save a `memory` titled `plan/<feature>` containing the user\'s original request + your rationale for the plan. The task list is fragile (it can be wiped by a later sync); the plan/* memory is the source of truth.\n';
+        description += '- **You made a non-obvious decision** (chose approach A over B, fixed a tricky bug, discovered a constraint while exploring code, hit an unexpected error and figured out the cause). Save it as a `memory` with a non-`plan/` title before the next tool call. Format: title = the conclusion, content = the evidence. The next agent will not re-derive it.\n';
+        description += '- **The user gave you a preference or rule that should apply to all future work** ("never use emojis", "always use Tailwind", "the API base URL is X"). Save it immediately as a `memory`.\n\n';
+        description += '**`plan/*` memories auto-inject into your system prompt every turn under `## AGENT WORKING PLAN` — that\'s how they survive compaction. Update or delete them as the situation evolves; a stale plan is worse than no plan.**\n\n';
+      }
+      description += '**Distinction:**\n';
+      description += '- `memory` = persistent knowledge that survives sessions (why, constraints, durable facts, working plans).\n';
+      if (toolIds.includes('taskmanager')) {
+        description += '- `taskmanager` = step-by-step checkboxes for the CURRENT task (what to do next, in order).\n';
+      }
+      if (hasSkills) {
+        description += '- `skills` = reusable playbooks the team curated for recurring tasks.\n';
+      }
+      description += '\n';
+    }
     // Add exploration strategy if code-map is available
     if (toolIds.includes('code-map')) {
       description += '## CODE EXPLORATION STRATEGY\n\n';
@@ -817,6 +901,10 @@ class ToolsRegistry {
    * @returns {string} Enhanced system prompt
    */
   enhanceSystemPrompt(existingPrompt, capabilities = [], options = {}) {
+    // `options.apiType` ('responses' | 'chat_completion' | undefined)
+    // is forwarded to the description builder so native-API models get
+    // a trimmed prompt that doesn't duplicate the structured schemas.
+    // Old callers omit it and get pre-existing behaviour unchanged.
     const toolSection = this.generateToolDescriptionsForPrompt(capabilities, options);
     if (!toolSection.trim()) {

package/src/tools/openaiFunctionSchemas.js CHANGED Viewed

@@ -322,4 +322,18 @@ export function getToolSchemasForAgent(capabilities = []) {
   return OPENAI_FUNCTION_SCHEMAS.filter(s => allowed.has(s.name.toLowerCase()));
 }
+/**
+ * Names of every tool that has a native function schema in this file.
+ * Importable as a Set so other modules (notably baseTool's system-prompt
+ * builder) can decide "is the structured schema the canonical source of
+ * truth for this tool, or do we still need to bake a text description
+ * into the system prompt?". When a model uses the Responses API (which
+ * is RLHFed for native function-calling), the structured schema in
+ * `tools:` is the canonical source — emitting the text description as
+ * well doubles the same information in the context window.
+ */
+export const NATIVE_SCHEMA_TOOL_NAMES = new Set(
+  OPENAI_FUNCTION_SCHEMAS.map(s => s.name.toLowerCase())
+);
 export default OPENAI_FUNCTION_SCHEMAS;