onbuzz 4.8.0 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -471,6 +471,185 @@ describe('CodeMapTool', () => {
471
471
  });
472
472
  });
473
473
 
474
+ // ─────────────────────────────────────────────────────────────────
475
+ // TypeScript / TSX coverage. The JS parser is the same parser used
476
+ // for .ts / .tsx / .mjs / .cjs (see _langOf); these tests pin which
477
+ // TypeScript-specific patterns the no-regex parser captures TODAY
478
+ // and which it MISSES, so a future tree-sitter migration (see the
479
+ // file-header comment in codeMapTool.js) has an explicit baseline
480
+ // to preserve / improve against.
481
+ // ─────────────────────────────────────────────────────────────────
482
+ describe('_parseJS — TypeScript / TSX coverage', () => {
483
+ const opts = { publicOnly: false, withComments: false, includeImports: false };
484
+ const sigs = (lines) =>
485
+ tool._parseJS(lines, opts).filter(e => e.kind === 'signature').map(e => e.text.trim());
486
+
487
+ // ── Captures we rely on (regressions here would break TS skeletons) ──
488
+
489
+ test('export interface — one-line', () => {
490
+ const out = sigs(['export interface User { id: string; name: string; }']);
491
+ expect(out.join('\n')).toMatch(/export interface User/);
492
+ });
493
+
494
+ test('export type alias', () => {
495
+ const out = sigs(['export type ID = string | number;']);
496
+ expect(out.join('\n')).toMatch(/export type ID/);
497
+ });
498
+
499
+ test('export enum', () => {
500
+ const out = sigs(['export enum Color { Red, Green, Blue }']);
501
+ expect(out.join('\n')).toMatch(/export enum Color/);
502
+ });
503
+
504
+ test('export interface multi-line opening', () => {
505
+ const out = sigs([
506
+ 'export interface User {',
507
+ ' id: string;',
508
+ ' name: string;',
509
+ '}',
510
+ ]);
511
+ expect(out.join('\n')).toMatch(/export interface User \{/);
512
+ });
513
+
514
+ test('abstract class — declaration + abstract method', () => {
515
+ const out = sigs([
516
+ 'abstract class Animal {',
517
+ ' abstract sound(): string;',
518
+ '}',
519
+ ]);
520
+ expect(out.join('\n')).toMatch(/abstract class Animal/);
521
+ expect(out.join('\n')).toMatch(/abstract sound\(\): string/);
522
+ });
523
+
524
+ test('generic function: identity<T>(x: T): T', () => {
525
+ const out = sigs(['function identity<T>(x: T): T { return x; }']);
526
+ expect(out.join('\n')).toMatch(/function identity<T>\(x: T\): T/);
527
+ });
528
+
529
+ test('generic class: Container<T>', () => {
530
+ const out = sigs([
531
+ 'class Container<T> {',
532
+ ' value: T;',
533
+ '}',
534
+ ]);
535
+ expect(out.join('\n')).toMatch(/class Container<T>/);
536
+ });
537
+
538
+ test('class method with TS return type annotation', () => {
539
+ const out = sigs([
540
+ 'class C {',
541
+ ' foo(x: number): string { return String(x); }',
542
+ '}',
543
+ ]);
544
+ expect(out.join('\n')).toMatch(/foo\(x: number\): string/);
545
+ });
546
+
547
+ test('TSX function component: () => JSX.Element', () => {
548
+ const out = sigs([
549
+ 'function App(): JSX.Element {',
550
+ ' return <div />;',
551
+ '}',
552
+ ]);
553
+ expect(out.join('\n')).toMatch(/function App\(\): JSX\.Element/);
554
+ });
555
+
556
+ test('ESM re-export: export { foo } from "./bar"', () => {
557
+ const out = sigs([`export { foo } from './bar';`]);
558
+ expect(out.join('\n')).toMatch(/export \{ foo \} from/);
559
+ });
560
+
561
+ test('ESM aliased re-export: export { foo as bar } from "./baz"', () => {
562
+ const out = sigs([`export { foo as bar } from './baz';`]);
563
+ expect(out.join('\n')).toMatch(/export \{ foo as bar \} from/);
564
+ });
565
+
566
+ test('decorator above class — class is still captured (decorator dropped is acceptable)', () => {
567
+ const out = sigs([
568
+ '@Component({ selector: "x" })',
569
+ 'class Foo {}',
570
+ ]);
571
+ expect(out.join('\n')).toMatch(/class Foo/);
572
+ });
573
+
574
+ test('literal-union return type', () => {
575
+ const out = sigs([`function getKind(): "a" | "b" { return "a"; }`]);
576
+ expect(out.join('\n')).toMatch(/function getKind\(\):/);
577
+ });
578
+
579
+ test('export const generic arrow: <T>(x: T): T => x', () => {
580
+ const out = sigs(['export const fn = <T>(x: T): T => x;']);
581
+ expect(out.join('\n')).toMatch(/export const fn/);
582
+ });
583
+
584
+ // ── Language detection on all declared TS/TSX/MJS/CJS extensions ──
585
+
586
+ test('_langOf maps .ts / .tsx / .mjs / .cjs / .jsx → "js"', () => {
587
+ expect(tool._langOf('a.ts')).toBe('js');
588
+ expect(tool._langOf('a.tsx')).toBe('js');
589
+ expect(tool._langOf('a.mjs')).toBe('js');
590
+ expect(tool._langOf('a.cjs')).toBe('js');
591
+ expect(tool._langOf('a.jsx')).toBe('js');
592
+ // Case-insensitive on the extension.
593
+ expect(tool._langOf('a.TS')).toBe('js');
594
+ });
595
+
596
+ // ── Known gaps. These tests pin CURRENT (limited) behavior so an
597
+ // improvement to the parser fails them — at which point you
598
+ // update the assertion. Each gap is real and worth fixing in a
599
+ // tree-sitter migration. ─────────────────────────────────────
600
+
601
+ describe('KNOWN GAPS — pin current limitations', () => {
602
+ test('GAP: bare `interface` (no `export`) is NOT captured', () => {
603
+ const out = sigs(['interface User { id: string; name: string; }']);
604
+ expect(out).toEqual([]);
605
+ });
606
+
607
+ test('GAP: bare `type` alias (no `export`) is NOT captured', () => {
608
+ const out = sigs(['type ID = string | number;']);
609
+ expect(out).toEqual([]);
610
+ });
611
+
612
+ test('GAP: bare `enum` (no `export`) is NOT captured', () => {
613
+ const out = sigs(['enum Color { Red, Green, Blue }']);
614
+ expect(out).toEqual([]);
615
+ });
616
+
617
+ test('GAP: bare multi-line `interface` (no `export`) is NOT captured', () => {
618
+ const out = sigs([
619
+ 'interface User {',
620
+ ' id: string;',
621
+ '}',
622
+ ]);
623
+ expect(out).toEqual([]);
624
+ });
625
+
626
+ test('GAP: typed arrow component `const App: React.FC = () => <div />` is NOT captured', () => {
627
+ // The `: React.FC` annotation between the identifier and `=`
628
+ // breaks the parser's "ident = arrow" recognition. Common in
629
+ // older React+TS codebases.
630
+ const out = sigs(['const App: React.FC = () => <div />;']);
631
+ expect(out).toEqual([]);
632
+ });
633
+
634
+ test('GAP: async generator `async function* foo()` is NOT captured', () => {
635
+ const out = sigs([
636
+ 'async function* stream(): AsyncIterableIterator<number> {',
637
+ ' yield 1;',
638
+ '}',
639
+ ]);
640
+ expect(out).toEqual([]);
641
+ });
642
+
643
+ test('GAP: destructured-arg arrow with type annotation is NOT captured', () => {
644
+ // `const fn = ({ name }: { name: string }): string => …`
645
+ // The destructured + typed parameter list trips the
646
+ // ident = arrow recognition.
647
+ const out = sigs([`const greet = ({ name }: { name: string }): string => \`hi \${name}\`;`]);
648
+ expect(out).toEqual([]);
649
+ });
650
+ });
651
+ });
652
+
474
653
  // ─────────────────────────────────────────────────────────────────
475
654
  // C / C++ — _parseC. Same approach as the JS path; we lock the
476
655
  // patterns the regex needs to handle on real-world C/CPP files so
@@ -481,6 +481,147 @@ describe('TaskManagerTool', () => {
481
481
  expect(result.error).toContain('Invalid status');
482
482
  });
483
483
 
484
+ // ── Destructive-sync guardrail ───────────────────────────────────
485
+ // Real production failure: post-compaction, an agent forgot it had
486
+ // a 9-task plan and called sync with 4 unrelated tasks → all 9
487
+ // were silently dropped. These tests pin the guardrail.
488
+
489
+ describe('REGRESSION: destructive-sync guardrail', () => {
490
+ // Helper: seed the agent with one in_progress + two pending tasks.
491
+ // Each test then performs its own follow-up sync against that plan.
492
+ async function setupAgentWithPlan() {
493
+ const { tool, context } = createTestSetup();
494
+ await tool.execute({
495
+ action: 'sync',
496
+ tasks: [
497
+ { title: 'Char select bg', status: 'in_progress', priority: 'high' },
498
+ { title: 'Board art', status: 'pending', priority: 'high' },
499
+ { title: 'Dice animation', status: 'pending', priority: 'medium' },
500
+ ],
501
+ }, context);
502
+ return { tool, context };
503
+ }
504
+
505
+ test('REFUSES sync that would drop pending/in_progress tasks without confirmReplace', async () => {
506
+ const { tool, context } = await setupAgentWithPlan();
507
+ const result = await tool.execute({
508
+ action: 'sync',
509
+ tasks: [
510
+ { title: 'Add Settings UI', status: 'pending', priority: 'high' },
511
+ { title: 'Add Settings logic', status: 'pending', priority: 'high' },
512
+ ],
513
+ }, context);
514
+ expect(result.success).toBe(false);
515
+ expect(result.error).toMatch(/Sync would drop 3 non-terminal task/);
516
+ // Must name the at-risk tasks so the agent can see them.
517
+ expect(result.error).toContain('Char select bg');
518
+ expect(result.error).toContain('Board art');
519
+ expect(result.error).toContain('Dice animation');
520
+ // Must explain the escape hatch.
521
+ expect(result.error).toContain('confirmReplace: true');
522
+ });
523
+
524
+ test('PROCEEDS when confirmReplace=true is explicitly set', async () => {
525
+ const { tool, context } = await setupAgentWithPlan();
526
+ const result = await tool.execute({
527
+ action: 'sync',
528
+ confirmReplace: true,
529
+ tasks: [
530
+ { title: 'Add Settings UI', status: 'pending', priority: 'high' },
531
+ ],
532
+ }, context);
533
+ expect(result.success).toBe(true);
534
+ expect(result.result.summary.total).toBe(1);
535
+ expect(result.result.summary.removed).toBe(3);
536
+ });
537
+
538
+ test('does NOT trigger when the incoming list keeps every open task (rename only)', async () => {
539
+ const { tool, context } = await setupAgentWithPlan();
540
+ // Update statuses, but keep all titles. No drops.
541
+ const result = await tool.execute({
542
+ action: 'sync',
543
+ tasks: [
544
+ { title: 'Char select bg', status: 'completed', priority: 'high' },
545
+ { title: 'Board art', status: 'in_progress', priority: 'high' },
546
+ { title: 'Dice animation', status: 'pending', priority: 'medium' },
547
+ ],
548
+ }, context);
549
+ expect(result.success).toBe(true);
550
+ expect(result.result.summary.updated).toBe(3);
551
+ expect(result.result.summary.removed).toBe(0);
552
+ });
553
+
554
+ test('does NOT trigger when only completed/cancelled tasks would be dropped', async () => {
555
+ const { tool, context } = createTestSetup();
556
+ // Seed with one done task + one pending.
557
+ await tool.execute({
558
+ action: 'sync',
559
+ tasks: [
560
+ { title: 'Already done', status: 'completed', priority: 'low' },
561
+ { title: 'Still going', status: 'pending', priority: 'high' },
562
+ ],
563
+ }, context);
564
+ // New sync drops the completed one but keeps the pending one.
565
+ const result = await tool.execute({
566
+ action: 'sync',
567
+ tasks: [{ title: 'Still going', status: 'in_progress', priority: 'high' }],
568
+ }, context);
569
+ // No guardrail trip — completed task is safe to drop.
570
+ expect(result.success).toBe(true);
571
+ });
572
+
573
+ test('error response includes droppedTasks metadata for programmatic recovery', async () => {
574
+ const { tool, context } = await setupAgentWithPlan();
575
+ const result = await tool.execute({
576
+ action: 'sync',
577
+ tasks: [{ title: 'New thing', status: 'pending', priority: 'high' }],
578
+ }, context);
579
+ expect(result.success).toBe(false);
580
+ // BaseTool's execute() catches the thrown Error; the message
581
+ // carries the human-readable hint. We assert on the hint
582
+ // contents — that's what the agent sees.
583
+ expect(result.error).toMatch(/3 non-terminal task/);
584
+ });
585
+
586
+ test('REGRESSION: the exact Talisman failure scenario is now blocked', async () => {
587
+ // Reproduce the production failure: agent has a 9-task plan
588
+ // reflecting the user's UI revision request. Post-compaction,
589
+ // agent loses context and tries to sync a 4-task Settings plan.
590
+ const { tool, context } = createTestSetup();
591
+ await tool.execute({
592
+ action: 'sync',
593
+ tasks: [
594
+ { title: 'Explore current code and image assets', status: 'in_progress', priority: 'high' },
595
+ { title: 'Generate character select background', status: 'pending', priority: 'high' },
596
+ { title: 'Generate board space art for all nodes', status: 'pending', priority: 'high' },
597
+ { title: 'Generate adventure card art', status: 'pending', priority: 'medium' },
598
+ { title: 'Fix board to rectangular layout', status: 'pending', priority: 'high' },
599
+ { title: 'Fix character select sticky buttons', status: 'pending', priority: 'medium' },
600
+ { title: 'Fix dice animation and combat', status: 'pending', priority: 'high' },
601
+ { title: 'Remove all emojis from UI', status: 'pending', priority: 'medium' },
602
+ { title: 'Test and verify all changes', status: 'pending', priority: 'medium' },
603
+ ],
604
+ }, context);
605
+ // The agent now (mistakenly) tries to sync a Settings plan.
606
+ const result = await tool.execute({
607
+ action: 'sync',
608
+ tasks: [
609
+ { title: 'Add Settings UI to index.html', status: 'pending', priority: 'high' },
610
+ { title: 'Add Settings logic to game.js', status: 'pending', priority: 'high' },
611
+ { title: 'Wire Settings to title screen', status: 'pending', priority: 'medium' },
612
+ { title: 'Test Settings persistence', status: 'pending', priority: 'medium' },
613
+ ],
614
+ }, context);
615
+ expect(result.success).toBe(false);
616
+ // All 9 original tasks must be named so the agent sees them; we spot-check four titles and pin the total via the count.
617
+ expect(result.error).toContain('Explore current code');
618
+ expect(result.error).toContain('Generate character select background');
619
+ expect(result.error).toContain('Fix dice animation and combat');
620
+ expect(result.error).toContain('Remove all emojis from UI');
621
+ expect(result.error).toMatch(/Sync would drop 9 non-terminal task/);
622
+ });
623
+ });
624
+
484
625
  test('enforces only one in_progress task', async () => {
485
626
  const { tool, agent, context } = createTestSetup();
486
627
  await tool.execute({
@@ -15,6 +15,7 @@ import {
15
15
  ERROR_TYPES,
16
16
  SYSTEM_DEFAULTS
17
17
  } from '../utilities/constants.js';
18
+ import { NATIVE_SCHEMA_TOOL_NAMES } from './openaiFunctionSchemas.js';
18
19
 
19
20
  class BaseTool {
20
21
  constructor(config = {}, logger = null) {
@@ -690,8 +691,20 @@ class ToolsRegistry {
690
691
  includeUsageGuidelines = true,
691
692
  includeSecurityNotes = true,
692
693
  compact = false,
693
- layered = false
694
+ layered = false,
695
+ // 'responses' | 'chat_completion' | undefined.
696
+ // When 'responses', the target model uses native function-calling
697
+ // (Codex / o-series / gpt-5-pro). For tools that have a native
698
+ // schema in openaiFunctionSchemas.js, the structured schema sent
699
+ // in `tools:` IS the canonical source of truth for the model —
700
+ // so we skip baking the same information into the system prompt
701
+ // as text. This eliminates ~3K duplicated tokens per turn on the
702
+ // models that need it most. Defaults to undefined (= 'chat_completion'
703
+ // behaviour: include text descriptions). Old callers that don't
704
+ // pass this option get the previous behaviour verbatim — back-compat.
705
+ apiType = undefined,
694
706
  } = options;
707
+ const isNativeApi = apiType === 'responses';
695
708
 
696
709
  // Get tools to include — always inject 'help' so agents can query tool docs
697
710
  let toolIds = capabilities.length > 0
@@ -751,6 +764,19 @@ class ToolsRegistry {
751
764
  const tool = this.tools.get(toolId);
752
765
  if (!tool || !tool.isEnabled) continue;
753
766
 
767
+ // Skip text descriptions for tools that have a native function
768
+ // schema, when the target model uses the Responses API. The
769
+ // structured schema is the canonical source for these models.
770
+ // We DO still emit a one-line pointer so the agent isn't blind
771
+ // to the tool's existence (its capability list lives in the
772
+ // system prompt elsewhere too, but a single-line mention here
773
+ // costs ~10 tokens and is a useful breadcrumb).
774
+ if (isNativeApi && NATIVE_SCHEMA_TOOL_NAMES.has(toolId.toLowerCase())) {
775
+ const summary = this.toolSummaries.get(toolId) || `${toolId} tool`;
776
+ description += `- **${toolId}** — ${summary} (see structured schema)\n`;
777
+ continue;
778
+ }
779
+
754
780
  try {
755
781
  if (compact) {
756
782
  // Compact format - just tool name and brief description
@@ -795,6 +821,64 @@ class ToolsRegistry {
795
821
  description += '- **TOOL RESULTS ARE AVAILABLE ONLY AFTER YOUR MESSAGE ENDS**: Tools execute after your entire message is sent. You will NOT see any tool results until your next turn. This means: if the next tool call depends on results from a previous one, they MUST be in separate messages. You may batch independent tool calls in a single message, but never assume or guess the output of a tool — always wait for the actual result in the next turn before proceeding.\n\n';
796
822
  description += 'After invoking a tool, WAIT for the actual response. Do NOT generate imaginary responses.\n\n';
797
823
 
824
+ // ── Operating posture ────────────────────────────────────────────
825
+ // Cross-cutting habits agents should adopt VOLUNTARILY. The tool
826
+ // descriptions tell them WHAT each tool does; this block tells them
827
+ // WHEN to reach for them without being asked. Without this, agents
828
+ // tend to:
829
+ // • skip the memory/skills check at the start of a new task,
830
+ // re-discovering things the team already wrote down
831
+ // • never create a plan/* memory, losing the thread across the
832
+ // first compaction
833
+ // • only invoke `help`/`skills` after a failure, not proactively
834
+ // Only emitted when the relevant tools are actually in the agent's
835
+ // capability set — no point teaching "check skills" to an agent
836
+ // that doesn't have the skills tool.
837
+ const hasMemory = toolIds.includes('memory');
838
+ const hasSkills = toolIds.includes('skills');
839
+ if (hasMemory || hasSkills) {
840
+ description += '## OPERATING POSTURE\n\n';
841
+ description += 'Treat these as habits, not optional extras. Use them proactively, before you need them.\n\n';
842
+
843
+ if (hasMemory || hasSkills) {
844
+ description += '**At the start of a new task or topic shift:**\n';
845
+ if (hasMemory) {
846
+ description += '- Run `memory` → `list` (titles only) to scan for relevant context the team or your past self stored. If a title looks relevant, `read` it before improvising.\n';
847
+ }
848
+ if (hasSkills) {
849
+ description += '- Run `skills` → `list` to see if a skill already encodes how to do this task. If yes, follow its checklist instead of inventing one.\n';
850
+ }
851
+ description += '\n';
852
+ }
853
+
854
+ if (hasMemory) {
855
+ // Concrete event-based write triggers. The previous version asked
856
+ // the agent to "save a memory when you recognize the work is
857
+ // multi-turn" — that's a judgment call, and in practice agents
858
+ // never made the call. Observed in production: 670 messages,
859
+ // 0 memory writes, despite the OPERATING POSTURE block being
860
+ // present in the system prompt. The fix is to replace "when
861
+ // you recognize" with concrete event triggers the model can
862
+ // pattern-match on without judgment.
863
+ description += '**WRITE memory on these events (not "when you think it\'s a good idea" — these are mandatory):**\n';
864
+ description += '- **A new user message contains a numbered list, a multi-bullet ask, OR more than ~30 words of substantive request.** Your VERY NEXT tool call must be `memory` → `add` with title `plan/<short-topic>` and content = the user\'s entire message verbatim. Do this BEFORE any other tool call, including taskmanager. Why mandatory: compaction may later destroy that message and the agent (you, next session) will not remember what the user actually asked for.\n';
865
+ description += '- **You are about to call `taskmanager` → `sync` with more than 3 tasks at once.** First, save a `memory` titled `plan/<feature>` containing the user\'s original request + your rationale for the plan. The task list is fragile (it can be wiped by a later sync); the plan/* memory is the source of truth.\n';
866
+ description += '- **You made a non-obvious decision** (chose approach A over B, fixed a tricky bug, discovered a constraint while exploring code, hit an unexpected error and figured out the cause). Save it as a `memory` with a non-`plan/` title before the next tool call. Format: title = the conclusion, content = the evidence. The next agent will not re-derive it.\n';
867
+ description += '- **The user gave you a preference or rule that should apply to all future work** ("never use emojis", "always use Tailwind", "the API base URL is X"). Save it immediately as a `memory`.\n\n';
868
+ description += '**`plan/*` memories auto-inject into your system prompt every turn under `## AGENT WORKING PLAN` — that\'s how they survive compaction. Update or delete them as the situation evolves; a stale plan is worse than no plan.**\n\n';
869
+ }
870
+
871
+ description += '**Distinction:**\n';
872
+ description += '- `memory` = persistent knowledge that survives sessions (why, constraints, durable facts, working plans).\n';
873
+ if (toolIds.includes('taskmanager')) {
874
+ description += '- `taskmanager` = step-by-step checkboxes for the CURRENT task (what to do next, in order).\n';
875
+ }
876
+ if (hasSkills) {
877
+ description += '- `skills` = reusable playbooks the team curated for recurring tasks.\n';
878
+ }
879
+ description += '\n';
880
+ }
881
+
798
882
  // Add exploration strategy if code-map is available
799
883
  if (toolIds.includes('code-map')) {
800
884
  description += '## CODE EXPLORATION STRATEGY\n\n';
@@ -817,6 +901,10 @@ class ToolsRegistry {
817
901
  * @returns {string} Enhanced system prompt
818
902
  */
819
903
  enhanceSystemPrompt(existingPrompt, capabilities = [], options = {}) {
904
+ // `options.apiType` ('responses' | 'chat_completion' | undefined)
905
+ // is forwarded to the description builder so native-API models get
906
+ // a trimmed prompt that doesn't duplicate the structured schemas.
907
+ // Old callers omit it and get pre-existing behaviour unchanged.
820
908
  const toolSection = this.generateToolDescriptionsForPrompt(capabilities, options);
821
909
 
822
910
  if (!toolSection.trim()) {
@@ -322,4 +322,18 @@ export function getToolSchemasForAgent(capabilities = []) {
322
322
  return OPENAI_FUNCTION_SCHEMAS.filter(s => allowed.has(s.name.toLowerCase()));
323
323
  }
324
324
 
325
+ /**
326
+ * Names of every tool that has a native function schema in this file.
327
+ * Importable as a Set so other modules (notably baseTool's system-prompt
328
+ * builder) can decide "is the structured schema the canonical source of
329
+ * truth for this tool, or do we still need to bake a text description
330
+ * into the system prompt?". When a model uses the Responses API (whose
331
+ * models use native function-calling), the structured schema in
332
+ * `tools:` is the canonical source — emitting the text description as
333
+ * well doubles the same information in the context window.
334
+ */
335
+ export const NATIVE_SCHEMA_TOOL_NAMES = new Set(
336
+ OPENAI_FUNCTION_SCHEMAS.map(s => s.name.toLowerCase())
337
+ );
338
+
325
339
  export default OPENAI_FUNCTION_SCHEMAS;