onbuzz 4.8.0 → 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -364,6 +364,148 @@ describe('ToolsRegistry', () => {
364
364
  expect(desc).toContain('HOW TO GET TOOL DOCUMENTATION');
365
365
  });
366
366
 
367
+ describe('OPERATING POSTURE section', () => {
368
+ // Minimal fakes that the registry will accept as memory/skills/taskmanager.
369
+ // BaseTool normally derives `this.id` from the class name (lowercased,
370
+ // with "Tool" stripped). Rather than rely on that derivation, each fake
371
+ // overrides `this.id` after super() to pin the id explicitly, so the id
372
+ // does not depend on how the test class happens to be named.
373
+ // validateTool() also requires parseParameters, hence the stub below.
374
+ class FakeMemoryTool extends BaseTool {
375
+ constructor() { super(); this.id = 'memory'; }
376
+ getDescription() { return 'Memory tool stub'; }
377
+ parseParameters() { return {}; }
378
+ async execute() { return { ok: true }; }
379
+ }
380
+ class FakeSkillsTool extends BaseTool {
381
+ constructor() { super(); this.id = 'skills'; }
382
+ getDescription() { return 'Skills tool stub'; }
383
+ parseParameters() { return {}; }
384
+ async execute() { return { ok: true }; }
385
+ }
386
+ class FakeTaskManagerTool extends BaseTool {
387
+ constructor() { super(); this.id = 'taskmanager'; }
388
+ getDescription() { return 'TaskManager tool stub'; }
389
+ parseParameters() { return {}; }
390
+ async execute() { return { ok: true }; }
391
+ }
392
+
393
+ test('appears when memory tool is in capabilities (proactive memory nudge)', async () => {
394
+ await registry.registerTool(FakeMemoryTool);
395
+ const desc = registry.generateToolDescriptionsForPrompt(['memory']);
396
+ expect(desc).toContain('OPERATING POSTURE');
397
+ expect(desc).toMatch(/memory.*list/i);
398
+ // plan/* should be cross-referenced here so that the advice to write a
399
+ // plan memory isn't an isolated tip buried in the memory tool's own desc.
400
+ expect(desc).toContain('plan/');
401
+ });
402
+
403
+ test('appears when skills tool is in capabilities (proactive skills nudge)', async () => {
404
+ await registry.registerTool(FakeSkillsTool);
405
+ const desc = registry.generateToolDescriptionsForPrompt(['skills']);
406
+ expect(desc).toContain('OPERATING POSTURE');
407
+ expect(desc).toMatch(/skills.*list/i);
408
+ });
409
+
410
+ test('does NOT appear when neither memory nor skills is in capabilities', async () => {
411
+ await registry.registerTool(TestTool);
412
+ const desc = registry.generateToolDescriptionsForPrompt(['test']);
413
+ expect(desc).not.toContain('OPERATING POSTURE');
414
+ });
415
+
416
+ test('distinguishes memory vs taskmanager when both are present (so agents know which to use)', async () => {
417
+ await registry.registerTool(FakeMemoryTool);
418
+ await registry.registerTool(FakeTaskManagerTool);
419
+ const desc = registry.generateToolDescriptionsForPrompt(['memory', 'taskmanager']);
420
+ expect(desc).toContain('OPERATING POSTURE');
421
+ expect(desc).toContain('persistent knowledge'); // memory
422
+ expect(desc).toContain('step-by-step'); // taskmanager
423
+ });
424
+ });
425
+
426
+ // ── Per-model prompt shape: skip text docs for tools with native schemas
427
+ // when the target uses the Responses API (codex / o-series / gpt-5-pro).
428
+ describe('apiType="responses" — trims duplication with native function schemas', () => {
429
+ class FakeMemoryTool extends BaseTool {
430
+ constructor() { super(); this.id = 'memory'; }
431
+ getDescription() { return 'Memory tool stub with LONG description that would normally take many tokens'; }
432
+ getSummary() { return 'Persistent memory'; }
433
+ parseParameters() { return {}; }
434
+ async execute() { return { ok: true }; }
435
+ }
436
+ class FakeTerminalTool extends BaseTool {
437
+ constructor() { super(); this.id = 'terminal'; }
438
+ getDescription() { return 'Terminal tool LONG description that would normally take many tokens'; }
439
+ getSummary() { return 'Shell access'; }
440
+ parseParameters() { return {}; }
441
+ async execute() { return { ok: true }; }
442
+ }
443
+ class FakeWebTool extends BaseTool {
444
+ // 'web' is NOT in OPENAI_FUNCTION_SCHEMAS — its text doc must always appear.
445
+ constructor() { super(); this.id = 'web'; }
446
+ getDescription() { return 'Web tool LONG description that would normally take many tokens'; }
447
+ getSummary() { return 'Browser automation'; }
448
+ parseParameters() { return {}; }
449
+ async execute() { return { ok: true }; }
450
+ }
451
+
452
+ test('omits text description for tools that have native function schemas (memory, terminal)', async () => {
453
+ await registry.registerTool(FakeMemoryTool);
454
+ await registry.registerTool(FakeTerminalTool);
455
+
456
+ const responsesDesc = registry.generateToolDescriptionsForPrompt(
457
+ ['memory', 'terminal'],
458
+ { apiType: 'responses' },
459
+ );
460
+ // Header still present, plus a one-line pointer to the structured schema.
461
+ expect(responsesDesc).toContain('AVAILABLE TOOLS');
462
+ expect(responsesDesc).toContain('see structured schema');
463
+ // The big multi-line text doc must NOT be repeated.
464
+ expect(responsesDesc).not.toContain('### MEMORY TOOL');
465
+ expect(responsesDesc).not.toContain('### TERMINAL TOOL');
466
+ expect(responsesDesc).not.toContain('LONG description that would normally take many tokens');
467
+ });
468
+
469
+ test('keeps text description for tools that do NOT have native function schemas (e.g. web)', async () => {
470
+ await registry.registerTool(FakeWebTool);
471
+ const responsesDesc = registry.generateToolDescriptionsForPrompt(
472
+ ['web'],
473
+ { apiType: 'responses' },
474
+ );
475
+ // 'web' has no native schema → text doc MUST be present.
476
+ expect(responsesDesc).toContain('### WEB TOOL');
477
+ expect(responsesDesc).toContain('LONG description that would normally take many tokens');
478
+ });
479
+
480
+ test('BACKWARD COMPAT: without apiType option, behaves exactly as before (full text for everything)', async () => {
481
+ await registry.registerTool(FakeMemoryTool);
482
+ await registry.registerTool(FakeTerminalTool);
483
+
484
+ const defaultDesc = registry.generateToolDescriptionsForPrompt(['memory', 'terminal']);
485
+ // No apiType → keep the heavy text docs as today.
486
+ expect(defaultDesc).toContain('### MEMORY TOOL');
487
+ expect(defaultDesc).toContain('### TERMINAL TOOL');
488
+ });
489
+
490
+ test('BACKWARD COMPAT: apiType="chat_completion" is equivalent to no apiType', async () => {
491
+ await registry.registerTool(FakeMemoryTool);
492
+ const a = registry.generateToolDescriptionsForPrompt(['memory'], { apiType: 'chat_completion' });
493
+ const b = registry.generateToolDescriptionsForPrompt(['memory']);
494
+ expect(a).toBe(b);
495
+ });
496
+
497
+ test('enhanceSystemPrompt forwards apiType option to the description builder', async () => {
498
+ await registry.registerTool(FakeMemoryTool);
499
+ const native = registry.enhanceSystemPrompt('Base.', ['memory'], { apiType: 'responses' });
500
+ const inline = registry.enhanceSystemPrompt('Base.', ['memory']);
501
+ // Native form must be strictly shorter than the inline form (per-tool text block dropped).
502
+ expect(native.length).toBeLessThan(inline.length);
503
+ // The trimmed prompt must still carry the original base prompt and the tools header.
504
+ expect(native).toContain('Base.');
505
+ expect(native).toContain('AVAILABLE TOOLS');
506
+ });
507
+ });
508
+
367
509
  test('enhanceSystemPrompt appends tool docs', async () => {
368
510
  await registry.registerTool(TestTool);
369
511
  const enhanced = registry.enhanceSystemPrompt('Base prompt.', []);
@@ -15,6 +15,7 @@ import {
15
15
  ERROR_TYPES,
16
16
  SYSTEM_DEFAULTS
17
17
  } from '../utilities/constants.js';
18
+ import { NATIVE_SCHEMA_TOOL_NAMES } from './openaiFunctionSchemas.js';
18
19
 
19
20
  class BaseTool {
20
21
  constructor(config = {}, logger = null) {
@@ -690,8 +691,20 @@ class ToolsRegistry {
690
691
  includeUsageGuidelines = true,
691
692
  includeSecurityNotes = true,
692
693
  compact = false,
693
- layered = false
694
+ layered = false,
695
+ // 'responses' | 'chat_completion' | undefined.
696
+ // When 'responses', the target model uses native function-calling
697
+ // (Codex / o-series / gpt-5-pro). For tools that have a native
698
+ // schema in openaiFunctionSchemas.js, the structured schema sent
699
+ // in `tools:` IS the canonical source of truth for the model —
700
+ // so we skip baking the same information into the system prompt
701
+ // as text. This eliminates ~3K duplicated tokens per turn on the
702
+ // models that need it most. Defaults to undefined (= 'chat_completion'
703
+ // behaviour: include text descriptions). Old callers that don't
704
+ // pass this option get the previous behaviour verbatim — back-compat.
705
+ apiType = undefined,
694
706
  } = options;
707
+ const isNativeApi = apiType === 'responses';
695
708
 
696
709
  // Get tools to include — always inject 'help' so agents can query tool docs
697
710
  let toolIds = capabilities.length > 0
@@ -751,6 +764,19 @@ class ToolsRegistry {
751
764
  const tool = this.tools.get(toolId);
752
765
  if (!tool || !tool.isEnabled) continue;
753
766
 
767
+ // Skip text descriptions for tools that have a native function
768
+ // schema, when the target model uses the Responses API. The
769
+ // structured schema is the canonical source for these models.
770
+ // We DO still emit a one-line pointer so the agent isn't blind
771
+ // to the tool's existence (its capability list lives in the
772
+ // system prompt elsewhere too, but a single-line mention here
773
+ // costs ~10 tokens and is a useful breadcrumb).
774
+ if (isNativeApi && NATIVE_SCHEMA_TOOL_NAMES.has(toolId.toLowerCase())) {
775
+ const summary = this.toolSummaries.get(toolId) || `${toolId} tool`;
776
+ description += `- **${toolId}** — ${summary} (see structured schema)\n`;
777
+ continue;
778
+ }
779
+
754
780
  try {
755
781
  if (compact) {
756
782
  // Compact format - just tool name and brief description
@@ -795,6 +821,58 @@ class ToolsRegistry {
795
821
  description += '- **TOOL RESULTS ARE AVAILABLE ONLY AFTER YOUR MESSAGE ENDS**: Tools execute after your entire message is sent. You will NOT see any tool results until your next turn. This means: if the next tool call depends on results from a previous one, they MUST be in separate messages. You may batch independent tool calls in a single message, but never assume or guess the output of a tool — always wait for the actual result in the next turn before proceeding.\n\n';
796
822
  description += 'After invoking a tool, WAIT for the actual response. Do NOT generate imaginary responses.\n\n';
797
823
 
824
+ // ── Operating posture ────────────────────────────────────────────
825
+ // Cross-cutting habits agents should adopt VOLUNTARILY. The tool
826
+ // descriptions tell them WHAT each tool does; this block tells them
827
+ // WHEN to reach for them without being asked. Without this, agents
828
+ // tend to:
829
+ // • skip the memory/skills check at the start of a new task,
830
+ // re-discovering things the team already wrote down
831
+ // • never create a plan/* memory, losing the thread across the
832
+ // first compaction
833
+ // • only invoke `help`/`skills` after a failure, not proactively
834
+ // Only emitted when the relevant tools are actually in the agent's
835
+ // capability set — no point teaching "check skills" to an agent
836
+ // that doesn't have the skills tool.
837
+ const hasMemory = toolIds.includes('memory');
838
+ const hasSkills = toolIds.includes('skills');
839
+ if (hasMemory || hasSkills) {
840
+ description += '## OPERATING POSTURE\n\n';
841
+ description += 'Treat these as habits, not optional extras. Use them proactively, before you need them.\n\n';
842
+
843
+ if (hasMemory || hasSkills) {
844
+ description += '**At the start of a new task or topic shift:**\n';
845
+ if (hasMemory) {
846
+ description += '- Run `memory` → `list` (titles only) to scan for relevant context the team or your past self stored. If a title looks relevant, `read` it before improvising.\n';
847
+ }
848
+ if (hasSkills) {
849
+ description += '- Run `skills` → `list` to see if a skill already encodes how to do this task. If yes, follow its checklist instead of inventing one.\n';
850
+ }
851
+ description += '\n';
852
+ }
853
+
854
+ if (hasMemory) {
855
+ description += '**When you recognize the work is multi-turn or multi-session:**\n';
856
+ description += '- Save a `memory` entry with title starting `plan/` (e.g. `plan/refactor-auth`). The content auto-injects into your system prompt every turn under "AGENT WORKING PLAN" until you delete it. This is how you survive compaction — anything important enough to remember next session belongs here.\n';
857
+ description += '- Update or delete the plan as the situation changes. A stale plan is worse than no plan.\n\n';
858
+ }
859
+
860
+ if (hasMemory) {
861
+ description += '**When you learn something durable** (a user preference, a non-obvious constraint, an architectural fact the next agent will want):\n';
862
+ description += '- Save it as a `memory` (non-`plan/` title). One-shot facts go here, NOT in your reply.\n\n';
863
+ }
864
+
865
+ description += '**Distinction:**\n';
866
+ description += '- `memory` = persistent knowledge that survives sessions (why, constraints, durable facts, working plans).\n';
867
+ if (toolIds.includes('taskmanager')) {
868
+ description += '- `taskmanager` = step-by-step checkboxes for the CURRENT task (what to do next, in order).\n';
869
+ }
870
+ if (hasSkills) {
871
+ description += '- `skills` = reusable playbooks the team curated for recurring tasks.\n';
872
+ }
873
+ description += '\n';
874
+ }
875
+
798
876
  // Add exploration strategy if code-map is available
799
877
  if (toolIds.includes('code-map')) {
800
878
  description += '## CODE EXPLORATION STRATEGY\n\n';
@@ -817,6 +895,10 @@ class ToolsRegistry {
817
895
  * @returns {string} Enhanced system prompt
818
896
  */
819
897
  enhanceSystemPrompt(existingPrompt, capabilities = [], options = {}) {
898
+ // `options.apiType` ('responses' | 'chat_completion' | undefined)
899
+ // is forwarded to the description builder so native-API models get
900
+ // a trimmed prompt that doesn't duplicate the structured schemas.
901
+ // Old callers omit it and get pre-existing behaviour unchanged.
820
902
  const toolSection = this.generateToolDescriptionsForPrompt(capabilities, options);
821
903
 
822
904
  if (!toolSection.trim()) {
@@ -322,4 +322,18 @@ export function getToolSchemasForAgent(capabilities = []) {
322
322
  return OPENAI_FUNCTION_SCHEMAS.filter(s => allowed.has(s.name.toLowerCase()));
323
323
  }
324
324
 
325
+ /**
326
+ * Lower-cased names of every tool that has a native function schema in
327
+ * this file, exposed as a Set so other modules (notably baseTool's
328
+ * system-prompt builder) can decide whether the structured schema is the
329
+ * canonical source of truth for a tool, or whether a text description must
330
+ * still be baked into the system prompt. When a model uses the Responses
331
+ * API (which is RLHFed for native function-calling), the schema in `tools:`
332
+ * is canonical — emitting the text description as well duplicates it.
333
+ * @type {Set<string>}
334
+ */
335
+ export const NATIVE_SCHEMA_TOOL_NAMES = new Set(
336
+ OPENAI_FUNCTION_SCHEMAS.map(s => s.name.toLowerCase())
337
+ );
338
+
325
339
  export default OPENAI_FUNCTION_SCHEMAS;